mirror of
https://github.com/micropython/micropython.git
synced 2026-01-05 19:50:30 +01:00
py: Support Unicode (UTF-8 encoded) identifiers in Python source.
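This is enabled simply by making the identifier lexing code 8-bit clean: any raw byte with the high bit set (>= 0x80) is now accepted as part of an identifier, and identifier characters are appended to the token buffer as bytes.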
This commit is contained in:
11
py/lexer.c
11
py/lexer.c
@@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
|
||||
return lex->chr1 >= '0' && lex->chr1 <= '7';
|
||||
}
|
||||
|
||||
// TODO UNICODE include unicode characters in definition of identifiers
|
||||
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
|
||||
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
|
||||
return is_letter(lex) || lex->chr0 == '_';
|
||||
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
|
||||
}
|
||||
|
||||
// TODO UNICODE include unicode characters in definition of identifiers
|
||||
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
|
||||
return is_head_of_identifier(lex) || is_digit(lex);
|
||||
}
|
||||
@@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
|
||||
} else if (is_head_of_identifier(lex)) {
|
||||
lex->tok_kind = MP_TOKEN_NAME;
|
||||
|
||||
// get first char
|
||||
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
|
||||
// get first char (add as byte to remain 8-bit clean and support utf-8)
|
||||
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
|
||||
next_char(lex);
|
||||
|
||||
// get tail chars
|
||||
while (!is_end(lex) && is_tail_of_identifier(lex)) {
|
||||
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
|
||||
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
|
||||
next_char(lex);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user