Skip to content

Commit 7ed58cb

Browse files
committed
py: Support unicode (utf-8 encoded) identifiers in Python source.
Enabled simply by making the identifier lexing code 8-bit clean.
1 parent 6e56bb6 commit 7ed58cb

2 files changed

Lines changed: 32 additions & 6 deletions

File tree

py/lexer.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
112112
return lex->chr1 >= '0' && lex->chr1 <= '7';
113113
}
114114

115-
// TODO UNICODE include unicode characters in definition of identifiers
115+
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
116116
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
117-
return is_letter(lex) || lex->chr0 == '_';
117+
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
118118
}
119119

120-
// TODO UNICODE include unicode characters in definition of identifiers
121120
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
122121
return is_head_of_identifier(lex) || is_digit(lex);
123122
}
@@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
523522
} else if (is_head_of_identifier(lex)) {
524523
lex->tok_kind = MP_TOKEN_NAME;
525524

526-
// get first char
527-
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
525+
// get first char (add as byte to remain 8-bit clean and support utf-8)
526+
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
528527
next_char(lex);
529528

530529
// get tail chars
531530
while (!is_end(lex) && is_tail_of_identifier(lex)) {
532-
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
531+
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
533532
next_char(lex);
534533
}
535534

tests/unicode/unicode_id.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# test unicode in identifiers
2+
3+
# comment
4+
# αβγδϵφζ
5+
6+
# global identifiers
7+
α = 1
8+
αβγ = 2
9+
bβ = 3
10+
βb = 4
11+
print(α, αβγ, bβ, βb)
12+
13+
# function, argument, local identifiers
14+
def α(β, γ):
15+
δ = β + γ
16+
print(β, γ, δ)
17+
α(1, 2)
18+
19+
# class, method identifiers
20+
class φ:
21+
def __init__(self):
22+
pass
23+
def δ(self, ϵ):
24+
print(ϵ)
25+
zζzζz = φ()
26+
if hasattr(zζzζz, "δ"):
27+
zζzζz.δ(ϵ=123)

0 commit comments

Comments
 (0)