Skip to content

Commit 94fbe97

Browse files
committed
py: Change lexer stream API to return bytes not chars.
Lexer is now 8-bit clean inside strings.
1 parent 0713341 commit 94fbe97

5 files changed

Lines changed: 42 additions & 40 deletions

File tree

py/lexer.c

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
struct _mp_lexer_t {
4646
qstr source_name; // name of source
4747
void *stream_data; // data for stream
48-
mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
48+
mp_lexer_stream_next_byte_t stream_next_byte; // stream callback to get next byte
4949
mp_lexer_stream_close_t stream_close; // stream callback to free
5050

5151
unichar chr0, chr1, chr2; // current cached characters from source
@@ -103,7 +103,7 @@ void mp_token_show(const mp_token_t *tok) {
103103
#define CUR_CHAR(lex) ((lex)->chr0)
104104

105105
STATIC bool is_end(mp_lexer_t *lex) {
106-
return lex->chr0 == MP_LEXER_CHAR_EOF;
106+
return lex->chr0 == MP_LEXER_EOF;
107107
}
108108

109109
STATIC bool is_physical_newline(mp_lexer_t *lex) {
@@ -171,7 +171,7 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
171171
}
172172

173173
STATIC void next_char(mp_lexer_t *lex) {
174-
if (lex->chr0 == MP_LEXER_CHAR_EOF) {
174+
if (lex->chr0 == MP_LEXER_EOF) {
175175
return;
176176
}
177177

@@ -200,10 +200,10 @@ STATIC void next_char(mp_lexer_t *lex) {
200200
for (; advance > 0; advance--) {
201201
lex->chr0 = lex->chr1;
202202
lex->chr1 = lex->chr2;
203-
lex->chr2 = lex->stream_next_char(lex->stream_data);
204-
if (lex->chr2 == MP_LEXER_CHAR_EOF) {
203+
lex->chr2 = lex->stream_next_byte(lex->stream_data);
204+
if (lex->chr2 == MP_LEXER_EOF) {
205205
// EOF
206-
if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
206+
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
207207
lex->chr2 = '\n'; // insert newline at end of file
208208
}
209209
}
@@ -491,8 +491,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
491491
vstr_add_char(&lex->vstr, '\\');
492492
} else {
493493
switch (c) {
494-
case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
495-
case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
494+
case MP_LEXER_EOF: break; // TODO a proper error message?
495+
case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it)
496496
case '\\': break;
497497
case '\'': break;
498498
case '"': break;
@@ -546,7 +546,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
546546
break;
547547
}
548548
}
549-
if (c != MP_LEXER_CHAR_EOF) {
549+
if (c != MP_LEXER_EOF) {
550550
if (c < 0x110000 && !is_bytes) {
551551
vstr_add_char(&lex->vstr, c);
552552
} else if (c < 0x100 && is_bytes) {
@@ -556,7 +556,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
556556
}
557557
}
558558
} else {
559-
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
559+
// Add the "character" as a byte so that we remain 8-bit clean.
560+
// This way, strings are parsed correctly whether or not they contain utf-8 chars.
561+
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
560562
}
561563
}
562564
next_char(lex);
@@ -728,7 +730,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
728730
}
729731
}
730732

731-
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
733+
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
732734
mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
733735

734736
// check for memory allocation error
@@ -741,7 +743,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
741743

742744
lex->source_name = src_name;
743745
lex->stream_data = stream_data;
744-
lex->stream_next_char = stream_next_char;
746+
lex->stream_next_byte = stream_next_byte;
745747
lex->stream_close = stream_close;
746748
lex->line = 1;
747749
lex->column = 1;
@@ -762,18 +764,18 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
762764
lex->indent_level[0] = 0;
763765

764766
// preload characters
765-
lex->chr0 = stream_next_char(stream_data);
766-
lex->chr1 = stream_next_char(stream_data);
767-
lex->chr2 = stream_next_char(stream_data);
767+
lex->chr0 = stream_next_byte(stream_data);
768+
lex->chr1 = stream_next_byte(stream_data);
769+
lex->chr2 = stream_next_byte(stream_data);
768770

769771
// if input stream is 0, 1 or 2 bytes long and doesn't end in a newline, then insert a newline at the end
770-
if (lex->chr0 == MP_LEXER_CHAR_EOF) {
772+
if (lex->chr0 == MP_LEXER_EOF) {
771773
lex->chr0 = '\n';
772-
} else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
774+
} else if (lex->chr1 == MP_LEXER_EOF) {
773775
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
774776
lex->chr1 = '\n';
775777
}
776-
} else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
778+
} else if (lex->chr2 == MP_LEXER_EOF) {
777779
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
778780
lex->chr2 = '\n';
779781
}

py/lexer.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,18 +139,18 @@ typedef struct _mp_token_t {
139139
mp_uint_t len; // (byte) length of string of token
140140
} mp_token_t;
141141

142-
// the next-char function must return the next character in the stream
143-
// it must return MP_LEXER_CHAR_EOF if end of stream
144-
// it can be called again after returning MP_LEXER_CHAR_EOF, and in that case must return MP_LEXER_CHAR_EOF
145-
#define MP_LEXER_CHAR_EOF (-1)
146-
typedef unichar (*mp_lexer_stream_next_char_t)(void*);
142+
// the next-byte function must return the next byte in the stream
143+
// it must return MP_LEXER_EOF if end of stream
144+
// it may be called again after returning MP_LEXER_EOF, in which case it must return MP_LEXER_EOF again
145+
#define MP_LEXER_EOF (-1)
146+
typedef mp_uint_t (*mp_lexer_stream_next_byte_t)(void*);
147147
typedef void (*mp_lexer_stream_close_t)(void*);
148148

149149
typedef struct _mp_lexer_t mp_lexer_t;
150150

151151
void mp_token_show(const mp_token_t *tok);
152152

153-
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close);
153+
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close);
154154
mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
155155

156156
void mp_lexer_free(mp_lexer_t *lex);

py/lexerstr.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ typedef struct _mp_lexer_str_buf_t {
3636
const char *src_end; // end (exclusive) of source
3737
} mp_lexer_str_buf_t;
3838

39-
STATIC unichar str_buf_next_char(mp_lexer_str_buf_t *sb) {
39+
STATIC mp_uint_t str_buf_next_byte(mp_lexer_str_buf_t *sb) {
4040
if (sb->src_cur < sb->src_end) {
4141
return *sb->src_cur++;
4242
} else {
43-
return MP_LEXER_CHAR_EOF;
43+
return MP_LEXER_EOF;
4444
}
4545
}
4646

@@ -57,5 +57,5 @@ mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t
5757
sb->src_beg = str;
5858
sb->src_cur = str;
5959
sb->src_end = str + len;
60-
return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_char_t)str_buf_next_char, (mp_lexer_stream_close_t)str_buf_free);
60+
return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_byte_t)str_buf_next_byte, (mp_lexer_stream_close_t)str_buf_free);
6161
}

py/lexerunix.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,20 @@
4141

4242
typedef struct _mp_lexer_file_buf_t {
4343
int fd;
44-
char buf[20];
45-
uint len;
46-
uint pos;
44+
byte buf[20];
45+
mp_uint_t len;
46+
mp_uint_t pos;
4747
} mp_lexer_file_buf_t;
4848

49-
STATIC unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
49+
STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
5050
if (fb->pos >= fb->len) {
5151
if (fb->len == 0) {
52-
return MP_LEXER_CHAR_EOF;
52+
return MP_LEXER_EOF;
5353
} else {
5454
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
5555
if (n <= 0) {
5656
fb->len = 0;
57-
return MP_LEXER_CHAR_EOF;
57+
return MP_LEXER_EOF;
5858
}
5959
fb->len = n;
6060
fb->pos = 0;
@@ -78,7 +78,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
7878
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
7979
fb->len = n;
8080
fb->pos = 0;
81-
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
81+
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
8282
}
8383

8484
#endif // MICROPY_HELPER_LEXER_UNIX

stmhal/lexerfatfs.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,20 @@
3636

3737
typedef struct _mp_lexer_file_buf_t {
3838
FIL fp;
39-
char buf[20];
39+
byte buf[20];
4040
uint16_t len;
4141
uint16_t pos;
4242
} mp_lexer_file_buf_t;
4343

44-
static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
44+
STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
4545
if (fb->pos >= fb->len) {
4646
if (fb->len < sizeof(fb->buf)) {
47-
return MP_LEXER_CHAR_EOF;
47+
return MP_LEXER_EOF;
4848
} else {
4949
UINT n;
5050
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
5151
if (n == 0) {
52-
return MP_LEXER_CHAR_EOF;
52+
return MP_LEXER_EOF;
5353
}
5454
fb->len = n;
5555
fb->pos = 0;
@@ -58,7 +58,7 @@ static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
5858
return fb->buf[fb->pos++];
5959
}
6060

61-
static void file_buf_close(mp_lexer_file_buf_t *fb) {
61+
STATIC void file_buf_close(mp_lexer_file_buf_t *fb) {
6262
f_close(&fb->fp);
6363
m_del_obj(mp_lexer_file_buf_t, fb);
6464
}
@@ -74,5 +74,5 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
7474
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
7575
fb->len = n;
7676
fb->pos = 0;
77-
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
77+
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
7878
}

0 commit comments

Comments
 (0)