4545struct _mp_lexer_t {
4646 qstr source_name ; // name of source
4747 void * stream_data ; // data for stream
48- mp_lexer_stream_next_char_t stream_next_char ; // stream callback to get next char
48+ mp_lexer_stream_next_byte_t stream_next_byte ; // stream callback to get next byte
4949 mp_lexer_stream_close_t stream_close ; // stream callback to free
5050
5151 unichar chr0 , chr1 , chr2 ; // current cached characters from source
@@ -103,7 +103,7 @@ void mp_token_show(const mp_token_t *tok) {
103103#define CUR_CHAR (lex ) ((lex)->chr0)
104104
105105STATIC bool is_end (mp_lexer_t * lex ) {
106- return lex -> chr0 == MP_LEXER_CHAR_EOF ;
106+ return lex -> chr0 == MP_LEXER_EOF ;
107107}
108108
109109STATIC bool is_physical_newline (mp_lexer_t * lex ) {
@@ -171,7 +171,7 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
171171}
172172
173173STATIC void next_char (mp_lexer_t * lex ) {
174- if (lex -> chr0 == MP_LEXER_CHAR_EOF ) {
174+ if (lex -> chr0 == MP_LEXER_EOF ) {
175175 return ;
176176 }
177177
@@ -200,10 +200,10 @@ STATIC void next_char(mp_lexer_t *lex) {
200200 for (; advance > 0 ; advance -- ) {
201201 lex -> chr0 = lex -> chr1 ;
202202 lex -> chr1 = lex -> chr2 ;
203- lex -> chr2 = lex -> stream_next_char (lex -> stream_data );
204- if (lex -> chr2 == MP_LEXER_CHAR_EOF ) {
203+ lex -> chr2 = lex -> stream_next_byte (lex -> stream_data );
204+ if (lex -> chr2 == MP_LEXER_EOF ) {
205205 // EOF
206- if (lex -> chr1 != MP_LEXER_CHAR_EOF && lex -> chr1 != '\n' && lex -> chr1 != '\r' ) {
206+ if (lex -> chr1 != MP_LEXER_EOF && lex -> chr1 != '\n' && lex -> chr1 != '\r' ) {
207207 lex -> chr2 = '\n' ; // insert newline at end of file
208208 }
209209 }
@@ -491,8 +491,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
491491 vstr_add_char (& lex -> vstr , '\\' );
492492 } else {
493493 switch (c ) {
494- case MP_LEXER_CHAR_EOF : break ; // TODO a proper error message?
495- case '\n' : c = MP_LEXER_CHAR_EOF ; break ; // TODO check this works correctly (we are supposed to ignore it
494+ case MP_LEXER_EOF : break ; // TODO a proper error message?
495+ case '\n' : c = MP_LEXER_EOF ; break ; // TODO check this works correctly (we are supposed to ignore it
496496 case '\\' : break ;
497497 case '\'' : break ;
498498 case '"' : break ;
@@ -546,7 +546,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
546546 break ;
547547 }
548548 }
549- if (c != MP_LEXER_CHAR_EOF ) {
549+ if (c != MP_LEXER_EOF ) {
550550 if (c < 0x110000 && !is_bytes ) {
551551 vstr_add_char (& lex -> vstr , c );
552552 } else if (c < 0x100 && is_bytes ) {
@@ -556,7 +556,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
556556 }
557557 }
558558 } else {
559- vstr_add_char (& lex -> vstr , CUR_CHAR (lex ));
559+ // Add the "character" as a byte so that we remain 8-bit clean.
560+ // This way, strings are parsed correctly whether or not they contain utf-8 chars.
561+ vstr_add_byte (& lex -> vstr , CUR_CHAR (lex ));
560562 }
561563 }
562564 next_char (lex );
@@ -728,7 +730,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
728730 }
729731}
730732
731- mp_lexer_t * mp_lexer_new (qstr src_name , void * stream_data , mp_lexer_stream_next_char_t stream_next_char , mp_lexer_stream_close_t stream_close ) {
733+ mp_lexer_t * mp_lexer_new (qstr src_name , void * stream_data , mp_lexer_stream_next_byte_t stream_next_byte , mp_lexer_stream_close_t stream_close ) {
732734 mp_lexer_t * lex = m_new_maybe (mp_lexer_t , 1 );
733735
734736 // check for memory allocation error
@@ -741,7 +743,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
741743
742744 lex -> source_name = src_name ;
743745 lex -> stream_data = stream_data ;
744- lex -> stream_next_char = stream_next_char ;
746+ lex -> stream_next_byte = stream_next_byte ;
745747 lex -> stream_close = stream_close ;
746748 lex -> line = 1 ;
747749 lex -> column = 1 ;
@@ -762,18 +764,18 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
762764 lex -> indent_level [0 ] = 0 ;
763765
764766 // preload characters
765- lex -> chr0 = stream_next_char (stream_data );
766- lex -> chr1 = stream_next_char (stream_data );
767- lex -> chr2 = stream_next_char (stream_data );
767+ lex -> chr0 = stream_next_byte (stream_data );
768+ lex -> chr1 = stream_next_byte (stream_data );
769+ lex -> chr2 = stream_next_byte (stream_data );
768770
769771 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
770- if (lex -> chr0 == MP_LEXER_CHAR_EOF ) {
772+ if (lex -> chr0 == MP_LEXER_EOF ) {
771773 lex -> chr0 = '\n' ;
772- } else if (lex -> chr1 == MP_LEXER_CHAR_EOF ) {
774+ } else if (lex -> chr1 == MP_LEXER_EOF ) {
773775 if (lex -> chr0 != '\n' && lex -> chr0 != '\r' ) {
774776 lex -> chr1 = '\n' ;
775777 }
776- } else if (lex -> chr2 == MP_LEXER_CHAR_EOF ) {
778+ } else if (lex -> chr2 == MP_LEXER_EOF ) {
777779 if (lex -> chr1 != '\n' && lex -> chr1 != '\r' ) {
778780 lex -> chr2 = '\n' ;
779781 }
0 commit comments