4242// TODO seems that CPython allows NULL byte in the input stream
4343// don't know if that's intentional or not, but we don't allow it
4444
// Per-lexer state: the input stream and its callbacks, a three-character
// lookahead window, the current source position, indentation bookkeeping and
// the text/token most recently scanned.
// NOTE(review): every line of this definition carries a '-' marker, and the
// '+' lines further down access lex->tok_line / tok_column / tok_kind, which
// are not declared here -- presumably the struct is now defined elsewhere
// with those members; confirm.
45- struct _mp_lexer_t {
46- qstr source_name ; // name of source
47- void * stream_data ; // data for stream
48- mp_lexer_stream_next_byte_t stream_next_byte ; // stream callback to get next byte
49- mp_lexer_stream_close_t stream_close ; // stream callback to free
50-
51- unichar chr0 , chr1 , chr2 ; // current cached characters from source
52-
53- mp_uint_t line ; // source line
54- mp_uint_t column ; // source column
55-
// emit_dent is signed: <0 means DEDENT tokens are pending, >0 means INDENT
// tokens are pending; it is stepped towards zero as each one is emitted
56- mp_int_t emit_dent ; // non-zero when there are INDENT/DEDENT tokens to emit
57- mp_int_t nested_bracket_level ; // >0 when there are nested brackets over multiple lines
58-
59- mp_uint_t alloc_indent_level ; // allocated capacity of indent_level (grown in indent_push)
60- mp_uint_t num_indent_level ; // number of entries currently in use in indent_level
61- uint16_t * indent_level ; // stack of indentation levels; top is indent_level[num_indent_level - 1]
62-
63- vstr_t vstr ; // text of the token being scanned
64- mp_token_t tok_cur ; // current token, filled in by mp_lexer_next_token_into
65- };
66-
// Optimisation setting consulted by the lexer when it sees the name __debug__:
// the name is turned into the True keyword token when this is 0, and into the
// False keyword token otherwise.
6745mp_uint_t mp_optimise_value ;
6846
6947// TODO replace with a call to a standard function
70- bool str_strn_equal (const char * str , const char * strn , mp_uint_t len ) {
48+ STATIC bool str_strn_equal (const char * str , const char * strn , mp_uint_t len ) {
7149 mp_uint_t i = 0 ;
7250
7351 while (i < len && * str == * strn ) {
@@ -79,27 +57,6 @@ bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
7957 return i == len && * str == 0 ;
8058}
8159
// Debug helper: prints one token as "(line:column) kind:K str:PTR len:N",
// followed by the token text when there is any, then a newline.
// Guarded with #ifdef, so it is compiled whenever MICROPY_DEBUG_PRINTERS is
// defined at all, including when it is defined as 0.
82- #ifdef MICROPY_DEBUG_PRINTERS
83- void mp_token_show (const mp_token_t * tok ) {
84- printf ("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:" UINT_FMT , tok -> src_line , tok -> src_column , tok -> kind , tok -> str , tok -> len );
// only dump the text when the token actually has some
85- if (tok -> str != NULL && tok -> len > 0 ) {
// i walks the token bytes, j is one past the last byte
86- const byte * i = (const byte * )tok -> str ;
87- const byte * j = (const byte * )i + tok -> len ;
88- printf (" " );
89- while (i < j ) {
// decode one character and step to the start of the next one
90- unichar c = utf8_get_char (i );
91- i = utf8_next_char (i );
// characters that are not printable are shown as '?'
92- if (unichar_isprint (c )) {
93- printf ("%c" , c );
94- } else {
95- printf ("?" );
96- }
97- }
98- }
99- printf ("\n" );
100- }
101- #endif
102-
// Current character of the lexer: the first (chr0) of its cached characters.
10360#define CUR_CHAR (lex ) ((lex)->chr0)
10461
10562STATIC bool is_end (mp_lexer_t * lex ) {
@@ -210,7 +167,7 @@ STATIC void next_char(mp_lexer_t *lex) {
210167 }
211168}
212169
213- void indent_push (mp_lexer_t * lex , mp_uint_t indent ) {
170+ STATIC void indent_push (mp_lexer_t * lex , mp_uint_t indent ) {
214171 if (lex -> num_indent_level >= lex -> alloc_indent_level ) {
215172 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
216173 lex -> indent_level = m_renew (uint16_t , lex -> indent_level , lex -> alloc_indent_level , lex -> alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC );
@@ -219,11 +176,11 @@ void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
219176 lex -> indent_level [lex -> num_indent_level ++ ] = indent ;
220177}
221178
// Returns the indentation level on top of the lexer's indent stack, i.e. the
// entry most recently stored by indent_push.
// There is no check for an empty stack: num_indent_level is unsigned, so with
// zero entries the index would wrap around. Presumably callers always keep at
// least one entry on the stack -- confirm against the lexer set-up code.
// The '+' line only adds the STATIC qualifier to the signature.
222- mp_uint_t indent_top (mp_lexer_t * lex ) {
179+ STATIC mp_uint_t indent_top (mp_lexer_t * lex ) {
223180 return lex -> indent_level [lex -> num_indent_level - 1 ];
224181}
225182
// Drops the top entry of the lexer's indent stack by decrementing the entry
// count; the indent_level array itself is left untouched.
// There is no underflow check: popping with zero entries would wrap the
// unsigned count. Presumably callers only pop while an entry above the base
// level exists -- confirm.
// The '+' line only adds the STATIC qualifier to the signature.
226- void indent_pop (mp_lexer_t * lex ) {
183+ STATIC void indent_pop (mp_lexer_t * lex ) {
227184 lex -> num_indent_level -= 1 ;
228185}
229186
@@ -335,7 +292,10 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
335292 return true;
336293}
337294
338- STATIC void mp_lexer_next_token_into (mp_lexer_t * lex , mp_token_t * tok , bool first_token ) {
295+ STATIC void mp_lexer_next_token_into (mp_lexer_t * lex , bool first_token ) {
296+ // start new token text
297+ vstr_reset (& lex -> vstr );
298+
339299 // skip white space and comments
340300 bool had_physical_newline = false;
341301 while (!is_end (lex )) {
@@ -355,12 +315,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
355315 next_char (lex );
356316 if (!is_physical_newline (lex )) {
357317 // SyntaxError: unexpected character after line continuation character
358- tok -> src_line = lex -> line ;
359- tok -> src_column = lex -> column ;
360- tok -> kind = MP_TOKEN_BAD_LINE_CONTINUATION ;
361- vstr_reset (& lex -> vstr );
362- tok -> str = vstr_str (& lex -> vstr );
363- tok -> len = 0 ;
318+ lex -> tok_line = lex -> line ;
319+ lex -> tok_column = lex -> column ;
320+ lex -> tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION ;
364321 return ;
365322 } else {
366323 next_char (lex );
@@ -371,29 +328,26 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
371328 }
372329
373330 // set token source information
374- tok -> src_line = lex -> line ;
375- tok -> src_column = lex -> column ;
376-
377- // start new token text
378- vstr_reset (& lex -> vstr );
331+ lex -> tok_line = lex -> line ;
332+ lex -> tok_column = lex -> column ;
379333
380334 if (first_token && lex -> line == 1 && lex -> column != 1 ) {
381335 // check that the first token is in the first column
382336 // if first token is not on first line, we get a physical newline and
383337 // this check is done as part of normal indent/dedent checking below
384338 // (done to get equivalence with CPython)
385- tok -> kind = MP_TOKEN_INDENT ;
339+ lex -> tok_kind = MP_TOKEN_INDENT ;
386340
387341 } else if (lex -> emit_dent < 0 ) {
388- tok -> kind = MP_TOKEN_DEDENT ;
342+ lex -> tok_kind = MP_TOKEN_DEDENT ;
389343 lex -> emit_dent += 1 ;
390344
391345 } else if (lex -> emit_dent > 0 ) {
392- tok -> kind = MP_TOKEN_INDENT ;
346+ lex -> tok_kind = MP_TOKEN_INDENT ;
393347 lex -> emit_dent -= 1 ;
394348
395349 } else if (had_physical_newline && lex -> nested_bracket_level == 0 ) {
396- tok -> kind = MP_TOKEN_NEWLINE ;
350+ lex -> tok_kind = MP_TOKEN_NEWLINE ;
397351
398352 mp_uint_t num_spaces = lex -> column - 1 ;
399353 lex -> emit_dent = 0 ;
@@ -407,20 +361,20 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
407361 lex -> emit_dent -= 1 ;
408362 }
409363 if (num_spaces != indent_top (lex )) {
410- tok -> kind = MP_TOKEN_DEDENT_MISMATCH ;
364+ lex -> tok_kind = MP_TOKEN_DEDENT_MISMATCH ;
411365 }
412366 }
413367
414368 } else if (is_end (lex )) {
415369 if (indent_top (lex ) > 0 ) {
416- tok -> kind = MP_TOKEN_NEWLINE ;
370+ lex -> tok_kind = MP_TOKEN_NEWLINE ;
417371 lex -> emit_dent = 0 ;
418372 while (indent_top (lex ) > 0 ) {
419373 indent_pop (lex );
420374 lex -> emit_dent -= 1 ;
421375 }
422376 } else {
423- tok -> kind = MP_TOKEN_END ;
377+ lex -> tok_kind = MP_TOKEN_END ;
424378 }
425379
426380 } else if (is_char_or (lex , '\'' , '\"' )
@@ -451,9 +405,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
451405
452406 // set token kind
453407 if (is_bytes ) {
454- tok -> kind = MP_TOKEN_BYTES ;
408+ lex -> tok_kind = MP_TOKEN_BYTES ;
455409 } else {
456- tok -> kind = MP_TOKEN_STRING ;
410+ lex -> tok_kind = MP_TOKEN_STRING ;
457411 }
458412
459413 // get first quoting character
@@ -566,14 +520,14 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
566520
567521 // check we got the required end quotes
568522 if (n_closing < num_quotes ) {
569- tok -> kind = MP_TOKEN_LONELY_STRING_OPEN ;
523+ lex -> tok_kind = MP_TOKEN_LONELY_STRING_OPEN ;
570524 }
571525
572526 // cut off the end quotes from the token text
573527 vstr_cut_tail_bytes (& lex -> vstr , n_closing );
574528
575529 } else if (is_head_of_identifier (lex )) {
576- tok -> kind = MP_TOKEN_NAME ;
530+ lex -> tok_kind = MP_TOKEN_NAME ;
577531
578532 // get first char
579533 vstr_add_char (& lex -> vstr , CUR_CHAR (lex ));
@@ -586,7 +540,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
586540 }
587541
588542 } else if (is_digit (lex ) || (is_char (lex , '.' ) && is_following_digit (lex ))) {
589- tok -> kind = MP_TOKEN_NUMBER ;
543+ lex -> tok_kind = MP_TOKEN_NUMBER ;
590544
591545 // get first char
592546 vstr_add_char (& lex -> vstr , CUR_CHAR (lex ));
@@ -621,9 +575,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
621575 vstr_add_char (& lex -> vstr , '.' );
622576 next_char (lex );
623577 next_char (lex );
624- tok -> kind = MP_TOKEN_ELLIPSIS ;
578+ lex -> tok_kind = MP_TOKEN_ELLIPSIS ;
625579 } else {
626- tok -> kind = MP_TOKEN_DEL_PERIOD ;
580+ lex -> tok_kind = MP_TOKEN_DEL_PERIOD ;
627581 }
628582
629583 } else {
@@ -645,7 +599,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
645599
646600 if (* t == 0 ) {
647601 // didn't match any delimiter or operator characters
648- tok -> kind = MP_TOKEN_INVALID ;
602+ lex -> tok_kind = MP_TOKEN_INVALID ;
649603
650604 } else {
651605 // matched a delimiter or operator character
@@ -670,7 +624,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
670624 next_char (lex );
671625 tok_enc_index = t_index ;
672626 } else {
673- tok -> kind = MP_TOKEN_INVALID ;
627+ lex -> tok_kind = MP_TOKEN_INVALID ;
674628 goto tok_enc_no_match ;
675629 }
676630 break ;
@@ -692,37 +646,33 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
692646 }
693647
694648 // set token kind
695- tok -> kind = tok_enc_kind [tok_enc_index ];
649+ lex -> tok_kind = tok_enc_kind [tok_enc_index ];
696650
697651 tok_enc_no_match :
698652
699653 // compute bracket level for implicit line joining
700- if (tok -> kind == MP_TOKEN_DEL_PAREN_OPEN || tok -> kind == MP_TOKEN_DEL_BRACKET_OPEN || tok -> kind == MP_TOKEN_DEL_BRACE_OPEN ) {
654+ if (lex -> tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex -> tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex -> tok_kind == MP_TOKEN_DEL_BRACE_OPEN ) {
701655 lex -> nested_bracket_level += 1 ;
702- } else if (tok -> kind == MP_TOKEN_DEL_PAREN_CLOSE || tok -> kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok -> kind == MP_TOKEN_DEL_BRACE_CLOSE ) {
656+ } else if (lex -> tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex -> tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex -> tok_kind == MP_TOKEN_DEL_BRACE_CLOSE ) {
703657 lex -> nested_bracket_level -= 1 ;
704658 }
705659 }
706660 }
707661
708- // point token text to vstr buffer
709- tok -> str = vstr_str (& lex -> vstr );
710- tok -> len = vstr_len (& lex -> vstr );
711-
712662 // check for keywords
713- if (tok -> kind == MP_TOKEN_NAME ) {
663+ if (lex -> tok_kind == MP_TOKEN_NAME ) {
714664 // We check for __debug__ here and convert it to its value. This is so
715665 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
716666 // need to check for this special token in many places in the compiler.
717667 // TODO improve speed of these string comparisons
718668 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
719669 for (mp_int_t i = 0 ; i < MP_ARRAY_SIZE (tok_kw ); i ++ ) {
720- if (str_strn_equal (tok_kw [i ], tok -> str , tok -> len )) {
670+ if (str_strn_equal (tok_kw [i ], lex -> vstr . buf , lex -> vstr . len )) {
721671 if (i == MP_ARRAY_SIZE (tok_kw ) - 1 ) {
722672 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
723- tok -> kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE );
673+ lex -> tok_kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE );
724674 } else {
725- tok -> kind = MP_TOKEN_KW_FALSE + i ;
675+ lex -> tok_kind = MP_TOKEN_KW_FALSE + i ;
726676 }
727677 break ;
728678 }
@@ -782,7 +732,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
782732 }
783733
784734 // preload first token
785- mp_lexer_next_token_into (lex , & lex -> tok_cur , true);
735+ mp_lexer_next_token_into (lex , true);
786736
787737 return lex ;
788738}
@@ -798,18 +748,27 @@ void mp_lexer_free(mp_lexer_t *lex) {
798748 }
799749}
800750
// Accessor returning the source_name qstr stored in the lexer.
// All lines of this function carry a '-' marker.
801- qstr mp_lexer_source_name (mp_lexer_t * lex ) {
802- return lex -> source_name ;
803- }
804-
// Advances the lexer to the next token by calling mp_lexer_next_token_into
// with first_token == false.
// The '-' lines are the earlier form, which passed &lex->tok_cur as the
// destination token, together with the mp_lexer_cur accessor that returned a
// pointer to that token. The '+' line calls the two-argument form, which
// takes no destination token.
805751void mp_lexer_to_next (mp_lexer_t * lex ) {
806- mp_lexer_next_token_into (lex , & lex -> tok_cur , false);
807- }
808-
809- const mp_token_t * mp_lexer_cur (const mp_lexer_t * lex ) {
810- return & lex -> tok_cur ;
752+ mp_lexer_next_token_into (lex , false);
811753}
812754
813- bool mp_lexer_is_kind (mp_lexer_t * lex , mp_token_kind_t kind ) {
814- return lex -> tok_cur .kind == kind ;
// Debug helper: prints the lexer's current token as
// "(line:column) kind:K str:PTR len:N", followed by the token text when it is
// non-empty, then a newline. Position and kind come from tok_line, tok_column
// and tok_kind; the text comes from the lexer's vstr buffer.
// Guarded with #if, so it is compiled only when MICROPY_DEBUG_PRINTERS
// evaluates to non-zero.
755+ #if MICROPY_DEBUG_PRINTERS
756+ void mp_lexer_show_token (const mp_lexer_t * lex ) {
// NOTE(review): the length is printed with %u here, whereas mp_token_show
// printed its length with UINT_FMT -- confirm that %u matches the declared
// type of vstr.len.
757+ printf ("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%u" , lex -> tok_line , lex -> tok_column , lex -> tok_kind , lex -> vstr .buf , lex -> vstr .len );
// only dump the text when the token has some
758+ if (lex -> vstr .len > 0 ) {
// i walks the token bytes, j is one past the last byte
759+ const byte * i = (const byte * )lex -> vstr .buf ;
760+ const byte * j = (const byte * )i + lex -> vstr .len ;
761+ printf (" " );
762+ while (i < j ) {
// decode one character and step to the start of the next one
763+ unichar c = utf8_get_char (i );
764+ i = utf8_next_char (i );
// characters that are not printable are shown as '?'
765+ if (unichar_isprint (c )) {
766+ printf ("%c" , c );
767+ } else {
768+ printf ("?" );
769+ }
770+ }
771+ }
772+ printf ("\n" );
815773}
774+ #endif
0 commit comments