Skip to content

Commit a4c52c5

Browse files
committed
py: Optimise lexer by exposing lexer type.
The mp_lexer_t type is exposed, the mp_token_t type is removed, and simple lexer functions (like checking the current token kind) are now inlined. This saves 784 bytes of ROM on 32-bit unix, 348 bytes on stmhal, and 460 bytes on bare-arm. It also saves a tiny bit of RAM since mp_lexer_t is a bit smaller. The lexer will also run a bit more efficiently.
1 parent 41c07d5 commit a4c52c5

11 files changed

Lines changed: 123 additions & 153 deletions

File tree

bare-arm/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ void do_str(const char *src) {
3232
}
3333

3434
// parse okay
35-
qstr source_name = mp_lexer_source_name(lex);
35+
qstr source_name = lex->source_name;
3636
mp_lexer_free(lex);
3737
mp_obj_t module_fun = mp_compile(pn, source_name, MP_EMIT_OPT_NONE, true);
3838

py/builtinimport.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ STATIC void do_load(mp_obj_t module_obj, vstr_t *file) {
127127
}
128128

129129
#if MICROPY_PY___FILE__
130-
qstr source_name = mp_lexer_source_name(lex);
130+
qstr source_name = lex->source_name;
131131
mp_store_attr(module_obj, MP_QSTR___file__, MP_OBJ_NEW_QSTR(source_name));
132132
#endif
133133

py/lexer.c

Lines changed: 57 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -42,32 +42,10 @@
4242
// TODO seems that CPython allows NULL byte in the input stream
4343
// don't know if that's intentional or not, but we don't allow it
4444

45-
struct _mp_lexer_t {
46-
qstr source_name; // name of source
47-
void *stream_data; // data for stream
48-
mp_lexer_stream_next_byte_t stream_next_byte; // stream callback to get next byte
49-
mp_lexer_stream_close_t stream_close; // stream callback to free
50-
51-
unichar chr0, chr1, chr2; // current cached characters from source
52-
53-
mp_uint_t line; // source line
54-
mp_uint_t column; // source column
55-
56-
mp_int_t emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57-
mp_int_t nested_bracket_level; // >0 when there are nested brackets over multiple lines
58-
59-
mp_uint_t alloc_indent_level;
60-
mp_uint_t num_indent_level;
61-
uint16_t *indent_level;
62-
63-
vstr_t vstr;
64-
mp_token_t tok_cur;
65-
};
66-
6745
mp_uint_t mp_optimise_value;
6846

6947
// TODO replace with a call to a standard function
70-
bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
48+
STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
7149
mp_uint_t i = 0;
7250

7351
while (i < len && *str == *strn) {
@@ -79,27 +57,6 @@ bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
7957
return i == len && *str == 0;
8058
}
8159

82-
#ifdef MICROPY_DEBUG_PRINTERS
83-
void mp_token_show(const mp_token_t *tok) {
84-
printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:" UINT_FMT, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
85-
if (tok->str != NULL && tok->len > 0) {
86-
const byte *i = (const byte *)tok->str;
87-
const byte *j = (const byte *)i + tok->len;
88-
printf(" ");
89-
while (i < j) {
90-
unichar c = utf8_get_char(i);
91-
i = utf8_next_char(i);
92-
if (unichar_isprint(c)) {
93-
printf("%c", c);
94-
} else {
95-
printf("?");
96-
}
97-
}
98-
}
99-
printf("\n");
100-
}
101-
#endif
102-
10360
#define CUR_CHAR(lex) ((lex)->chr0)
10461

10562
STATIC bool is_end(mp_lexer_t *lex) {
@@ -210,7 +167,7 @@ STATIC void next_char(mp_lexer_t *lex) {
210167
}
211168
}
212169

213-
void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
170+
STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
214171
if (lex->num_indent_level >= lex->alloc_indent_level) {
215172
// TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
216173
lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
@@ -219,11 +176,11 @@ void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
219176
lex->indent_level[lex->num_indent_level++] = indent;
220177
}
221178

222-
mp_uint_t indent_top(mp_lexer_t *lex) {
179+
STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
223180
return lex->indent_level[lex->num_indent_level - 1];
224181
}
225182

226-
void indent_pop(mp_lexer_t *lex) {
183+
STATIC void indent_pop(mp_lexer_t *lex) {
227184
lex->num_indent_level -= 1;
228185
}
229186

@@ -335,7 +292,10 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
335292
return true;
336293
}
337294

338-
STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
295+
STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
296+
// start new token text
297+
vstr_reset(&lex->vstr);
298+
339299
// skip white space and comments
340300
bool had_physical_newline = false;
341301
while (!is_end(lex)) {
@@ -355,12 +315,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
355315
next_char(lex);
356316
if (!is_physical_newline(lex)) {
357317
// SyntaxError: unexpected character after line continuation character
358-
tok->src_line = lex->line;
359-
tok->src_column = lex->column;
360-
tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
361-
vstr_reset(&lex->vstr);
362-
tok->str = vstr_str(&lex->vstr);
363-
tok->len = 0;
318+
lex->tok_line = lex->line;
319+
lex->tok_column = lex->column;
320+
lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
364321
return;
365322
} else {
366323
next_char(lex);
@@ -371,29 +328,26 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
371328
}
372329

373330
// set token source information
374-
tok->src_line = lex->line;
375-
tok->src_column = lex->column;
376-
377-
// start new token text
378-
vstr_reset(&lex->vstr);
331+
lex->tok_line = lex->line;
332+
lex->tok_column = lex->column;
379333

380334
if (first_token && lex->line == 1 && lex->column != 1) {
381335
// check that the first token is in the first column
382336
// if first token is not on first line, we get a physical newline and
383337
// this check is done as part of normal indent/dedent checking below
384338
// (done to get equivalence with CPython)
385-
tok->kind = MP_TOKEN_INDENT;
339+
lex->tok_kind = MP_TOKEN_INDENT;
386340

387341
} else if (lex->emit_dent < 0) {
388-
tok->kind = MP_TOKEN_DEDENT;
342+
lex->tok_kind = MP_TOKEN_DEDENT;
389343
lex->emit_dent += 1;
390344

391345
} else if (lex->emit_dent > 0) {
392-
tok->kind = MP_TOKEN_INDENT;
346+
lex->tok_kind = MP_TOKEN_INDENT;
393347
lex->emit_dent -= 1;
394348

395349
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
396-
tok->kind = MP_TOKEN_NEWLINE;
350+
lex->tok_kind = MP_TOKEN_NEWLINE;
397351

398352
mp_uint_t num_spaces = lex->column - 1;
399353
lex->emit_dent = 0;
@@ -407,20 +361,20 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
407361
lex->emit_dent -= 1;
408362
}
409363
if (num_spaces != indent_top(lex)) {
410-
tok->kind = MP_TOKEN_DEDENT_MISMATCH;
364+
lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
411365
}
412366
}
413367

414368
} else if (is_end(lex)) {
415369
if (indent_top(lex) > 0) {
416-
tok->kind = MP_TOKEN_NEWLINE;
370+
lex->tok_kind = MP_TOKEN_NEWLINE;
417371
lex->emit_dent = 0;
418372
while (indent_top(lex) > 0) {
419373
indent_pop(lex);
420374
lex->emit_dent -= 1;
421375
}
422376
} else {
423-
tok->kind = MP_TOKEN_END;
377+
lex->tok_kind = MP_TOKEN_END;
424378
}
425379

426380
} else if (is_char_or(lex, '\'', '\"')
@@ -451,9 +405,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
451405

452406
// set token kind
453407
if (is_bytes) {
454-
tok->kind = MP_TOKEN_BYTES;
408+
lex->tok_kind = MP_TOKEN_BYTES;
455409
} else {
456-
tok->kind = MP_TOKEN_STRING;
410+
lex->tok_kind = MP_TOKEN_STRING;
457411
}
458412

459413
// get first quoting character
@@ -566,14 +520,14 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
566520

567521
// check we got the required end quotes
568522
if (n_closing < num_quotes) {
569-
tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
523+
lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
570524
}
571525

572526
// cut off the end quotes from the token text
573527
vstr_cut_tail_bytes(&lex->vstr, n_closing);
574528

575529
} else if (is_head_of_identifier(lex)) {
576-
tok->kind = MP_TOKEN_NAME;
530+
lex->tok_kind = MP_TOKEN_NAME;
577531

578532
// get first char
579533
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
@@ -586,7 +540,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
586540
}
587541

588542
} else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
589-
tok->kind = MP_TOKEN_NUMBER;
543+
lex->tok_kind = MP_TOKEN_NUMBER;
590544

591545
// get first char
592546
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
@@ -621,9 +575,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
621575
vstr_add_char(&lex->vstr, '.');
622576
next_char(lex);
623577
next_char(lex);
624-
tok->kind = MP_TOKEN_ELLIPSIS;
578+
lex->tok_kind = MP_TOKEN_ELLIPSIS;
625579
} else {
626-
tok->kind = MP_TOKEN_DEL_PERIOD;
580+
lex->tok_kind = MP_TOKEN_DEL_PERIOD;
627581
}
628582

629583
} else {
@@ -645,7 +599,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
645599

646600
if (*t == 0) {
647601
// didn't match any delimiter or operator characters
648-
tok->kind = MP_TOKEN_INVALID;
602+
lex->tok_kind = MP_TOKEN_INVALID;
649603

650604
} else {
651605
// matched a delimiter or operator character
@@ -670,7 +624,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
670624
next_char(lex);
671625
tok_enc_index = t_index;
672626
} else {
673-
tok->kind = MP_TOKEN_INVALID;
627+
lex->tok_kind = MP_TOKEN_INVALID;
674628
goto tok_enc_no_match;
675629
}
676630
break;
@@ -692,37 +646,33 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
692646
}
693647

694648
// set token kind
695-
tok->kind = tok_enc_kind[tok_enc_index];
649+
lex->tok_kind = tok_enc_kind[tok_enc_index];
696650

697651
tok_enc_no_match:
698652

699653
// compute bracket level for implicit line joining
700-
if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
654+
if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
701655
lex->nested_bracket_level += 1;
702-
} else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
656+
} else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
703657
lex->nested_bracket_level -= 1;
704658
}
705659
}
706660
}
707661

708-
// point token text to vstr buffer
709-
tok->str = vstr_str(&lex->vstr);
710-
tok->len = vstr_len(&lex->vstr);
711-
712662
// check for keywords
713-
if (tok->kind == MP_TOKEN_NAME) {
663+
if (lex->tok_kind == MP_TOKEN_NAME) {
714664
// We check for __debug__ here and convert it to its value. This is so
715665
// the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
716666
// need to check for this special token in many places in the compiler.
717667
// TODO improve speed of these string comparisons
718668
//for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
719669
for (mp_int_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
720-
if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
670+
if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
721671
if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
722672
// tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
723-
tok->kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
673+
lex->tok_kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
724674
} else {
725-
tok->kind = MP_TOKEN_KW_FALSE + i;
675+
lex->tok_kind = MP_TOKEN_KW_FALSE + i;
726676
}
727677
break;
728678
}
@@ -782,7 +732,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
782732
}
783733

784734
// preload first token
785-
mp_lexer_next_token_into(lex, &lex->tok_cur, true);
735+
mp_lexer_next_token_into(lex, true);
786736

787737
return lex;
788738
}
@@ -798,18 +748,27 @@ void mp_lexer_free(mp_lexer_t *lex) {
798748
}
799749
}
800750

801-
qstr mp_lexer_source_name(mp_lexer_t *lex) {
802-
return lex->source_name;
803-
}
804-
805751
void mp_lexer_to_next(mp_lexer_t *lex) {
806-
mp_lexer_next_token_into(lex, &lex->tok_cur, false);
807-
}
808-
809-
const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
810-
return &lex->tok_cur;
752+
mp_lexer_next_token_into(lex, false);
811753
}
812754

813-
bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
814-
return lex->tok_cur.kind == kind;
755+
#if MICROPY_DEBUG_PRINTERS
756+
void mp_lexer_show_token(const mp_lexer_t *lex) {
757+
printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%u", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
758+
if (lex->vstr.len > 0) {
759+
const byte *i = (const byte *)lex->vstr.buf;
760+
const byte *j = (const byte *)i + lex->vstr.len;
761+
printf(" ");
762+
while (i < j) {
763+
unichar c = utf8_get_char(i);
764+
i = utf8_next_char(i);
765+
if (unichar_isprint(c)) {
766+
printf("%c", c);
767+
} else {
768+
printf("?");
769+
}
770+
}
771+
}
772+
printf("\n");
815773
}
774+
#endif

0 commit comments

Comments
 (0)