Skip to content

Commit 4c81ba8

Browse files
committed
py: Never intern data of large string/bytes object; add relevant tests.
Previously to this patch all constant string/bytes objects were interned by the compiler, and this lead to crashes when the qstr was too long (noticeable now that qstr length storage defaults to 1 byte). With this patch, long string/bytes objects are never interned, and are referenced directly as constant objects within generated code using load_const_obj.
1 parent dab1385 commit 4c81ba8

5 files changed

Lines changed: 37 additions & 21 deletions

File tree

py/compile.c

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ typedef enum {
4646
#undef DEF_RULE
4747
PN_maximum_number_of,
4848
PN_string, // special node for non-interned string
49+
PN_bytes, // special node for non-interned bytes
4950
} pn_kind_t;
5051

5152
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
@@ -172,6 +173,7 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
172173
break;
173174
#endif
174175
case PN_string:
176+
case PN_bytes:
175177
return pn;
176178
}
177179

@@ -427,6 +429,9 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
427429
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
428430
return true;
429431
}
432+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
433+
return true;
434+
}
430435
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
431436
return false;
432437
}
@@ -475,9 +480,9 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len,
475480
}
476481

477482
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
478-
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
483+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string) || MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
479484
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
480-
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false);
485+
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes));
481486
return;
482487
}
483488

@@ -2151,7 +2156,8 @@ STATIC void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
21512156
} else {
21522157
// for non-REPL, evaluate then discard the expression
21532158
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
2154-
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
2159+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)
2160+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_bytes)) {
21552161
// do nothing with a lonely constant
21562162
} else {
21572163
compile_node(comp, pns->nodes[0]); // just an expression
@@ -2595,8 +2601,12 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
25952601
} else {
25962602
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
25972603
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
2598-
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
2599-
pn_kind = MP_PARSE_NODE_STRING;
2604+
if (MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string) {
2605+
pn_kind = MP_PARSE_NODE_STRING;
2606+
} else {
2607+
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_bytes);
2608+
pn_kind = MP_PARSE_NODE_BYTES;
2609+
}
26002610
n_bytes += (mp_uint_t)pns_string->nodes[1];
26012611
}
26022612
if (i == 0) {
@@ -2608,8 +2618,8 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
26082618
}
26092619

26102620
// concatenate string/bytes
2611-
byte *q_ptr;
2612-
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
2621+
byte *s_dest;
2622+
mp_obj_t obj = mp_obj_str_builder_start(string_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, n_bytes, &s_dest);
26132623
for (int i = 0; i < n; i++) {
26142624
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
26152625
mp_uint_t s_len;
@@ -2622,9 +2632,7 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
26222632
s_dest += (mp_uint_t)pns_string->nodes[1];
26232633
}
26242634
}
2625-
qstr q = qstr_build_end(q_ptr);
2626-
2627-
EMIT_ARG(load_const_str, q, string_kind == MP_PARSE_NODE_BYTES);
2635+
EMIT_ARG(load_const_obj, mp_obj_str_builder_end(obj));
26282636
}
26292637

26302638
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
@@ -2959,7 +2967,9 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
29592967
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
29602968
EMIT_ARG(set_line_number, pns->source_line);
29612969
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
2962-
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]), false);
2970+
EMIT_ARG(load_const_obj, mp_obj_new_str((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false));
2971+
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_bytes) {
2972+
EMIT_ARG(load_const_obj, mp_obj_new_bytes((const byte*)pns->nodes[0], (mp_uint_t)pns->nodes[1]));
29632973
} else {
29642974
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
29652975
if (f == NULL) {

py/parse.c

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ enum {
7070
#undef DEF_RULE
7171
RULE_maximum_number_of,
7272
RULE_string, // special node for non-interned string
73+
RULE_bytes, // special node for non-interned bytes
7374
};
7475

7576
#define ident (RULE_ACT_ALLOW_IDENT)
@@ -176,7 +177,7 @@ void mp_parse_node_free(mp_parse_node_t pn) {
176177
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
177178
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
178179
mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
179-
if (rule_id == RULE_string) {
180+
if (rule_id == RULE_string || rule_id == RULE_bytes) {
180181
m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]);
181182
} else {
182183
bool adjust = ADD_BLANK_NODE(rules[rule_id]);
@@ -225,6 +226,8 @@ void mp_parse_node_print(mp_parse_node_t pn, mp_uint_t indent) {
225226
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
226227
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
227228
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
229+
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_bytes) {
230+
printf("literal bytes(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
228231
} else {
229232
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
230233
#ifdef USE_RULE_NAME
@@ -281,14 +284,14 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
281284
parser->result_stack[parser->result_stack_top++] = pn;
282285
}
283286

284-
STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char *str, mp_uint_t len) {
287+
STATIC void push_result_string_bytes(parser_t *parser, mp_uint_t src_line, mp_uint_t rule_kind, const char *str, mp_uint_t len) {
285288
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
286289
if (pn == NULL) {
287290
memory_error(parser);
288291
return;
289292
}
290293
pn->source_line = src_line;
291-
pn->kind_num_nodes = RULE_string | (2 << 8);
294+
pn->kind_num_nodes = rule_kind | (2 << 8);
292295
char *p = m_new(char, len);
293296
memcpy(p, str, len);
294297
pn->nodes[0] = (mp_int_t)p;
@@ -340,8 +343,8 @@ STATIC void push_result_token(parser_t *parser) {
340343
} else {
341344
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
342345
}
343-
} else if (lex->tok_kind == MP_TOKEN_STRING) {
344-
// Don't automatically intern all strings. doc strings (which are usually large)
346+
} else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) {
347+
// Don't automatically intern all strings/bytes. doc strings (which are usually large)
345348
// will be discarded by the compiler, and so we shouldn't intern them.
346349
qstr qst = MP_QSTR_NULL;
347350
if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
@@ -353,14 +356,12 @@ STATIC void push_result_token(parser_t *parser) {
353356
}
354357
if (qst != MP_QSTR_NULL) {
355358
// qstr exists, make a leaf node
356-
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
359+
pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst);
357360
} else {
358-
// not interned, make a node holding a pointer to the string data
359-
push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len);
361+
// not interned, make a node holding a pointer to the string/bytes data
362+
push_result_string_bytes(parser, lex->tok_line, lex->tok_kind == MP_TOKEN_STRING ? RULE_string : RULE_bytes, lex->vstr.buf, lex->vstr.len);
360363
return;
361364
}
362-
} else if (lex->tok_kind == MP_TOKEN_BYTES) {
363-
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
364365
} else {
365366
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
366367
}

py/qstr.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ qstr qstr_from_str(const char *str) {
148148
}
149149

150150
qstr qstr_from_strn(const char *str, mp_uint_t len) {
151+
assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
151152
qstr q = qstr_find_strn(str, len);
152153
if (q == 0) {
153154
mp_uint_t hash = qstr_compute_hash((const byte*)str, len);

tests/basics/bytes_large.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
b1 = b"long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes"
2+
b2 = b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes"

tests/basics/string_large.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
s1 = "long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string"
2+
s2 = "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string"

0 commit comments

Comments
 (0)