Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from textwrap import dedent
from unittest import TestCase, mock
from test import support
from test.support import os_helper
from test.support import import_helper, os_helper
from test.support.script_helper import run_test_script, make_script, run_python_until_end
from test.support.numbers import (
VALID_UNDERSCORE_LITERALS,
Expand Down Expand Up @@ -2266,6 +2266,67 @@ def readline(encoding):
))
self.assertEqual(tokens, expected)

# Regression test: run a small script in a child interpreter that drives
# _tokenize.TokenizerIter while _testcapi.set_nomemory() is active, and
# check that the child reports MemoryError and exits with status 0.
# The test is skipped when support.Py_TRACE_REFS is set (reason given in the
# skip message below).
@unittest.skipIf(support.Py_TRACE_REFS,
'_testcapi.set_nomemory() is unreliable with Py_TRACE_REFS')
def test_col_offset_conversion_oom(self):
# The child script imports _testcapi, so make sure it is importable here
# first (import_module is expected to skip the test otherwise -- see the
# test.support.import_helper documentation).
import_helper.import_module('_testcapi')
# Script executed with "-c" in the child process.  It defines:
#   * check_indented_name(start): tokenizes an indented line holding a
#     non-ASCII name, consumes the first five tokens, then calls next()
#     with set_nomemory(start, start + 1) active.
#   * check_multiline_string(start): tokenizes an assignment of a
#     triple-quoted string spanning two lines, consumes the first two
#     tokens, then calls next() with set_nomemory(start, start + 1) active.
#   * check_range(name, func): calls func(index) for index in range(20) and
#     raises AssertionError if no call returned True.
# Both check_* helpers return True when next() raises MemoryError, False
# otherwise, and always call remove_mem_hooks() in a finally clause.
# NOTE(review): the "line" and "raw" labels presumably refer to the two
# byte-offset conversion helpers in the C tokenizer -- confirm.
# The text below is a raw string, so "\n" and "\u00e9" are interpreted by the
# child interpreter rather than by this module.
code = dedent(r"""
import _testcapi
import _tokenize

def check_indented_name(start):
source = "if True:\n \u00e9 = 1\n"
it = _tokenize.TokenizerIter(
iter(source.splitlines(True)).__next__,
extra_tokens=False,
)
for _ in range(5):
next(it)

_testcapi.set_nomemory(start, start + 1)
try:
next(it)
except MemoryError:
return True
finally:
_testcapi.remove_mem_hooks()
return False

def check_multiline_string(start):
source = "x = '''abc\ndef'''\n"
it = _tokenize.TokenizerIter(
iter(source.splitlines(True)).__next__,
extra_tokens=False,
)
next(it)
next(it)

_testcapi.set_nomemory(start, start + 1)
try:
next(it)
except MemoryError:
return True
finally:
_testcapi.remove_mem_hooks()
return False

def check_range(name, func):
seen_memory_error = False
for index in range(20):
if func(index):
seen_memory_error = True
if not seen_memory_error:
raise AssertionError(f"{name}: MemoryError not raised")

check_range("line", check_indented_name)
check_range("raw", check_multiline_string)
print("MemoryError")
""")
# Run the script in a separate interpreter so a crash there cannot take
# down the test runner; SuppressCrashReport wraps the run in case the child
# does crash.
with support.SuppressCrashReport():
res, _ = run_python_until_end("-c", code)
# A non-zero exit status fails the test, with the child's stderr (decoded
# leniently) as the failure message.
self.assertEqual(res.rc, 0, res.err.decode("ascii", "replace"))
# The script prints "MemoryError" only after both check_range() calls
# succeed.
self.assertIn(b"MemoryError", res.out)

def test_int(self):

self.check_tokenize('0xff <= 255', """\
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix a possible crash in ``_tokenize.TokenizerIter`` when memory allocation
fails while converting byte offsets to character offsets for non-ASCII source
lines. The tokenizer now correctly propagates ``MemoryError`` instead of
dereferencing a NULL pointer or returning a result with an exception set.
3 changes: 3 additions & 0 deletions Parser/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ Py_ssize_t
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
{
const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);
if (data == NULL) {
return -1;
}

Py_ssize_t len = 0;
while (col_offset < end_col_offset) {
Expand Down
40 changes: 30 additions & 10 deletions Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,21 +202,27 @@ _get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t si
return line;
}

static void
static int
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
{
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
Py_ssize_t byte_offset = -1;
Py_ssize_t byte_col_offset_diff = it->byte_col_offset_diff;
if (token.start != NULL && token.start >= line_start) {
byte_offset = token.start - line_start;
if (line_changed) {
*col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
it->byte_col_offset_diff = byte_offset - *col_offset;
Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_line(
line, 0, byte_offset);
if (offset < 0) {
return -1;
}
*col_offset = offset;
byte_col_offset_diff = byte_offset - *col_offset;
}
else {
*col_offset = byte_offset - it->byte_col_offset_diff;
*col_offset = byte_offset - byte_col_offset_diff;
}
}

Expand All @@ -226,17 +232,28 @@ _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_s
// If the whole token is at the same line, we can just use the token.start
// buffer for figuring out the new column offset, since using line is not
// performant for very long lines.
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(
line, byte_offset, end_byte_offset);
if (token_col_offset < 0) {
return -1;
}
*end_col_offset = *col_offset + token_col_offset;
it->byte_col_offset_diff += token.end - token.start - token_col_offset;
byte_col_offset_diff += token.end - token.start - token_col_offset;
}
else {
*end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_raw(
it->tok->line_start, end_byte_offset);
if (offset < 0) {
return -1;
}
*end_col_offset = offset;
byte_col_offset_diff += end_byte_offset - *end_col_offset;
}
}
it->byte_col_offset_diff = byte_col_offset_diff;
it->last_lineno = lineno;
it->last_end_lineno = end_lineno;
return 0;
}

static PyObject *
Expand Down Expand Up @@ -301,8 +318,11 @@ tokenizeriter_next(PyObject *op)
Py_ssize_t end_lineno = it->tok->lineno;
Py_ssize_t col_offset = -1;
Py_ssize_t end_col_offset = -1;
_get_col_offsets(it, token, line_start, line, line_changed,
lineno, end_lineno, &col_offset, &end_col_offset);
if (_get_col_offsets(it, token, line_start, line, line_changed,
lineno, end_lineno, &col_offset, &end_col_offset) < 0) {
Py_DECREF(str);
goto exit;
}

if (it->tok->tok_extra_tokens) {
if (is_trailing_token) {
Expand Down
Loading