Skip to content

Commit 77eade3

Browse files
committed
Fix PEP 263 encoding detection per review
- Validate '#' is preceded only by whitespace/formfeed (PEP 263)
- Normalize UTF-8 encoding aliases (utf-8, utf_8, utf8, etc.)
1 parent 33fca0d commit 77eade3

2 files changed

Lines changed: 141 additions & 5 deletions

File tree

Lib/test/test_utf8source.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ def test_pep3120(self):
1212
b'\\\xd0\x9f'
1313
)
1414

15-
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError
1615
def test_badsyntax(self):
1716
try:
1817
import test.tokenizedata.badsyntax_pep3120 # noqa: F401
@@ -26,7 +25,6 @@ def test_badsyntax(self):
2625
class BuiltinCompileTests(unittest.TestCase):
2726

2827
# Issue 3574.
29-
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError
3028
def test_latin1(self):
3129
# Allow compile() to read Latin-1 source.
3230
source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")

crates/vm/src/stdlib/builtins.rs

Lines changed: 141 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,145 @@ mod builtins {
111111
_feature_version: OptionalArg<i32>,
112112
}
113113

114+
/// Detect PEP 263 encoding cookie from source bytes.
115+
/// Checks first two lines for `# coding[:=] <encoding>` pattern.
116+
/// Returns the encoding name if found, or None for default (UTF-8).
117+
#[cfg(feature = "parser")]
118+
/// Scans at most the first two lines of `source` for a PEP 263 declaration.
///
/// A line declares an encoding when it matches the PEP 263 pattern
/// `^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)` (ASCII `\w`). A leading UTF-8 BOM
/// on the first line is ignored, and a `\r` before `#` is tolerated. The
/// second line is only consulted when the first line is blank or a comment.
///
/// Returns the encoding name exactly as written in the source (no
/// normalization), or `None` when no declaration is present.
fn detect_source_encoding(source: &[u8]) -> Option<String> {
    // Bytes that may precede `#` on a declaration line, and that make a line
    // count as "blank" for the first-line check.
    fn is_leading_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\x0c' | b'\r')
    }

    fn find_encoding_in_line(line: &[u8]) -> Option<String> {
        // PEP 263: '#' must be preceded only by whitespace/formfeed.
        let hash_pos = line.iter().position(|&b| b == b'#')?;
        if !line[..hash_pos].iter().copied().all(is_leading_blank) {
            return None;
        }
        let comment = &line[hash_pos..];

        // Try every "coding:" / "coding=" occurrence, not just the first
        // "coding": the reference regex backtracks past occurrences that are
        // not followed by [:=] or by a valid name (e.g. "# decoding; coding: x").
        comment
            .windows(7)
            .enumerate()
            .filter(|(_, w)| w.starts_with(b"coding") && matches!(w[6], b':' | b'='))
            .find_map(|(pos, _)| {
                // Skip [ \t]* and then read the name: [-\w.]+
                let name: String = comment[pos + 7..]
                    .iter()
                    .skip_while(|&&b| b == b' ' || b == b'\t')
                    .take_while(|&&b| {
                        b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.')
                    })
                    .map(|&b| char::from(b))
                    .collect();
                (!name.is_empty()).then_some(name)
            })
    }

    // Only the first two lines can carry a declaration.
    let mut lines = source.splitn(3, |&b| b == b'\n');

    // `splitn` always yields at least one (possibly empty) item.
    let first = lines.next()?;
    // Strip BOM if present.
    let first = first.strip_prefix(b"\xef\xbb\xbf").unwrap_or(first);
    if let Some(enc) = find_encoding_in_line(first) {
        return Some(enc);
    }

    // Only check the second line if the first line is blank or a comment.
    match first.iter().copied().find(|&b| !is_leading_blank(b)) {
        None | Some(b'#') => lines.next().and_then(find_encoding_in_line),
        Some(_) => None,
    }
}
181+
182+
// Documentation for `decode_source_bytes` (defined after `is_utf8_encoding`):
// decodes source bytes to a string, handling PEP 263 encoding declarations
// and BOM. Raises SyntaxError for invalid UTF-8 without an encoding
// declaration (matching CPython behavior).
185+
/// Check if an encoding name is a UTF-8 variant after normalization.
186+
/// Matches: utf-8, utf_8, utf8, UTF-8, etc.
187+
#[cfg(feature = "parser")]
188+
/// Comparison is ASCII case-insensitive and treats `_` like `-`.
///
/// Besides the plain spellings (`utf-8`, `utf_8`, `utf8`), any name starting
/// with `utf-8-` (for example `utf-8-sig`) is also reported as UTF-8,
/// mirroring the normalization CPython's tokenizer applies to encoding
/// cookies. Without this, a BOM-prefixed source declaring `utf-8-sig` would be
/// classified as a non-UTF-8 encoding.
fn is_utf8_encoding(name: &str) -> bool {
    let lowered = name.to_ascii_lowercase().replace('_', "-");
    if lowered.starts_with("utf-8-") {
        return true;
    }
    // Separator-insensitive match against "utf8" (keeps the previously
    // accepted spellings working).
    lowered.chars().filter(|&c| c != '-').eq("utf8".chars())
}
192+
193+
/// Decodes raw Python source bytes into a `String`.
///
/// The encoding comes from the PEP 263 declaration reported by
/// `detect_source_encoding`. With no declaration, or a UTF-8 one, the bytes
/// (minus a leading UTF-8 BOM) are validated as UTF-8. Any other declared
/// encoding is decoded through the VM's codec registry.
///
/// # Errors
///
/// - `SyntaxError` when the source starts with a UTF-8 BOM but declares a
///   non-UTF-8 encoding.
/// - `SyntaxError` when the source is treated as UTF-8 but is not valid UTF-8;
///   the message names the first offending byte and its 1-based line.
/// - `SyntaxError` when the codec lookup raises `LookupError`; every other
///   exception raised by the codec is returned unchanged.
#[cfg(feature = "parser")]
fn decode_source_bytes(source: &[u8], filename: &str, vm: &VirtualMachine) -> PyResult<String> {
    const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

    let declared = detect_source_encoding(source);
    // `Some(name)` only when a declaration exists and it is not a UTF-8 alias.
    let foreign = declared.as_deref().filter(|name| !is_utf8_encoding(name));
    // `Some(rest)` exactly when the source begins with a BOM.
    let without_bom = source.strip_prefix(UTF8_BOM);

    if let Some(enc) = foreign {
        // A UTF-8 BOM contradicts a non-UTF-8 declaration.
        if without_bom.is_some() {
            return Err(vm.new_exception_msg(
                vm.ctx.exceptions.syntax_error.to_owned(),
                format!("encoding problem for '{filename}': utf-8").into(),
            ));
        }

        // Use codec registry for non-UTF-8 encodings.
        let bytes_obj = vm.ctx.new_bytes(source.to_vec());
        let decoded = vm
            .state
            .codec_registry
            .decode_text(bytes_obj.into(), enc, None, vm)
            .map_err(|exc| {
                if !exc.fast_isinstance(vm.ctx.exceptions.lookup_error) {
                    return exc;
                }
                vm.new_exception_msg(
                    vm.ctx.exceptions.syntax_error.to_owned(),
                    format!("unknown encoding for '{filename}': {enc}").into(),
                )
            })?;
        return Ok(decoded.to_string_lossy().into_owned());
    }

    // UTF-8 path: drop the BOM (if any) and validate the remainder.
    let src = without_bom.unwrap_or(source);
    let utf8_err = match core::str::from_utf8(src) {
        Ok(text) => return Ok(text.to_owned()),
        Err(err) => err,
    };

    // `valid_up_to` is the index of the first byte that failed validation.
    let valid_len = utf8_err.valid_up_to();
    let bad_byte = src[valid_len];
    let line = 1 + src[..valid_len].iter().filter(|&&b| b == b'\n').count();
    Err(vm.new_exception_msg(
        vm.ctx.exceptions.syntax_error.to_owned(),
        format!(
            "Non-UTF-8 code starting with '\\x{bad_byte:02x}' \
             on line {line}, but no encoding declared; \
             see https://peps.python.org/pep-0263/ for details \
             ({filename}, line {line})"
        )
        .into(),
    ))
}
252+
114253
#[cfg(any(feature = "parser", feature = "compiler"))]
115254
#[pyfunction]
116255
fn compile(args: CompileArgs, vm: &VirtualMachine) -> PyResult {
@@ -203,9 +342,8 @@ mod builtins {
203342
let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?;
204343
let source = source.borrow_bytes();
205344

206-
// TODO: compiler::compile should probably get bytes
207-
let source = core::str::from_utf8(&source)
208-
.map_err(|e| vm.new_unicode_decode_error(e.to_string()))?;
345+
let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?;
346+
let source = source.as_str();
209347

210348
let flags = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?;
211349

0 commit comments

Comments
 (0)