Fix PEP 263 encoding detection per review

youknowone · youknowone · commit 77eade37f1e0 · 2026-03-03T21:27:11.000+09:00
- Validate '#' is preceded only by whitespace/formfeed (PEP 263)
- Normalize UTF-8 encoding aliases (utf-8, utf_8, utf8, etc.)
diff --git a/Lib/test/test_utf8source.py b/Lib/test/test_utf8source.py
@@ -12,7 +12,6 @@ def test_pep3120(self):
             b'\\\xd0\x9f'
         )
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; UnicodeDecodeError
     def test_badsyntax(self):
         try:
             import test.tokenizedata.badsyntax_pep3120  # noqa: F401
@@ -26,7 +25,6 @@ def test_badsyntax(self):
 class BuiltinCompileTests(unittest.TestCase):
 
     # Issue 3574.
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; UnicodeDecodeError
     def test_latin1(self):
         # Allow compile() to read Latin-1 source.
         source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")
diff --git a/crates/vm/src/stdlib/builtins.rs b/crates/vm/src/stdlib/builtins.rs
@@ -111,6 +111,145 @@ mod builtins {
         _feature_version: OptionalArg<i32>,
     }
 
+    /// Detect a PEP 263 encoding cookie in raw source bytes.
+    ///
+    /// Only the first two lines are examined (a leading UTF-8 BOM on the first
+    /// line is ignored), and the second line is consulted only when the first
+    /// is blank or a comment. A cookie is a comment whose `#` is preceded by
+    /// nothing but whitespace and which contains `coding[:=]`, optional
+    /// spaces/tabs, then a name made of ASCII alphanumerics, `-`, `_` or `.`.
+    ///
+    /// Returns the encoding name exactly as written, or `None` for the
+    /// default (UTF-8).
+    #[cfg(feature = "parser")]
+    fn detect_source_encoding(source: &[u8]) -> Option<String> {
+        /// Bytes that may precede the `#` of a cookie comment.
+        fn is_blank(b: u8) -> bool {
+            matches!(b, b' ' | b'\t' | b'\x0c' | b'\r')
+        }
+
+        /// Extract the encoding name from a single line, if it holds a cookie.
+        fn find_encoding_in_line(line: &[u8]) -> Option<String> {
+            // PEP 263: '#' must be preceded only by whitespace/formfeed
+            let hash_pos = line.iter().position(|&b| b == b'#')?;
+            if !line[..hash_pos].iter().copied().all(is_blank) {
+                return None;
+            }
+            let comment = &line[hash_pos..];
+
+            // Try every "coding" occurrence, not just the first: the PEP 263
+            // pattern `.*?coding[:=]` must still match when an earlier
+            // "coding" (e.g. inside "decoding") is not followed by ':' or '='.
+            comment.windows(6).enumerate().find_map(|(pos, window)| {
+                if window != b"coding" {
+                    return None;
+                }
+                // Next char must be ':' or '='
+                let (&sep, rest) = comment[pos + 6..].split_first()?;
+                if sep != b':' && sep != b'=' {
+                    return None;
+                }
+                // Skip spaces/tabs, then read the encoding name: [-\w.]+
+                let name: String = rest
+                    .iter()
+                    .copied()
+                    .skip_while(|&b| b == b' ' || b == b'\t')
+                    .take_while(|&b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.'))
+                    .map(char::from)
+                    .collect();
+                (!name.is_empty()).then_some(name)
+            })
+        }
+
+        // Split into lines (first two only); `splitn` always yields at least
+        // one item, even for empty input.
+        let mut lines = source.splitn(3, |&b| b == b'\n');
+        let first = lines.next()?;
+
+        // Strip BOM if present
+        let first = first.strip_prefix(b"\xef\xbb\xbf").unwrap_or(first);
+        if let Some(enc) = find_encoding_in_line(first) {
+            return Some(enc);
+        }
+
+        // Only check the second line if the first is blank or a comment,
+        // i.e. its first non-blank byte (if any) is '#'.
+        let first_significant = first.iter().copied().find(|&b| !is_blank(b));
+        if matches!(first_significant, Some(b) if b != b'#') {
+            return None;
+        }
+
+        lines.next().and_then(find_encoding_in_line)
+    }
+
+    /// Check whether an encoding name is a UTF-8 spelling.
+    ///
+    /// Hyphens and underscores are ignored and the comparison is ASCII
+    /// case-insensitive, so `utf-8`, `utf_8`, `utf8` and `UTF-8` all match.
+    #[cfg(feature = "parser")]
+    fn is_utf8_encoding(name: &str) -> bool {
+        // Compare byte-wise without building a normalized copy; non-ASCII
+        // bytes are left untouched and therefore never equal "utf8".
+        name.bytes()
+            .filter(|&b| b != b'-' && b != b'_')
+            .map(|b| b.to_ascii_lowercase())
+            .eq(b"utf8".iter().copied())
+    }
+
+    /// Turn raw source bytes into text for the compiler.
+    ///
+    /// The encoding comes from `detect_source_encoding`; no declaration, or one
+    /// accepted by `is_utf8_encoding`, means UTF-8. A leading UTF-8 BOM is
+    /// dropped before UTF-8 validation.
+    ///
+    /// Returns a `SyntaxError` when a BOM is combined with a non-UTF-8
+    /// declaration, when UTF-8 validation fails (the message names the first
+    /// offending byte and its 1-based line), or when the codec registry
+    /// reports a `LookupError` for the declared encoding. Any other codec
+    /// error is passed through unchanged.
+    #[cfg(feature = "parser")]
+    fn decode_source_bytes(source: &[u8], filename: &str, vm: &VirtualMachine) -> PyResult<String> {
+        // `Some(rest)` exactly when the input starts with a UTF-8 BOM.
+        let without_bom = source.strip_prefix(b"\xef\xbb\xbf");
+        let declared = detect_source_encoding(source);
+        // `Some` only when a declaration names something other than UTF-8.
+        let foreign = declared.as_deref().filter(|name| !is_utf8_encoding(name));
+
+        let Some(enc) = foreign else {
+            // UTF-8 path: validate the bytes that follow the optional BOM.
+            let text = without_bom.unwrap_or(source);
+            return core::str::from_utf8(text).map(str::to_owned).map_err(|err| {
+                let offset = err.valid_up_to();
+                let bad_byte = text[offset];
+                // 1-based line number: newlines before the bad byte, plus one.
+                let line = 1 + text[..offset].iter().filter(|&&b| b == b'\n').count();
+                vm.new_exception_msg(
+                    vm.ctx.exceptions.syntax_error.to_owned(),
+                    format!(
+                        "Non-UTF-8 code starting with '\\x{bad_byte:02x}' \
+                         on line {line}, but no encoding declared; \
+                         see https://peps.python.org/pep-0263/ for details \
+                         ({filename}, line {line})"
+                    )
+                    .into(),
+                )
+            });
+        };
+
+        // A BOM promises UTF-8, so it cannot be combined with another encoding.
+        if without_bom.is_some() {
+            return Err(vm.new_exception_msg(
+                vm.ctx.exceptions.syntax_error.to_owned(),
+                format!("encoding problem for '{filename}': utf-8").into(),
+            ));
+        }
+
+        // Use codec registry for non-UTF-8 encodings
+        let bytes_obj = vm.ctx.new_bytes(source.to_vec());
+        let lookup = vm
+            .state
+            .codec_registry
+            .decode_text(bytes_obj.into(), enc, None, vm);
+        let decoded = lookup.map_err(|exc| {
+            // Only a failed codec lookup is rewritten; decode errors pass through.
+            if exc.fast_isinstance(vm.ctx.exceptions.lookup_error) {
+                vm.new_exception_msg(
+                    vm.ctx.exceptions.syntax_error.to_owned(),
+                    format!("unknown encoding for '{filename}': {enc}").into(),
+                )
+            } else {
+                exc
+            }
+        })?;
+        Ok(decoded.to_string_lossy().into_owned())
+    }
+
     #[cfg(any(feature = "parser", feature = "compiler"))]
     #[pyfunction]
     fn compile(args: CompileArgs, vm: &VirtualMachine) -> PyResult {
@@ -203,9 +342,8 @@ mod builtins {
                 let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?;
                 let source = source.borrow_bytes();
 
-                // TODO: compiler::compile should probably get bytes
-                let source = core::str::from_utf8(&source)
-                    .map_err(|e| vm.new_unicode_decode_error(e.to_string()))?;
+                let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?;
+                let source = source.as_str();
 
                 let flags = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?;