diff --git a/crates/vm/src/stdlib/builtins.rs b/crates/vm/src/stdlib/builtins.rs index fd35b287211..40d1e84103a 100644 --- a/crates/vm/src/stdlib/builtins.rs +++ b/crates/vm/src/stdlib/builtins.rs @@ -315,9 +315,11 @@ mod builtins { } #[cfg(feature = "rustpython-codegen")] { - let mode = mode_str - .parse::() - .map_err(|err| vm.new_value_error(err.to_string()))?; + use crate::compiler::Mode; + + let mode = mode_str.parse::().map_err(|_| { + vm.new_value_error("compile() mode must be 'exec', 'eval' or 'single'") + })?; return _ast::compile( vm, args.source, @@ -339,11 +341,31 @@ mod builtins { use ruff_python_parser as parser; - let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?; - let source = source.borrow_bytes(); - - let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?; - let source = source.as_str(); + // CPython parity: encoding declarations (`# -*- coding: ... -*-`) + // only apply to bytes input. For `str` input the source is + // already decoded, so the declaration must be ignored — + // including the "unknown encoding" error path. + let source_string: String = match args.source.downcast_ref::() { + Some(pystr) => match pystr.to_str() { + Some(s) => s.to_owned(), + // Surrogate-bearing str falls back to the bytes path, + // which raises SyntaxError on the WTF-8 encoding. + None => decode_source_bytes( + pystr.as_wtf8().as_bytes(), + &args.filename.to_string_lossy(), + vm, + )?, + }, + None => { + let bytes = ArgBytesLike::try_from_object(vm, args.source.clone())?; + decode_source_bytes( + &bytes.borrow_buf(), + &args.filename.to_string_lossy(), + vm, + )? + } + }; + let source = source_string.as_str(); let flags: i32 = args.flags.map_or(0, |v| v.value); @@ -363,10 +385,22 @@ mod builtins { } #[cfg(feature = "compiler")] { + use crate::compiler::Mode; + + // CPython parity: `func_type` is only valid when + // PyCF_ONLY_AST is set (the `else` branch below). + if mode_str == "func_type" { + return Err(vm.new_value_error( + "compile() mode 'func_type' requires flag PyCF_ONLY_AST", + )); + } + if let Some(feature_version) = feature_version { - let mode = mode_str - .parse::() - .map_err(|err| vm.new_value_error(err.to_string()))?; + let mode = mode_str.parse::().map_err(|_| { + vm.new_value_error( + "compile() mode must be 'exec', 'eval' or 'single'", + ) + })?; let _ = _ast::parse( vm, source, @@ -378,9 +412,9 @@ mod builtins { .map_err(|e| (e, Some(source), allow_incomplete).to_pyexception(vm))?; } - let mode = mode_str - .parse::() - .map_err(|err| vm.new_value_error(err.to_string()))?; + let mode = mode_str.parse::().map_err(|_| { + vm.new_value_error("compile() mode must be 'exec', 'eval' or 'single'") + })?; let mut opts = vm.compile_opts(); opts.optimize = optimize; @@ -403,9 +437,9 @@ mod builtins { .map_err(|e| (e, Some(source), allow_incomplete).to_pyexception(vm)); } - let mode = mode_str - .parse::() - .map_err(|err| vm.new_value_error(err.to_string()))?; + let mode = mode_str.parse::().map_err(|_| { + vm.new_value_error("compile() mode must be 'exec', 'eval' or 'single'") + })?; let parsed = _ast::parse( vm, source, diff --git a/extra_tests/snippets/builtin_compile.py b/extra_tests/snippets/builtin_compile.py new file mode 100644 index 00000000000..d7cf24ee62d --- /dev/null +++ b/extra_tests/snippets/builtin_compile.py @@ -0,0 +1,44 @@ +from testutils import assert_raises + + +# CPython parity: encoding declarations (`# -*- coding: ... -*-`) only apply +# to bytes input. For `str` source the source is already decoded and the +# declaration must be ignored, including the "unknown encoding" error path. +compile("# -*- coding: badencoding -*-\nx = 1\n", "tmp", "exec") +compile("# -*- coding: latin1 -*-\nx = 1\n", "tmp", "exec") +compile("# -*- coding: utf-8 -*-\nx = 1\n", "tmp", "exec") + +# Bytes input keeps applying the declaration, so a bogus encoding still +# raises SyntaxError. +assert_raises( + SyntaxError, compile, b"# -*- coding: badencoding -*-\nx = 1\n", "tmp", "exec" +) + + +# CPython mode error wording. Both the missing `compile()` prefix and the +# Oxford-comma / quote style come from the parser's generic error message; +# `compile()` overrides it to match CPython exactly. +def _check_mode_error(mode_str): + try: + compile("x = 1", "", mode_str) + except ValueError as e: + assert str(e) == "compile() mode must be 'exec', 'eval' or 'single'", repr(e) + else: + raise AssertionError(f"expected ValueError for mode={mode_str!r}") + + +for bad in ("bogus", "", "BAD", "__bogus__"): + _check_mode_error(bad) + + +# `func_type` is only valid when PyCF_ONLY_AST (= 1024) is set. Plain text +# source without the flag must raise the specific "requires flag" error, +# not the generic "invalid mode" one. +try: + compile("def f(x): pass", "", "func_type") +except ValueError as e: + assert ( + str(e) == "compile() mode 'func_type' requires flag PyCF_ONLY_AST" + ), repr(e) +else: + raise AssertionError("expected ValueError for func_type without PyCF_ONLY_AST")