@@ -111,6 +111,145 @@ mod builtins {
111111 _feature_version : OptionalArg < i32 > ,
112112 }
113113
114+ /// Detect PEP 263 encoding cookie from source bytes.
115+ /// Checks first two lines for `# coding[:=] <encoding>` pattern.
116+ /// Returns the encoding name if found, or None for default (UTF-8).
117+ #[ cfg( feature = "parser" ) ]
/// Scanning mirrors CPython's tokenizer: every `coding` occurrence in the
/// comment is tried until one is followed by `:` or `=` and a non-empty name.
fn detect_source_encoding(source: &[u8]) -> Option<String> {
    /// UTF-8 byte order mark that may precede the first line.
    const BOM: &[u8] = b"\xef\xbb\xbf";
    /// Keyword that introduces the encoding name in a PEP 263 cookie.
    const MARKER: &[u8] = b"coding";

    /// Bytes allowed before the `#` of a cookie line (space, tab, form feed, CR).
    fn is_leading_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\x0c' | b'\r')
    }

    /// Bytes allowed in an encoding name: `[-\w.]` restricted to ASCII.
    fn is_name_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.')
    }

    /// Extract the encoding name from a single line, if it is a comment line
    /// containing `coding[:=] <name>`.
    fn find_encoding_in_line(line: &[u8]) -> Option<String> {
        // PEP 263: '#' must be preceded only by whitespace/formfeed.
        let hash_pos = line.iter().position(|&b| b == b'#')?;
        if !line[..hash_pos].iter().copied().all(is_leading_blank) {
            return None;
        }
        let comment = &line[hash_pos..];

        // Try each "coding" occurrence in turn; words such as "decoding" that
        // are not followed by ':' or '=' must not end the search.
        comment
            .windows(MARKER.len())
            .enumerate()
            .filter(|&(_, window)| window == MARKER)
            .find_map(|(start, _)| {
                let (&separator, tail) = comment[start + MARKER.len()..].split_first()?;
                if separator != b':' && separator != b'=' {
                    return None;
                }
                // Skip blanks after the separator, then read the name.
                let name: String = tail
                    .iter()
                    .copied()
                    .skip_while(|&b| b == b' ' || b == b'\t')
                    .take_while(|&b| is_name_byte(b))
                    .map(char::from)
                    .collect();
                (!name.is_empty()).then_some(name)
            })
    }

    // Only the first two lines may carry the cookie; the third piece is the
    // untouched remainder of the file.
    let mut lines = source.splitn(3, |&b| b == b'\n');

    let first = lines.next()?;
    // A BOM in front of the first line must not hide a cookie on that line.
    let first = first.strip_prefix(BOM).unwrap_or(first);
    if let Some(enc) = find_encoding_in_line(first) {
        return Some(enc);
    }

    // The second line is consulted only when the first one is blank or a comment.
    let first_is_code = first
        .iter()
        .copied()
        .find(|&b| !is_leading_blank(b))
        .is_some_and(|b| b != b'#');
    if first_is_code {
        return None;
    }

    lines.next().and_then(find_encoding_in_line)
}
181+
/// Check if an encoding name is a UTF-8 variant after normalization.
/// Matches: utf-8, utf_8, utf8, UTF-8, etc.
///
/// Helper for `decode_source_bytes`, which decodes source bytes to a string,
/// handling PEP 263 encoding declarations and BOM, and raises SyntaxError for
/// invalid UTF-8 without an encoding declaration (matching CPython behavior).
187+ #[ cfg( feature = "parser" ) ]
/// Suffixed spellings such as `utf-8-unix`, `utf-8-dos` or `utf_8_sig` are
/// also treated as UTF-8, as CPython's tokenizer does for `utf-8-*`.
fn is_utf8_encoding(name: &str) -> bool {
    // Normalize the way CPython does: lowercase, '_' treated as '-'.
    let normalized: String = name
        .chars()
        .map(|c| if c == '_' { '-' } else { c.to_ascii_lowercase() })
        .collect();

    // Canonical name, optionally followed by an Emacs-style "-<suffix>".
    if normalized == "utf-8" || normalized.starts_with("utf-8-") {
        return true;
    }

    // Separator-free spellings ("utf8", "UTF8") and any other placement of
    // '-'/'_' that collapses to "utf8" remain accepted.
    normalized.chars().filter(|&c| c != '-').eq("utf8".chars())
}
192+
193+ #[ cfg( feature = "parser" ) ]
194+ fn decode_source_bytes ( source : & [ u8 ] , filename : & str , vm : & VirtualMachine ) -> PyResult < String > {
195+ let has_bom = source. starts_with ( b"\xef \xbb \xbf " ) ;
196+ let encoding = detect_source_encoding ( source) ;
197+
198+ let is_utf8 = encoding. as_deref ( ) . is_none_or ( is_utf8_encoding) ;
199+
200+ // Validate BOM + encoding combination
201+ if has_bom && !is_utf8 {
202+ return Err ( vm. new_exception_msg (
203+ vm. ctx . exceptions . syntax_error . to_owned ( ) ,
204+ format ! ( "encoding problem for '{filename}': utf-8" ) . into ( ) ,
205+ ) ) ;
206+ }
207+
208+ if is_utf8 {
209+ let src = if has_bom { & source[ 3 ..] } else { source } ;
210+ match core:: str:: from_utf8 ( src) {
211+ Ok ( s) => Ok ( s. to_owned ( ) ) ,
212+ Err ( e) => {
213+ let bad_byte = src[ e. valid_up_to ( ) ] ;
214+ let line = src[ ..e. valid_up_to ( ) ]
215+ . iter ( )
216+ . filter ( |& & b| b == b'\n' )
217+ . count ( )
218+ + 1 ;
219+ Err ( vm. new_exception_msg (
220+ vm. ctx . exceptions . syntax_error . to_owned ( ) ,
221+ format ! (
222+ "Non-UTF-8 code starting with '\\ x{bad_byte:02x}' \
223+ on line {line}, but no encoding declared; \
224+ see https://peps.python.org/pep-0263/ for details \
225+ ({filename}, line {line})"
226+ )
227+ . into ( ) ,
228+ ) )
229+ }
230+ }
231+ } else {
232+ // Use codec registry for non-UTF-8 encodings
233+ let enc = encoding. as_deref ( ) . unwrap ( ) ;
234+ let bytes_obj = vm. ctx . new_bytes ( source. to_vec ( ) ) ;
235+ let decoded = vm
236+ . state
237+ . codec_registry
238+ . decode_text ( bytes_obj. into ( ) , enc, None , vm)
239+ . map_err ( |exc| {
240+ if exc. fast_isinstance ( vm. ctx . exceptions . lookup_error ) {
241+ vm. new_exception_msg (
242+ vm. ctx . exceptions . syntax_error . to_owned ( ) ,
243+ format ! ( "unknown encoding for '{filename}': {enc}" ) . into ( ) ,
244+ )
245+ } else {
246+ exc
247+ }
248+ } ) ?;
249+ Ok ( decoded. to_string_lossy ( ) . into_owned ( ) )
250+ }
251+ }
252+
114253 #[ cfg( any( feature = "parser" , feature = "compiler" ) ) ]
115254 #[ pyfunction]
116255 fn compile ( args : CompileArgs , vm : & VirtualMachine ) -> PyResult {
@@ -203,9 +342,8 @@ mod builtins {
203342 let source = ArgStrOrBytesLike :: try_from_object ( vm, args. source ) ?;
204343 let source = source. borrow_bytes ( ) ;
205344
206- // TODO: compiler::compile should probably get bytes
207- let source = core:: str:: from_utf8 ( & source)
208- . map_err ( |e| vm. new_unicode_decode_error ( e. to_string ( ) ) ) ?;
345+ let source = decode_source_bytes ( & source, & args. filename . to_string_lossy ( ) , vm) ?;
346+ let source = source. as_str ( ) ;
209347
210348 let flags = args. flags . map_or ( Ok ( 0 ) , |v| v. try_to_primitive ( vm) ) ?;
211349
0 commit comments