@@ -37,6 +37,22 @@ mod _json {
3737 count
3838 }
3939
/// Report whether the characters yielded by `chars` begin with `pattern`.
///
/// The comparison is done one `char` at a time, so it can be used on an
/// iterator that has already been advanced by a *character* offset (for
/// example `s.chars().skip(idx)`) without converting that offset into a byte
/// index first.
///
/// Returns `true` when every character of `pattern` is matched in order by
/// the front of `chars`; an empty `pattern` therefore always matches.
/// Returns `false` on the first mismatch or if `chars` ends before `pattern`
/// does.
#[inline]
fn starts_with_chars<I>(mut chars: I, pattern: &str) -> bool
where
    I: Iterator<Item = char>,
{
    // `all` stops at the first mismatch; an exhausted iterator yields `None`,
    // which never equals `Some(expected)`.
    pattern.chars().all(|expected| chars.next() == Some(expected))
}
55+
4056 #[ pyattr( name = "make_scanner" ) ]
4157 #[ pyclass( name = "Scanner" , traverse) ]
4258 #[ derive( Debug , PyPayload ) ]
@@ -202,6 +218,54 @@ mod _json {
202218 Some ( ( ret, buf. len ( ) ) )
203219 }
204220
221+ /// Parse a number from a character iterator.
222+ /// Returns (result, character_count) where character_count is the number of chars consumed.
223+ fn parse_number_from_chars < I > (
224+ & self ,
225+ chars : I ,
226+ vm : & VirtualMachine ,
227+ ) -> Option < ( PyResult , usize ) >
228+ where
229+ I : Iterator < Item = char > ,
230+ {
231+ let mut buf = String :: new ( ) ;
232+ let mut has_neg = false ;
233+ let mut has_decimal = false ;
234+ let mut has_exponent = false ;
235+ let mut has_e_sign = false ;
236+
237+ for c in chars {
238+ let i = buf. len ( ) ;
239+ match c {
240+ '-' if i == 0 => has_neg = true ,
241+ n if n. is_ascii_digit ( ) => { }
242+ '.' if !has_decimal => has_decimal = true ,
243+ 'e' | 'E' if !has_exponent => has_exponent = true ,
244+ '+' | '-' if !has_e_sign => has_e_sign = true ,
245+ _ => break ,
246+ }
247+ buf. push ( c) ;
248+ }
249+
250+ let len = buf. len ( ) ;
251+ if len == 0 || ( len == 1 && has_neg) {
252+ return None ;
253+ }
254+
255+ let ret = if has_decimal || has_exponent {
256+ if let Some ( ref parse_float) = self . parse_float {
257+ parse_float. call ( ( & buf, ) , vm)
258+ } else {
259+ Ok ( vm. ctx . new_float ( f64:: from_str ( & buf) . unwrap ( ) ) . into ( ) )
260+ }
261+ } else if let Some ( ref parse_int) = self . parse_int {
262+ parse_int. call ( ( & buf, ) , vm)
263+ } else {
264+ Ok ( vm. new_pyobj ( BigInt :: from_str ( & buf) . unwrap ( ) ) )
265+ } ;
266+ Some ( ( ret, len) )
267+ }
268+
205269 /// Parse a JSON object starting after the opening '{'.
206270 /// Returns (parsed_object, end_character_index).
207271 fn parse_object (
@@ -458,6 +522,7 @@ mod _json {
458522 }
459523
460524 /// Call scan_once and handle the result.
525+ /// Uses character iterators to avoid byte/char index mismatch with non-ASCII strings.
461526 fn call_scan_once (
462527 & self ,
463528 scan_once : & PyObjectRef ,
@@ -466,100 +531,92 @@ mod _json {
466531 memo : & mut HashMap < String , PyStrRef > ,
467532 vm : & VirtualMachine ,
468533 ) -> PyResult < ( PyObjectRef , usize ) > {
469- // First try to handle common cases directly in Rust
470534 let s = pystr. as_str ( ) ;
471- let mut chars = s. chars ( ) . skip ( idx) . peekable ( ) ;
535+ let chars = s. chars ( ) . skip ( idx) . peekable ( ) ;
472536
473- let remaining = & s[ idx..] ;
537+ let first_char = match chars. clone ( ) . next ( ) {
538+ Some ( c) => c,
539+ None => return Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) ) ,
540+ } ;
474541
475- match chars . peek ( ) {
476- Some ( '"' ) => {
477- // String - parse directly in Rust
542+ match first_char {
543+ '"' => {
544+ // String
478545 let ( wtf8, end) = machinery:: scanstring ( pystr. as_wtf8 ( ) , idx + 1 , self . strict )
479546 . map_err ( |e| py_decode_error ( e, pystr. clone ( ) , vm) ) ?;
480547 let py_str = vm. ctx . new_str ( wtf8. to_string ( ) ) ;
481- return Ok ( ( py_str. into ( ) , end) ) ;
548+ Ok ( ( py_str. into ( ) , end) )
482549 }
483- Some ( '{' ) => {
484- // Nested object - parse recursively in Rust
485- return self . parse_object ( pystr, idx + 1 , scan_once, memo, vm) ;
550+ '{' => {
551+ // Object
552+ self . parse_object ( pystr, idx + 1 , scan_once, memo, vm)
486553 }
487- Some ( '[' ) => {
488- // Nested array - parse recursively in Rust
489- return self . parse_array ( pystr, idx + 1 , scan_once, memo, vm) ;
554+ '[' => {
555+ // Array
556+ self . parse_array ( pystr, idx + 1 , scan_once, memo, vm)
490557 }
491- Some ( 'n' ) => {
492- // null - parse directly in Rust
493- if remaining. starts_with ( "null" ) {
494- return Ok ( ( vm. ctx . none ( ) , idx + 4 ) ) ;
495- }
558+ 'n' if starts_with_chars ( chars. clone ( ) , "null" ) => {
559+ // null
560+ Ok ( ( vm. ctx . none ( ) , idx + 4 ) )
496561 }
497- Some ( 't' ) => {
498- // true - parse directly in Rust
499- if remaining. starts_with ( "true" ) {
500- return Ok ( ( vm. ctx . new_bool ( true ) . into ( ) , idx + 4 ) ) ;
501- }
562+ 't' if starts_with_chars ( chars. clone ( ) , "true" ) => {
563+ // true
564+ Ok ( ( vm. ctx . new_bool ( true ) . into ( ) , idx + 4 ) )
502565 }
503- Some ( 'f' ) => {
504- // false - parse directly in Rust
505- if remaining. starts_with ( "false" ) {
506- return Ok ( ( vm. ctx . new_bool ( false ) . into ( ) , idx + 5 ) ) ;
507- }
566+ 'f' if starts_with_chars ( chars. clone ( ) , "false" ) => {
567+ // false
568+ Ok ( ( vm. ctx . new_bool ( false ) . into ( ) , idx + 5 ) )
508569 }
509- Some ( c) if c. is_ascii_digit ( ) => {
510- // Number starting with digit - parse directly in Rust
511- if let Some ( ( result, len) ) = self . parse_number ( remaining, vm) {
512- return Ok ( ( result?, idx + len) ) ;
513- }
570+ 'N' if starts_with_chars ( chars. clone ( ) , "NaN" ) => {
571+ // NaN
572+ let result = self . parse_constant . call ( ( "NaN" , ) , vm) ?;
573+ Ok ( ( result, idx + 3 ) )
514574 }
515- Some ( 'N' ) => {
516- // NaN - parse directly in Rust
517- if remaining. starts_with ( "NaN" ) {
518- let result = self . parse_constant . call ( ( "NaN" , ) , vm) ?;
519- return Ok ( ( result, idx + 3 ) ) ;
520- }
575+ 'I' if starts_with_chars ( chars. clone ( ) , "Infinity" ) => {
576+ // Infinity
577+ let result = self . parse_constant . call ( ( "Infinity" , ) , vm) ?;
578+ Ok ( ( result, idx + 8 ) )
521579 }
522- Some ( 'I' ) => {
523- // Infinity - parse directly in Rust
524- if remaining. starts_with ( "Infinity" ) {
525- let result = self . parse_constant . call ( ( "Infinity" , ) , vm) ?;
526- return Ok ( ( result, idx + 8 ) ) ;
527- }
528- }
529- Some ( '-' ) => {
580+ '-' => {
530581 // -Infinity or negative number
531- if remaining . starts_with ( "-Infinity" ) {
582+ if starts_with_chars ( chars . clone ( ) , "-Infinity" ) {
532583 let result = self . parse_constant . call ( ( "-Infinity" , ) , vm) ?;
533584 return Ok ( ( result, idx + 9 ) ) ;
534585 }
535- // Try parsing as negative number
536- if let Some ( ( result, len) ) = self . parse_number ( remaining , vm) {
586+ // Negative number - collect number characters
587+ if let Some ( ( result, len) ) = self . parse_number_from_chars ( chars , vm) {
537588 return Ok ( ( result?, idx + len) ) ;
538589 }
590+ Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
539591 }
540- _ => {
541- // fall through to call scan_once
542- }
543- }
544-
545- // Fall back to scan_once for other value types
546- let result = scan_once. call ( ( pystr. clone ( ) , idx as isize ) , vm) ;
547-
548- match result {
549- Ok ( tuple) => {
550- use crate :: vm:: builtins:: PyTupleRef ;
551- let tuple: PyTupleRef = tuple. try_into_value ( vm) ?;
552- if tuple. len ( ) != 2 {
553- return Err ( vm. new_value_error ( "scan_once must return 2-tuple" ) ) ;
592+ c if c. is_ascii_digit ( ) => {
593+ // Positive number
594+ if let Some ( ( result, len) ) = self . parse_number_from_chars ( chars, vm) {
595+ return Ok ( ( result?, idx + len) ) ;
554596 }
555- let value = tuple. as_slice ( ) [ 0 ] . clone ( ) ;
556- let end_idx: isize = tuple. as_slice ( ) [ 1 ] . try_to_value ( vm) ?;
557- Ok ( ( value, end_idx as usize ) )
558- }
559- Err ( err) if err. fast_isinstance ( vm. ctx . exceptions . stop_iteration ) => {
560597 Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
561598 }
562- Err ( err) => Err ( err) ,
599+ _ => {
600+ // Fall back to scan_once for unrecognized input
601+ let result = scan_once. call ( ( pystr. clone ( ) , idx as isize ) , vm) ;
602+
603+ match result {
604+ Ok ( tuple) => {
605+ use crate :: vm:: builtins:: PyTupleRef ;
606+ let tuple: PyTupleRef = tuple. try_into_value ( vm) ?;
607+ if tuple. len ( ) != 2 {
608+ return Err ( vm. new_value_error ( "scan_once must return 2-tuple" ) ) ;
609+ }
610+ let value = tuple. as_slice ( ) [ 0 ] . clone ( ) ;
611+ let end_idx: isize = tuple. as_slice ( ) [ 1 ] . try_to_value ( vm) ?;
612+ Ok ( ( value, end_idx as usize ) )
613+ }
614+ Err ( err) if err. fast_isinstance ( vm. ctx . exceptions . stop_iteration ) => {
615+ Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
616+ }
617+ Err ( err) => Err ( err) ,
618+ }
619+ }
563620 }
564621 }
565622
0 commit comments