Fix misc codecs issues

coolreader18 · coolreader18 · commit e21a447400fd · 2021-05-21T01:32:55.000-05:00
diff --git a/Lib/_pycodecs.py b/Lib/_pycodecs.py
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
@@ -217,10 +217,10 @@ where
         let mut saw_f = false;
         loop {
             // Detect r"", f"", b"" and u""
-            if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('b') || self.chr0 == Some('B')) {
+            if !(saw_b || saw_u || saw_f) && matches!(self.chr0, Some('b') | Some('B')) {
                 saw_b = true;
             } else if !(saw_b || saw_r || saw_u || saw_f)
-                && (self.chr0 == Some('u') || self.chr0 == Some('U'))
+                && matches!(self.chr0, Some('u') | Some('U'))
             {
                 saw_u = true;
             } else if !(saw_r || saw_u) && (self.chr0 == Some('r') || self.chr0 == Some('R')) {
@@ -1710,6 +1710,17 @@ mod tests {
                 },
                 Tok::Newline
             ]
+        );
+        let source = r"rb'\\'";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Bytes {
+                    value: b"\\\\".to_vec()
+                },
+                Tok::Newline
+            ]
         )
     }
 
diff --git a/vm/src/anystr.rs b/vm/src/anystr.rs
@@ -421,3 +421,15 @@ pub trait AnyStr<'s>: 's {
             .format(vm, values)
     }
 }
+
+/// Picks the outer quote char for a repr; also returns how many inner quotes must be escaped.
+#[inline]
+pub fn choose_quotes_for_repr(num_squotes: usize, num_dquotes: usize) -> (char, usize) {
+    // Default to single quotes; use double quotes only if there are squotes and no dquotes.
+    let use_dquote = num_squotes > 0 && num_dquotes == 0;
+    if use_dquote {
+        ('"', num_dquotes)
+    } else {
+        ('\'', num_squotes)
+    }
+}
diff --git a/vm/src/builtins/bytearray.rs b/vm/src/builtins/bytearray.rs
@@ -121,7 +121,7 @@ impl PyByteArray {
 
     #[pymethod(name = "__repr__")]
     fn repr(&self) -> String {
-        format!("bytearray(b'{}')", self.inner().repr())
+        self.inner().repr("bytearray(", ")")
     }
 
     #[pymethod(name = "__len__")]
diff --git a/vm/src/builtins/bytes.rs b/vm/src/builtins/bytes.rs
@@ -111,7 +111,7 @@ impl PyBytes {
 
     #[pymethod(name = "__repr__")]
     pub(crate) fn repr(&self) -> String {
-        format!("b'{}'", self.inner.repr())
+        self.inner.repr("", "")
     }
 
     #[pymethod(name = "__len__")]
diff --git a/vm/src/builtins/pystr.rs b/vm/src/builtins/pystr.rs
@@ -329,7 +329,7 @@ impl PyStr {
                 ch if ch.is_ascii() => 1,
                 ch if char_is_printable(ch) => {
                     // max = std::cmp::max(ch, max);
-                    1
+                    ch.len_utf8()
                 }
                 ch if (ch as u32) < 0x100 => 4,   // \xHH
                 ch if (ch as u32) < 0x10000 => 6, // \uHHHH
@@ -341,59 +341,56 @@ impl PyStr {
             out_len += incr;
         }
 
-        let (quote, unchanged) = {
-            let mut quote = '\'';
-            let mut unchanged = out_len == in_len;
-            if squote > 0 {
-                unchanged = false;
-                if dquote > 0 {
-                    // Both squote and dquote present. Use squote, and escape them
-                    out_len += squote;
-                } else {
-                    quote = '"';
-                }
-            }
-            (quote, unchanged)
-        };
+        let (quote, num_escaped_quotes) = anystr::choose_quotes_for_repr(squote, dquote);
+        // we'll be adding backslashes in front of the existing inner quotes
+        out_len += num_escaped_quotes;
 
-        out_len += 2; // quotes
+        // if we don't need to escape anything we can just copy
+        let unchanged = out_len == in_len;
+
+        // start and ending quotes
+        out_len += 2;
 
         let mut repr = String::with_capacity(out_len);
         repr.push(quote);
         if unchanged {
             repr.push_str(self.as_str());
         } else {
             for ch in self.value.chars() {
-                if ch == quote || ch == '\\' {
-                    repr.push('\\');
-                    repr.push(ch);
-                } else if ch == '\n' {
-                    repr.push_str("\\n")
-                } else if ch == '\t' {
-                    repr.push_str("\\t");
-                } else if ch == '\r' {
-                    repr.push_str("\\r");
-                } else if ch < ' ' || ch as u32 == 0x7F {
-                    repr.push_str(&format!("\\x{:02x}", ch as u32));
-                } else if ch.is_ascii() {
-                    repr.push(ch);
-                } else if !char_is_printable(ch) {
-                    let code = ch as u32;
-                    let escaped = if code < 0xff {
-                        format!("\\x{:02x}", code)
-                    } else if code < 0xffff {
-                        format!("\\u{:04x}", code)
-                    } else {
-                        format!("\\U{:08x}", code)
-                    };
-                    repr.push_str(&escaped);
-                } else {
-                    repr.push(ch)
+                use std::fmt::Write;
+                match ch {
+                    '\n' => repr.push_str("\\n"),
+                    '\t' => repr.push_str("\\t"),
+                    '\r' => repr.push_str("\\r"),
+                    // these 2 branches *would* be handled below, but we shouldn't have to do a
+                    // unicodedata lookup just for ascii characters
+                    '\x20'..='\x7e' => {
+                        // printable ascii range
+                        if ch == quote || ch == '\\' {
+                            repr.push('\\');
+                        }
+                        repr.push(ch);
+                    }
+                    ch if ch.is_ascii() => {
+                        write!(repr, "\\x{:02x}", ch as u8).unwrap();
+                    }
+                    ch if char_is_printable(ch) => {
+                        repr.push(ch);
+                    }
+                    '\0'..='\u{ff}' => {
+                        write!(repr, "\\x{:02x}", ch as u32).unwrap();
+                    }
+                    '\0'..='\u{ffff}' => {
+                        write!(repr, "\\u{:04x}", ch as u32).unwrap();
+                    }
+                    _ => {
+                        write!(repr, "\\U{:08x}", ch as u32).unwrap();
+                    }
                 }
             }
         }
-
         repr.push(quote);
+
         Ok(repr)
     }
 
diff --git a/vm/src/bytesinner.rs b/vm/src/bytesinner.rs
@@ -230,17 +230,61 @@ impl ByteInnerTranslateOptions {
 pub type ByteInnerSplitOptions<'a> = anystr::SplitArgs<'a, PyBytesInner>;
 
 impl PyBytesInner {
-    pub fn repr(&self) -> String {
-        let mut res = String::with_capacity(self.elements.len());
-        for i in self.elements.iter() {
-            match i {
-                9 => res.push_str("\\t"),
-                10 => res.push_str("\\n"),
-                13 => res.push_str("\\r"),
-                32..=126 => res.push(*(i) as char),
-                _ => res.push_str(&format!("\\x{:02x}", i)),
+    pub fn repr(&self, prefix: &str, suffix: &str) -> String {
+        use std::fmt::Write;
+
+        let mut out_len = 0usize;
+        let mut squote = 0;
+        let mut dquote = 0;
+
+        for &ch in self.elements.iter() {
+            let incr = match ch {
+                b'\'' => {
+                    squote += 1;
+                    1
+                }
+                b'"' => {
+                    dquote += 1;
+                    1
+                }
+                b'\\' | b'\t' | b'\r' | b'\n' => 2,
+                0x20..=0x7e => 1,
+                _ => 4, // \xHH
+            };
+            // TODO: OverflowError
+            out_len = out_len.checked_add(incr).unwrap();
+        }
+
+        let (quote, num_escaped_quotes) = anystr::choose_quotes_for_repr(squote, dquote);
+        // we'll be adding backslashes in front of the existing inner quotes
+        out_len += num_escaped_quotes;
+
+        // 3 is for b prefix + outer quotes
+        out_len += 3 + prefix.len() + suffix.len();
+
+        let mut res = String::with_capacity(out_len);
+        res.push_str(prefix);
+        res.push('b');
+        res.push(quote);
+        for &ch in self.elements.iter() {
+            match ch {
+                b'\t' => res.push_str("\\t"),
+                b'\n' => res.push_str("\\n"),
+                b'\r' => res.push_str("\\r"),
+                // printable ascii range
+                0x20..=0x7e => {
+                    let ch = ch as char;
+                    if ch == quote || ch == '\\' {
+                        res.push('\\');
+                    }
+                    res.push(ch);
+                }
+                _ => write!(res, "\\x{:02x}", ch).unwrap(),
             }
         }
+        res.push(quote);
+        res.push_str(suffix);
+
         res
     }
 
diff --git a/vm/src/codecs.rs b/vm/src/codecs.rs
@@ -4,6 +4,7 @@ use std::ops::Range;
 
 use crate::builtins::{pybool, PyBytesRef, PyStr, PyStrRef, PyTuple, PyTupleRef};
 use crate::common::lock::PyRwLock;
+use crate::exceptions::PyBaseExceptionRef;
 use crate::VirtualMachine;
 use crate::{IntoPyObject, PyContext, PyObjectRef, PyResult, PyValue, TryFromObject, TypeProtocol};
 
@@ -327,6 +328,23 @@ fn extract_unicode_error_range(err: &PyObjectRef, vm: &VirtualMachine) -> PyResu
     Ok(Range { start, end })
 }
 
+#[inline]
+fn is_decode_err(err: &PyObjectRef, vm: &VirtualMachine) -> bool {
+    err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
+}
+#[inline]
+fn is_encode_ish_err(err: &PyObjectRef, vm: &VirtualMachine) -> bool {
+    err.isinstance(&vm.ctx.exceptions.unicode_encode_error)
+        || err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
+}
+
+fn bad_err_type(err: PyObjectRef, vm: &VirtualMachine) -> PyBaseExceptionRef {
+    vm.new_type_error(format!(
+        "don't know how to handle {} in error callback",
+        err.class().name
+    ))
+}
+
 fn strict_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult {
     let err = err
         .downcast()
@@ -335,45 +353,35 @@ fn strict_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult {
 }
 
 fn ignore_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(PyObjectRef, usize)> {
-    if err.isinstance(&vm.ctx.exceptions.unicode_encode_error)
-        || err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
-        || err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
-    {
+    if is_encode_ish_err(&err, vm) || is_decode_err(&err, vm) {
         let range = extract_unicode_error_range(&err, vm)?;
         Ok((vm.ctx.new_str(""), range.end))
     } else {
-        Err(vm.new_type_error(format!(
-            "don't know how to handle {} in error callback",
-            err.class().name
-        )))
+        Err(bad_err_type(err, vm))
     }
 }
 
 fn replace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
+    // char::REPLACEMENT_CHARACTER as a str
+    let replacement_char = "\u{FFFD}";
     let replace = if err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
         "?"
-    } else if err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
-        || err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
-    {
-        // char::REPLACEMENT_CHARACTER
-        "\u{FFFD}"
+    } else if err.isinstance(&vm.ctx.exceptions.unicode_decode_error) {
+        let range = extract_unicode_error_range(&err, vm)?;
+        return Ok((replacement_char.to_owned(), range.end));
+    } else if err.isinstance(&vm.ctx.exceptions.unicode_translate_error) {
+        replacement_char
     } else {
-        return Err(vm.new_type_error(format!(
-            "don't know how to handle {} in error callback",
-            err.class().name
-        )));
+        return Err(bad_err_type(err, vm));
     };
     let range = extract_unicode_error_range(&err, vm)?;
     let replace = replace.repeat(range.end - range.start);
     Ok((replace, range.end))
 }
 
 fn xmlcharrefreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
-    if !err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
-        return Err(vm.new_type_error(format!(
-            "don't know how to handle {} in error callback",
-            err.class().name
-        )));
+    if !is_encode_ish_err(&err, vm) {
+        return Err(bad_err_type(err, vm));
     }
     let range = extract_unicode_error_range(&err, vm)?;
     let s = PyStrRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;
@@ -389,11 +397,17 @@ fn xmlcharrefreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(
 }
 
 fn backslashreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
-    if !err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
-        return Err(vm.new_type_error(format!(
-            "don't know how to handle {} in error callback",
-            err.class().name
-        )));
+    if is_decode_err(&err, vm) {
+        let range = extract_unicode_error_range(&err, vm)?;
+        let b = PyBytesRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;
+        let mut replace = String::with_capacity(4 * range.len());
+        for &c in &b[range.clone()] {
+            use std::fmt::Write;
+            write!(replace, "\\x{:02x}", c).unwrap();
+        }
+        return Ok((replace, range.end));
+    } else if !is_encode_ish_err(&err, vm) {
+        return Err(bad_err_type(err, vm));
     }
     let range = extract_unicode_error_range(&err, vm)?;
     let s = PyStrRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;
diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs
diff --git a/vm/src/stdlib/pystruct.rs b/vm/src/stdlib/pystruct.rs

Original file line number	Diff line number	Diff line change
`@@ -121,7 +121,7 @@ impl PyByteArray {`
`121`	`121`
`122`	`122`	`#[pymethod(name = "__repr__")]`
`123`	`123`	`fn repr(&self) -> String {`
`124`		`- format!("bytearray(b'{}')", self.inner().repr())`
	`124`	`+ self.inner().repr("bytearray(", ")")`
`125`	`125`	`}`
`126`	`126`
`127`	`127`	`#[pymethod(name = "__len__")]`
Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,7 @@ impl PyBytes {`
`111`	`111`
`112`	`112`	`#[pymethod(name = "__repr__")]`
`113`	`113`	`pub(crate) fn repr(&self) -> String {`
`114`		`- format!("b'{}'", self.inner.repr())`
	`114`	`+ self.inner.repr("", "")`
`115`	`115`	`}`
`116`	`116`
`117`	`117`	`#[pymethod(name = "__len__")]`