Skip to content

Commit e21a447

Browse files
committed
Fix misc codecs issues
1 parent d1a4812 commit e21a447

File tree

10 files changed

+349
-275
lines changed

10 files changed

+349
-275
lines changed

Lib/_pycodecs.py

Lines changed: 152 additions & 148 deletions
Large diffs are not rendered by default.

parser/src/lexer.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,10 +217,10 @@ where
217217
let mut saw_f = false;
218218
loop {
219219
// Detect r"", f"", b"" and u""
220-
if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('b') || self.chr0 == Some('B')) {
220+
if !(saw_b || saw_u || saw_f) && matches!(self.chr0, Some('b') | Some('B')) {
221221
saw_b = true;
222222
} else if !(saw_b || saw_r || saw_u || saw_f)
223-
&& (self.chr0 == Some('u') || self.chr0 == Some('U'))
223+
&& matches!(self.chr0, Some('u') | Some('U'))
224224
{
225225
saw_u = true;
226226
} else if !(saw_r || saw_u) && (self.chr0 == Some('r') || self.chr0 == Some('R')) {
@@ -1710,6 +1710,17 @@ mod tests {
17101710
},
17111711
Tok::Newline
17121712
]
1713+
);
1714+
let source = r"rb'\\'";
1715+
let tokens = lex_source(source);
1716+
assert_eq!(
1717+
tokens,
1718+
vec![
1719+
Tok::Bytes {
1720+
value: b"\\\\".to_vec()
1721+
},
1722+
Tok::Newline
1723+
]
17131724
)
17141725
}
17151726

vm/src/anystr.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,3 +421,15 @@ pub trait AnyStr<'s>: 's {
421421
.format(vm, values)
422422
}
423423
}
424+
425+
/// Picks the outer quote character for a repr and reports how many inner quotes must be escaped.
///
/// `num_squotes` and `num_dquotes` are the counts of single and double quote characters
/// found in the value being represented.
///
/// Returns a `(quote, num_escaped)` pair: `quote` is the character to wrap the repr in, and
/// `num_escaped` is the number of inner quotes that will need a backslash in front of them
/// (callers can add it to their precomputed output length).
426+
#[inline]
427+
pub fn choose_quotes_for_repr(num_squotes: usize, num_dquotes: usize) -> (char, usize) {
428+
// always use squote unless we have squotes but no dquotes
// (with a double quote as the outer quote the inner single quotes need no escaping;
// num_dquotes is 0 in that branch, so the returned escape count is 0 as well)
429+
let use_dquote = num_squotes > 0 && num_dquotes == 0;
430+
if use_dquote {
431+
('"', num_dquotes)
432+
} else {
433+
('\'', num_squotes)
434+
}
435+
}

vm/src/builtins/bytearray.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ impl PyByteArray {
121121

122122
#[pymethod(name = "__repr__")]
123123
fn repr(&self) -> String {
124-
format!("bytearray(b'{}')", self.inner().repr())
124+
self.inner().repr("bytearray(", ")")
125125
}
126126

127127
#[pymethod(name = "__len__")]

vm/src/builtins/bytes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ impl PyBytes {
111111

112112
#[pymethod(name = "__repr__")]
113113
pub(crate) fn repr(&self) -> String {
114-
format!("b'{}'", self.inner.repr())
114+
self.inner.repr("", "")
115115
}
116116

117117
#[pymethod(name = "__len__")]

vm/src/builtins/pystr.rs

Lines changed: 39 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ impl PyStr {
329329
ch if ch.is_ascii() => 1,
330330
ch if char_is_printable(ch) => {
331331
// max = std::cmp::max(ch, max);
332-
1
332+
ch.len_utf8()
333333
}
334334
ch if (ch as u32) < 0x100 => 4, // \xHH
335335
ch if (ch as u32) < 0x10000 => 6, // \uHHHH
@@ -341,59 +341,56 @@ impl PyStr {
341341
out_len += incr;
342342
}
343343

344-
let (quote, unchanged) = {
345-
let mut quote = '\'';
346-
let mut unchanged = out_len == in_len;
347-
if squote > 0 {
348-
unchanged = false;
349-
if dquote > 0 {
350-
// Both squote and dquote present. Use squote, and escape them
351-
out_len += squote;
352-
} else {
353-
quote = '"';
354-
}
355-
}
356-
(quote, unchanged)
357-
};
344+
let (quote, num_escaped_quotes) = anystr::choose_quotes_for_repr(squote, dquote);
345+
// we'll be adding backslashes in front of the existing inner quotes
346+
out_len += num_escaped_quotes;
358347

359-
out_len += 2; // quotes
348+
// if we don't need to escape anything we can just copy
349+
let unchanged = out_len == in_len;
350+
351+
// start and ending quotes
352+
out_len += 2;
360353

361354
let mut repr = String::with_capacity(out_len);
362355
repr.push(quote);
363356
if unchanged {
364357
repr.push_str(self.as_str());
365358
} else {
366359
for ch in self.value.chars() {
367-
if ch == quote || ch == '\\' {
368-
repr.push('\\');
369-
repr.push(ch);
370-
} else if ch == '\n' {
371-
repr.push_str("\\n")
372-
} else if ch == '\t' {
373-
repr.push_str("\\t");
374-
} else if ch == '\r' {
375-
repr.push_str("\\r");
376-
} else if ch < ' ' || ch as u32 == 0x7F {
377-
repr.push_str(&format!("\\x{:02x}", ch as u32));
378-
} else if ch.is_ascii() {
379-
repr.push(ch);
380-
} else if !char_is_printable(ch) {
381-
let code = ch as u32;
382-
let escaped = if code < 0xff {
383-
format!("\\x{:02x}", code)
384-
} else if code < 0xffff {
385-
format!("\\u{:04x}", code)
386-
} else {
387-
format!("\\U{:08x}", code)
388-
};
389-
repr.push_str(&escaped);
390-
} else {
391-
repr.push(ch)
360+
use std::fmt::Write;
361+
match ch {
362+
'\n' => repr.push_str("\\n"),
363+
'\t' => repr.push_str("\\t"),
364+
'\r' => repr.push_str("\\r"),
365+
// these 2 branches *would* be handled below, but we shouldn't have to do a
366+
// unicodedata lookup just for ascii characters
367+
'\x20'..='\x7e' => {
368+
// printable ascii range
369+
if ch == quote || ch == '\\' {
370+
repr.push('\\');
371+
}
372+
repr.push(ch);
373+
}
374+
ch if ch.is_ascii() => {
375+
write!(repr, "\\x{:02x}", ch as u8).unwrap();
376+
}
377+
ch if char_is_printable(ch) => {
378+
repr.push(ch);
379+
}
380+
'\0'..='\u{ff}' => {
381+
write!(repr, "\\x{:02x}", ch as u32).unwrap();
382+
}
383+
'\0'..='\u{ffff}' => {
384+
write!(repr, "\\u{:04x}", ch as u32).unwrap();
385+
}
386+
_ => {
387+
write!(repr, "\\U{:08x}", ch as u32).unwrap();
388+
}
392389
}
393390
}
394391
}
395-
396392
repr.push(quote);
393+
397394
Ok(repr)
398395
}
399396

vm/src/bytesinner.rs

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -230,17 +230,61 @@ impl ByteInnerTranslateOptions {
230230
pub type ByteInnerSplitOptions<'a> = anystr::SplitArgs<'a, PyBytesInner>;
231231

232232
impl PyBytesInner {
233-
pub fn repr(&self) -> String {
234-
let mut res = String::with_capacity(self.elements.len());
235-
for i in self.elements.iter() {
236-
match i {
237-
9 => res.push_str("\\t"),
238-
10 => res.push_str("\\n"),
239-
13 => res.push_str("\\r"),
240-
32..=126 => res.push(*(i) as char),
241-
_ => res.push_str(&format!("\\x{:02x}", i)),
233+
pub fn repr(&self, prefix: &str, suffix: &str) -> String {
234+
use std::fmt::Write;
235+
236+
let mut out_len = 0usize;
237+
let mut squote = 0;
238+
let mut dquote = 0;
239+
240+
for &ch in self.elements.iter() {
241+
let incr = match ch {
242+
b'\'' => {
243+
squote += 1;
244+
1
245+
}
246+
b'"' => {
247+
dquote += 1;
248+
1
249+
}
250+
b'\\' | b'\t' | b'\r' | b'\n' => 2,
251+
0x20..=0x7e => 1,
252+
_ => 4, // \xHH
253+
};
254+
// TODO: OverflowError
255+
out_len = out_len.checked_add(incr).unwrap();
256+
}
257+
258+
let (quote, num_escaped_quotes) = anystr::choose_quotes_for_repr(squote, dquote);
259+
// we'll be adding backslashes in front of the existing inner quotes
260+
out_len += num_escaped_quotes;
261+
262+
// 3 is for b prefix + outer quotes
263+
out_len += 3 + prefix.len() + suffix.len();
264+
265+
let mut res = String::with_capacity(out_len);
266+
res.push_str(prefix);
267+
res.push('b');
268+
res.push(quote);
269+
for &ch in self.elements.iter() {
270+
match ch {
271+
b'\t' => res.push_str("\\t"),
272+
b'\n' => res.push_str("\\n"),
273+
b'\r' => res.push_str("\\r"),
274+
// printable ascii range
275+
0x20..=0x7e => {
276+
let ch = ch as char;
277+
if ch == quote || ch == '\\' {
278+
res.push('\\');
279+
}
280+
res.push(ch);
281+
}
282+
_ => write!(res, "\\x{:02x}", ch).unwrap(),
242283
}
243284
}
285+
res.push(quote);
286+
res.push_str(suffix);
287+
244288
res
245289
}
246290

vm/src/codecs.rs

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::ops::Range;
44

55
use crate::builtins::{pybool, PyBytesRef, PyStr, PyStrRef, PyTuple, PyTupleRef};
66
use crate::common::lock::PyRwLock;
7+
use crate::exceptions::PyBaseExceptionRef;
78
use crate::VirtualMachine;
89
use crate::{IntoPyObject, PyContext, PyObjectRef, PyResult, PyValue, TryFromObject, TypeProtocol};
910

@@ -327,6 +328,23 @@ fn extract_unicode_error_range(err: &PyObjectRef, vm: &VirtualMachine) -> PyResu
327328
Ok(Range { start, end })
328329
}
329330

331+
/// Returns `true` when `err` is an instance of the `unicode_decode_error` exception class
/// held in `vm.ctx.exceptions`.
#[inline]
332+
fn is_decode_err(err: &PyObjectRef, vm: &VirtualMachine) -> bool {
333+
err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
334+
}
335+
/// Returns `true` when `err` is an instance of either the `unicode_encode_error` or the
/// `unicode_translate_error` exception class held in `vm.ctx.exceptions`.
///
/// The two are grouped under one predicate so that a handler can accept both with a
/// single check.
#[inline]
336+
fn is_encode_ish_err(err: &PyObjectRef, vm: &VirtualMachine) -> bool {
337+
err.isinstance(&vm.ctx.exceptions.unicode_encode_error)
338+
|| err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
339+
}
340+
341+
/// Builds the exception reported when an error handler is given an object it cannot handle.
///
/// The exception is created with `vm.new_type_error` and its message names the class of
/// `err`. It is returned rather than raised, so the caller decides how to propagate it
/// (for example by wrapping it in `Err`).
fn bad_err_type(err: PyObjectRef, vm: &VirtualMachine) -> PyBaseExceptionRef {
342+
vm.new_type_error(format!(
343+
"don't know how to handle {} in error callback",
344+
err.class().name
345+
))
346+
}
347+
330348
fn strict_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult {
331349
let err = err
332350
.downcast()
@@ -335,45 +353,35 @@ fn strict_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult {
335353
}
336354

337355
fn ignore_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(PyObjectRef, usize)> {
338-
if err.isinstance(&vm.ctx.exceptions.unicode_encode_error)
339-
|| err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
340-
|| err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
341-
{
356+
if is_encode_ish_err(&err, vm) || is_decode_err(&err, vm) {
342357
let range = extract_unicode_error_range(&err, vm)?;
343358
Ok((vm.ctx.new_str(""), range.end))
344359
} else {
345-
Err(vm.new_type_error(format!(
346-
"don't know how to handle {} in error callback",
347-
err.class().name
348-
)))
360+
Err(bad_err_type(err, vm))
349361
}
350362
}
351363

352364
fn replace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
365+
// char::REPLACEMENT_CHARACTER as a str
366+
let replacement_char = "\u{FFFD}";
353367
let replace = if err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
354368
"?"
355-
} else if err.isinstance(&vm.ctx.exceptions.unicode_decode_error)
356-
|| err.isinstance(&vm.ctx.exceptions.unicode_translate_error)
357-
{
358-
// char::REPLACEMENT_CHARACTER
359-
"\u{FFFD}"
369+
} else if err.isinstance(&vm.ctx.exceptions.unicode_decode_error) {
370+
let range = extract_unicode_error_range(&err, vm)?;
371+
return Ok((replacement_char.to_owned(), range.end));
372+
} else if err.isinstance(&vm.ctx.exceptions.unicode_translate_error) {
373+
replacement_char
360374
} else {
361-
return Err(vm.new_type_error(format!(
362-
"don't know how to handle {} in error callback",
363-
err.class().name
364-
)));
375+
return Err(bad_err_type(err, vm));
365376
};
366377
let range = extract_unicode_error_range(&err, vm)?;
367378
let replace = replace.repeat(range.end - range.start);
368379
Ok((replace, range.end))
369380
}
370381

371382
fn xmlcharrefreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
372-
if !err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
373-
return Err(vm.new_type_error(format!(
374-
"don't know how to handle {} in error callback",
375-
err.class().name
376-
)));
383+
if !is_encode_ish_err(&err, vm) {
384+
return Err(bad_err_type(err, vm));
377385
}
378386
let range = extract_unicode_error_range(&err, vm)?;
379387
let s = PyStrRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;
@@ -389,11 +397,17 @@ fn xmlcharrefreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(
389397
}
390398

391399
fn backslashreplace_errors(err: PyObjectRef, vm: &VirtualMachine) -> PyResult<(String, usize)> {
392-
if !err.isinstance(&vm.ctx.exceptions.unicode_encode_error) {
393-
return Err(vm.new_type_error(format!(
394-
"don't know how to handle {} in error callback",
395-
err.class().name
396-
)));
400+
if is_decode_err(&err, vm) {
401+
let range = extract_unicode_error_range(&err, vm)?;
402+
let b = PyBytesRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;
403+
let mut replace = String::with_capacity(4 * range.len());
404+
for &c in &b[range.clone()] {
405+
use std::fmt::Write;
406+
write!(replace, "\\x{:02x}", c).unwrap();
407+
}
408+
return Ok((replace, range.end));
409+
} else if !is_encode_ish_err(&err, vm) {
410+
return Err(bad_err_type(err, vm));
397411
}
398412
let range = extract_unicode_error_range(&err, vm)?;
399413
let s = PyStrRef::try_from_object(vm, vm.get_attribute(err, "object")?)?;

0 commit comments

Comments
 (0)