From 0f889ce92b51cd93f056652fa9d625f16a668ecb Mon Sep 17 00:00:00 2001 From: Padraic Fanning Date: Thu, 23 Sep 2021 21:34:33 -0400 Subject: [PATCH 1/4] Implement latin_1 in Rust This implementation is patterned off of the ascii codec. --- common/src/encodings.rs | 76 +++++++++++++++++++++++++++++++++++++++++ vm/src/stdlib/codecs.rs | 21 +++++++----- 2 files changed, 89 insertions(+), 8 deletions(-) diff --git a/common/src/encodings.rs b/common/src/encodings.rs index 7a6cd16069e..ca77c529f63 100644 --- a/common/src/encodings.rs +++ b/common/src/encodings.rs @@ -172,6 +172,82 @@ pub mod utf8 { } } +pub mod latin_1 { + use super::*; + + pub const ENCODING_NAME: &str = "latin-1"; + + const ERR_REASON: &str = "ordinal not in range(256)"; + + #[inline] + pub fn encode(s: &str, errors: &E) -> Result, E::Error> { + let full_data = s; + let mut data = s; + let mut char_data_index = 0; + let mut out = Vec::::new(); + loop { + match data + .char_indices() + .enumerate() + .find(|(_, (_, c))| (*c as u32) > 255) + { + None => { + out.extend_from_slice(data.as_bytes()); + break; + } + Some((char_i, (byte_i, _))) => { + out.extend_from_slice(&data.as_bytes()[..byte_i]); + let char_start = char_data_index + char_i; + // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char + let non_latin_1_run_length = data[byte_i..] + .chars() + .take_while(|c| (*c as u32) > 255) + .count(); + let char_range = char_start..char_start + non_latin_1_run_length; + let (replace, char_restart) = + errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?; + match replace { + EncodeReplace::Str(s) => { + if s.as_ref().chars().any(|c| (c as u32) > 255) { + return Err( + errors.error_encoding(full_data, char_range, ERR_REASON) + ); + } + out.extend_from_slice(s.as_ref().as_bytes()); + } + EncodeReplace::Bytes(b) => { + out.extend_from_slice(b.as_ref()); + } + } + data = crate::str::try_get_chars(full_data, char_restart..) + .ok_or_else(|| errors.error_oob_restart(char_restart))?; + char_data_index = char_restart; + continue; + } + } + } + Ok(out) + } + + pub fn decode(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> { + decode_utf8_compatible( + data, + errors, + |v| { + std::str::from_utf8(v).map_err(|e| { + // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()] + // is valid ascii & therefore valid utf8 + unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) } + }) + }, + |_rest, err_len| HandleResult::Error { + err_len, + reason: ERR_REASON, + }, + ) + } +} + pub mod ascii { use super::*; use ::ascii::AsciiStr; diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs index ac63e4e1d4b..ede48947f33 100644 --- a/vm/src/stdlib/codecs.rs +++ b/vm/src/stdlib/codecs.rs @@ -315,6 +315,19 @@ mod _codecs { do_codec!(utf8::decode, args, vm) } + #[pyfunction] + fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult { + if args.s.as_ref().chars().all(|c| (c as u32) < 256) { + return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len())); + } + do_codec!(latin_1::encode, args, vm) + } + + #[pyfunction] + fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult { + do_codec!(latin_1::decode, args, vm) + } + #[pyfunction] fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult { if args.s.is_ascii() { @@ -353,14 +366,6 @@ mod _codecs { }}; } - #[pyfunction] - fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { - delegate_pycodecs!(latin_1_encode, args, vm) - } - #[pyfunction] - fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { - delegate_pycodecs!(latin_1_decode, args, vm) - } #[pyfunction] fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { delegate_pycodecs!(mbcs_encode, args, vm) From 7d322b728c912ebba0a65f94f1d180beca53e66b Mon Sep 17 00:00:00 2001 From: Padraic Fanning Date: Thu, 23 Sep 2021 23:52:15 -0400 Subject: [PATCH 2/4] Simplify latin_1 decode function --- common/src/encodings.rs | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/common/src/encodings.rs b/common/src/encodings.rs index ca77c529f63..0a7758e1e0f 100644 --- a/common/src/encodings.rs +++ b/common/src/encodings.rs @@ -229,22 +229,10 @@ pub mod latin_1 { Ok(out) } - pub fn decode(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> { - decode_utf8_compatible( - data, - errors, - |v| { - std::str::from_utf8(v).map_err(|e| { - // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()] - // is valid ascii & therefore valid utf8 - unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) } - }) - }, - |_rest, err_len| HandleResult::Error { - err_len, - reason: ERR_REASON, - }, - ) + pub fn decode(data: &[u8], _errors: &E) -> Result<(String, usize), E::Error> { + let out: String = data.iter().map(|c| *c as char).collect(); + let out_len = out.len(); + Ok((out, out_len)) } } From 4375307a17b68b6f25f02329a95e1526753228a5 Mon Sep 17 00:00:00 2001 From: Padraic Fanning Date: Fri, 24 Sep 2021 18:56:45 -0400 Subject: [PATCH 3/4] Streamline latin_1_encode fast path --- vm/src/stdlib/codecs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs index ede48947f33..355661852ab 100644 --- a/vm/src/stdlib/codecs.rs +++ b/vm/src/stdlib/codecs.rs @@ -317,7 +317,7 @@ mod _codecs { #[pyfunction] fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult { - if args.s.as_ref().chars().all(|c| (c as u32) < 256) { + if args.s.is_ascii() { return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len())); } do_codec!(latin_1::encode, args, vm) From a100178fb408133588416e5b5e92c1d4e23f10d2 Mon Sep 17 00:00:00 2001 From: Padraic Fanning Date: Fri, 24 Sep 2021 19:44:59 -0400 Subject: [PATCH 4/4] Account for `0x80..=0xff` range in memory --- common/src/encodings.rs | 57 ++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/common/src/encodings.rs b/common/src/encodings.rs index 0a7758e1e0f..bbbc5e8202b 100644 --- a/common/src/encodings.rs +++ b/common/src/encodings.rs @@ -189,39 +189,50 @@ pub mod latin_1 { match data .char_indices() .enumerate() - .find(|(_, (_, c))| (*c as u32) > 255) + .find(|(_, (_, c))| !c.is_ascii()) { None => { out.extend_from_slice(data.as_bytes()); break; } - Some((char_i, (byte_i, _))) => { + Some((char_i, (byte_i, ch))) => { out.extend_from_slice(&data.as_bytes()[..byte_i]); let char_start = char_data_index + char_i; - // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char - let non_latin_1_run_length = data[byte_i..] - .chars() - .take_while(|c| (*c as u32) > 255) - .count(); - let char_range = char_start..char_start + non_latin_1_run_length; - let (replace, char_restart) = - errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?; - match replace { - EncodeReplace::Str(s) => { - if s.as_ref().chars().any(|c| (c as u32) > 255) { - return Err( - errors.error_encoding(full_data, char_range, ERR_REASON) - ); + if (ch as u32) <= 255 { + out.push(ch as u8); + let char_restart = char_start + 1; + data = crate::str::try_get_chars(full_data, char_restart..) + .ok_or_else(|| errors.error_oob_restart(char_restart))?; + char_data_index = char_restart; + } else { + // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char + let non_latin_1_run_length = data[byte_i..] + .chars() + .take_while(|c| (*c as u32) > 255) + .count(); + let char_range = char_start..char_start + non_latin_1_run_length; + let (replace, char_restart) = errors.handle_encode_error( + full_data, + char_range.clone(), + ERR_REASON, + )?; + match replace { + EncodeReplace::Str(s) => { + if s.as_ref().chars().any(|c| (c as u32) > 255) { + return Err( + errors.error_encoding(full_data, char_range, ERR_REASON) + ); + } + out.extend_from_slice(s.as_ref().as_bytes()); + } + EncodeReplace::Bytes(b) => { + out.extend_from_slice(b.as_ref()); } - out.extend_from_slice(s.as_ref().as_bytes()); - } - EncodeReplace::Bytes(b) => { - out.extend_from_slice(b.as_ref()); } + data = crate::str::try_get_chars(full_data, char_restart..) + .ok_or_else(|| errors.error_oob_restart(char_restart))?; + char_data_index = char_restart; } - data = crate::str::try_get_chars(full_data, char_restart..) - .ok_or_else(|| errors.error_oob_restart(char_restart))?; - char_data_index = char_restart; continue; } }