From 0f889ce92b51cd93f056652fa9d625f16a668ecb Mon Sep 17 00:00:00 2001
From: Padraic Fanning <fanninpm@miamioh.edu>
Date: Thu, 23 Sep 2021 21:34:33 -0400
Subject: [PATCH 1/4] Implement latin_1 in Rust

This implementation is patterned after the ascii codec.
---
 common/src/encodings.rs | 76 +++++++++++++++++++++++++++++++++++++++++
 vm/src/stdlib/codecs.rs | 21 +++++++-----
 2 files changed, 89 insertions(+), 8 deletions(-)
diff --git a/common/src/encodings.rs b/common/src/encodings.rs
index 7a6cd16069e..ca77c529f63 100644
--- a/common/src/encodings.rs
+++ b/common/src/encodings.rs
@@ -172,6 +172,82 @@ pub mod utf8 {
     }
 }
 
+pub mod latin_1 {
+    use super::*;
+
+    pub const ENCODING_NAME: &str = "latin-1";
+
+    const ERR_REASON: &str = "ordinal not in range(256)";
+
+    #[inline]
+    pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
+        let full_data = s;
+        let mut data = s;
+        let mut char_data_index = 0;
+        let mut out = Vec::<u8>::new();
+        loop {
+            match data
+                .char_indices()
+                .enumerate()
+                .find(|(_, (_, c))| (*c as u32) > 255)
+            {
+                None => {
+                    out.extend_from_slice(data.as_bytes());
+                    break;
+                }
+                Some((char_i, (byte_i, _))) => {
+                    out.extend_from_slice(&data.as_bytes()[..byte_i]);
+                    let char_start = char_data_index + char_i;
+                    // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
+                    let non_latin_1_run_length = data[byte_i..]
+                        .chars()
+                        .take_while(|c| (*c as u32) > 255)
+                        .count();
+                    let char_range = char_start..char_start + non_latin_1_run_length;
+                    let (replace, char_restart) =
+                        errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
+                    match replace {
+                        EncodeReplace::Str(s) => {
+                            if s.as_ref().chars().any(|c| (c as u32) > 255) {
+                                return Err(
+                                    errors.error_encoding(full_data, char_range, ERR_REASON)
+                                );
+                            }
+                            out.extend_from_slice(s.as_ref().as_bytes());
+                        }
+                        EncodeReplace::Bytes(b) => {
+                            out.extend_from_slice(b.as_ref());
+                        }
+                    }
+                    data = crate::str::try_get_chars(full_data, char_restart..)
+                        .ok_or_else(|| errors.error_oob_restart(char_restart))?;
+                    char_data_index = char_restart;
+                    continue;
+                }
+            }
+        }
+        Ok(out)
+    }
+
+    pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
+        decode_utf8_compatible(
+            data,
+            errors,
+            |v| {
+                std::str::from_utf8(v).map_err(|e| {
+                    // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
+                    //         is valid ascii & therefore valid utf8
+                    unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
+                })
+            },
+            |_rest, err_len| HandleResult::Error {
+                err_len,
+                reason: ERR_REASON,
+            },
+        )
+    }
+}
+
 pub mod ascii {
     use super::*;
     use ::ascii::AsciiStr;
diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs
index ac63e4e1d4b..ede48947f33 100644
--- a/vm/src/stdlib/codecs.rs
+++ b/vm/src/stdlib/codecs.rs
@@ -315,6 +315,19 @@ mod _codecs {
         do_codec!(utf8::decode, args, vm)
     }
 
+    #[pyfunction]
+    fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
+        if args.s.as_ref().chars().all(|c| (c as u32) < 256) {
+            return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len()));
+        }
+        do_codec!(latin_1::encode, args, vm)
+    }
+
+    #[pyfunction]
+    fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult {
+        do_codec!(latin_1::decode, args, vm)
+    }
+
     #[pyfunction]
     fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
         if args.s.is_ascii() {
@@ -353,14 +366,6 @@ mod _codecs {
         }};
     }
 
-    #[pyfunction]
-    fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
-        delegate_pycodecs!(latin_1_encode, args, vm)
-    }
-    #[pyfunction]
-    fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
-        delegate_pycodecs!(latin_1_decode, args, vm)
-    }
     #[pyfunction]
     fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
         delegate_pycodecs!(mbcs_encode, args, vm)

From 7d322b728c912ebba0a65f94f1d180beca53e66b Mon Sep 17 00:00:00 2001
From: Padraic Fanning <fanninpm@miamioh.edu>
Date: Thu, 23 Sep 2021 23:52:15 -0400
Subject: [PATCH 2/4] Simplify latin_1 decode function

---
 common/src/encodings.rs | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/common/src/encodings.rs b/common/src/encodings.rs
index ca77c529f63..0a7758e1e0f 100644
--- a/common/src/encodings.rs
+++ b/common/src/encodings.rs
@@ -229,22 +229,10 @@ pub mod latin_1 {
         Ok(out)
     }
 
-    pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
-        decode_utf8_compatible(
-            data,
-            errors,
-            |v| {
-                std::str::from_utf8(v).map_err(|e| {
-                    // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
-                    //         is valid ascii & therefore valid utf8
-                    unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
-                })
-            },
-            |_rest, err_len| HandleResult::Error {
-                err_len,
-                reason: ERR_REASON,
-            },
-        )
+    pub fn decode<E: ErrorHandler>(data: &[u8], _errors: &E) -> Result<(String, usize), E::Error> {
+        let out: String = data.iter().map(|c| *c as char).collect();
+        // Return bytes consumed (codec contract), not the UTF-8 length of `out`.
+        Ok((out, data.len()))
+    }
 }
 

From 4375307a17b68b6f25f02329a95e1526753228a5 Mon Sep 17 00:00:00 2001
From: Padraic Fanning <fanninpm@miamioh.edu>
Date: Fri, 24 Sep 2021 18:56:45 -0400
Subject: [PATCH 3/4] Streamline latin_1_encode fast path

---
 vm/src/stdlib/codecs.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs
index ede48947f33..355661852ab 100644
--- a/vm/src/stdlib/codecs.rs
+++ b/vm/src/stdlib/codecs.rs
@@ -317,7 +317,7 @@ mod _codecs {
 
     #[pyfunction]
     fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
-        if args.s.as_ref().chars().all(|c| (c as u32) < 256) {
+        if args.s.is_ascii() {
             return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len()));
         }
         do_codec!(latin_1::encode, args, vm)

From a100178fb408133588416e5b5e92c1d4e23f10d2 Mon Sep 17 00:00:00 2001
From: Padraic Fanning <fanninpm@miamioh.edu>
Date: Fri, 24 Sep 2021 19:44:59 -0400
Subject: [PATCH 4/4] Account for `0x80..=0xff` range in memory

---
 common/src/encodings.rs | 57 ++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/common/src/encodings.rs b/common/src/encodings.rs
index 0a7758e1e0f..bbbc5e8202b 100644
--- a/common/src/encodings.rs
+++ b/common/src/encodings.rs
@@ -189,39 +189,50 @@ pub mod latin_1 {
             match data
                 .char_indices()
                 .enumerate()
-                .find(|(_, (_, c))| (*c as u32) > 255)
+                .find(|(_, (_, c))| !c.is_ascii())
             {
                 None => {
                     out.extend_from_slice(data.as_bytes());
                     break;
                 }
-                Some((char_i, (byte_i, _))) => {
+                Some((char_i, (byte_i, ch))) => {
                     out.extend_from_slice(&data.as_bytes()[..byte_i]);
                     let char_start = char_data_index + char_i;
-                    // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
-                    let non_latin_1_run_length = data[byte_i..]
-                        .chars()
-                        .take_while(|c| (*c as u32) > 255)
-                        .count();
-                    let char_range = char_start..char_start + non_latin_1_run_length;
-                    let (replace, char_restart) =
-                        errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
-                    match replace {
-                        EncodeReplace::Str(s) => {
-                            if s.as_ref().chars().any(|c| (c as u32) > 255) {
-                                return Err(
-                                    errors.error_encoding(full_data, char_range, ERR_REASON)
-                                );
+                    if (ch as u32) <= 255 {
+                        out.push(ch as u8);
+                        let char_restart = char_start + 1;
+                        data = crate::str::try_get_chars(full_data, char_restart..)
+                            .ok_or_else(|| errors.error_oob_restart(char_restart))?;
+                        char_data_index = char_restart;
+                    } else {
+                        // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
+                        let non_latin_1_run_length = data[byte_i..]
+                            .chars()
+                            .take_while(|c| (*c as u32) > 255)
+                            .count();
+                        let char_range = char_start..char_start + non_latin_1_run_length;
+                        let (replace, char_restart) = errors.handle_encode_error(
+                            full_data,
+                            char_range.clone(),
+                            ERR_REASON,
+                        )?;
+                        match replace {
+                            EncodeReplace::Str(s) => {
+                                if s.as_ref().chars().any(|c| (c as u32) > 255) {
+                                    return Err(
+                                        errors.error_encoding(full_data, char_range, ERR_REASON)
+                                    );
+                                }
+                                out.extend(s.as_ref().chars().map(|c| c as u8)); // not UTF-8 bytes
+                            }
+                            EncodeReplace::Bytes(b) => {
+                                out.extend_from_slice(b.as_ref());
                             }
-                            out.extend_from_slice(s.as_ref().as_bytes());
-                        }
-                        EncodeReplace::Bytes(b) => {
-                            out.extend_from_slice(b.as_ref());
                         }
+                        data = crate::str::try_get_chars(full_data, char_restart..)
+                            .ok_or_else(|| errors.error_oob_restart(char_restart))?;
+                        char_data_index = char_restart;
                     }
-                    data = crate::str::try_get_chars(full_data, char_restart..)
-                        .ok_or_else(|| errors.error_oob_restart(char_restart))?;
-                    char_data_index = char_restart;
                     continue;
                 }
             }