Jack-R-lantern
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Lib/_pycodecs.py‎
Lines changed: 0 additions & 251 deletions b/‎Lib/_pycodecs.py‎
Lines changed: 0 additions & 251 deletions
diff --git a/‎common/src/encodings.rs‎
Lines changed: 108 additions & 0 deletions b/‎common/src/encodings.rs‎
Lines changed: 108 additions & 0 deletions
diff --git a/‎common/src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎common/src/lib.rs‎
Lines changed: 1 addition & 0 deletions
@@ -85,16 +85,6 @@ def escape_encode( obj, errors='strict'):
     v = s[1:-1]
     return v, len(v)
 
-def utf_8_decode( data, errors='strict', final=False):
-    """None
-    """
-    consumed = len(data)
-    if final:
-        consumed = 0
-    res, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
-    res = ''.join(res)
-    return res, consumed
-
 def raw_unicode_escape_decode( data, errors='strict'):
     """None
     """
@@ -324,13 +314,6 @@ def raw_unicode_escape_encode( obj, errors='strict'):
     res = bytes(res)
     return res, len(res)
 
-def utf_8_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
-    res = bytes(res)
-    return res, len(res)
-
 def utf_16_le_encode( obj, errors='strict'):
     """None
     """
@@ -882,240 +865,6 @@ def unicode_call_errorhandler(errors,  encoding,
     else:
         raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
 
-def PyUnicode_DecodeUTF8(s, size, errors):
-    return PyUnicode_DecodeUTF8Stateful(s, size, errors, False)
-
-##    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
-##       illegal prefix.  see RFC 2279 for details */
-utf8_code_length = [
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-]
-
-def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
-    
-    consumed = 0
-    if (size == 0):
-        if not final:
-            consumed = 0
-        return '', consumed
-    p = []
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-        if ch < 0x80:
-            p += chr(ch)
-            pos += 1
-            continue
-        
-        n = utf8_code_length[ch]
-        startinpos =  pos 
-        if (startinpos + n > size):
-            if not final:
-                break
-            else:
-                errmsg = "unexpected end of data"
-                endinpos = size 
-                res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-        if n == 0:
-            errmsg = "unexpected code byte"
-            endinpos = startinpos+1
-            res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-        elif n == 1:
-            errmsg = "internal error"
-            endinpos = startinpos+1
-            res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-        elif n == 2:
-            if ((s[pos+1] & 0xc0) != 0x80):
-                errmsg = "invalid data"
-                endinpos = startinpos+2
-                res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((s[pos] & 0x1f) << 6) + (s[pos+1] & 0x3f)
-                if c < 0x80:
-                    errmsg = "illegal encoding"
-                    endinpos = startinpos+2
-                    res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-                    p += chr(c)
-                    pos += n
-                    #break
-        elif n == 3:
-            if ((s[pos+1] & 0xc0) != 0x80 or
-                    (s[pos+2] & 0xc0) != 0x80):
-                errmsg = "invalid data"
-                endinpos = startinpos+3
-                res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((s[pos] & 0x0f) << 12) + \
-                        ((s[pos+1] & 0x3f) << 6) +\
-                        (s[pos+2] & 0x3f)       
-                        
-##              /* Note: UTF-8 encodings of surrogates are considered
-##                 legal UTF-8 sequences;
-##
-##                 XXX For wide builds (UCS-4) we should probably try
-##                     to recombine the surrogates into a single code
-##                     unit.
-##              */
-                if c < 0x0800:
-                    errmsg = "illegal encoding"
-                    endinpos = startinpos+3
-                    res = unicode_call_errorhandler(
-                                        errors, "utf8", errmsg,
-                                        s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-                    p += chr(c)
-                    pos += n
-        elif n == 4:
-##        case 4:
-            if ((s[pos+1] & 0xc0) != 0x80 or
-                (s[pos+2] & 0xc0) != 0x80 or
-                (s[pos+3] & 0xc0) != 0x80):
-                
-                errmsg = "invalid data"
-                startinpos = pos
-                endinpos = startinpos+4
-                res = unicode_call_errorhandler(
-                            errors, "utf8", errmsg,
-                            s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((s[pos+0] & 0x7) << 18) + ((s[pos+1] & 0x3f) << 12) +\
-                     ((s[pos+2] & 0x3f) << 6) + (s[pos+3] & 0x3f)
-                #/* validate and convert to UTF-16 */
-                if ((c < 0x10000) or (c > 0x10ffff)):
-                    #/* minimum value allowed for 4 byte encoding */
-                    #/* maximum value allowed for UTF-16 */
-           
-                    errmsg = "illegal encoding"
-                    startinpos = pos
-                    endinpos = startinpos+4
-                    res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-#ifdef Py_UNICODE_WIDE
-                    if c < sys.maxunicode:
-                        p += chr(c)
-                        pos += n
-                    else:
-##                /*  compute and append the two surrogates: */
-##                /*  translate from 10000..10FFFF to 0..FFFF */
-                        c -= 0x10000
-            #/*  high surrogate = top 10 bits added to D800 */
-                        p += chr(0xD800 + (c >> 10))
-            #/*  low surrogate = bottom 10 bits added to DC00 */
-                        p += chr(0xDC00 + (c & 0x03FF))
-                        pos += n
-        else:
-##        default:
-##            /* Other sizes are only needed for UCS-4 */
-            errmsg = "unsupported Unicode code range"
-            startinpos = pos
-            endinpos = startinpos+n
-            res = unicode_call_errorhandler(
-                     errors, "utf8", errmsg,
-                     s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-            
-        #continue
-
-    if not final:
-        consumed = pos
-    return p, pos # consumed
-
-def PyUnicode_EncodeUTF8(s, size, errors):
-
-    #assert(s != None)
-    assert(size >= 0)
-    p = bytearray()
-    i = 0
-    while i < size:
-        ch = s[i]
-        i += 1
-        if (ord(ch) < 0x80):
-##         /* Encode ASCII */
-            p.append(ord(ch))
-        elif (ord(ch) < 0x0800) :
-##            /* Encode Latin-1 */
-            p.append(0xc0 | (ord(ch) >> 6))
-            p.append(0x80 | (ord(ch) & 0x3f))
-        else:
-##            /* Encode UCS2 Unicode ordinals */
-            if (ord(ch) < 0x10000):
-##                /* Special case: check for high surrogate */
-                if (0xD800 <= ord(ch) and ord(ch) <= 0xDBFF and i != size) :
-                    ch2 = s[i]
-##                    /* Check for low surrogate and combine the two to
-##                       form a UCS4 value */
-                    if (0xDC00 <= ord(ch2) and ord(ch2) <= 0xDFFF) :
-                        ch3 = ((ord(ch) - 0xD800) << 10 | (ord(ch2) - 0xDC00)) + 0x10000
-                        i += 1
-                        p += encodeUCS4(ch3)
-                        continue
-##                    /* Fall through: handles isolated high surrogates */
-                p.append(0xe0 | (ord(ch) >> 12))
-                p.append(0x80 | ((ord(ch) >> 6) & 0x3f))
-                p.append(0x80 | (ord(ch) & 0x3f))
-                continue
-            else:
-                p += encodeUCS4(ord(ch))
-    return p
-
-def encodeUCS4(ch):
-##      /* Encode UCS4 Unicode ordinals */
-    p = bytearray()
-    p.append(0xf0 | (ch >> 18))
-    p.append(0x80 | ((ch >> 12) & 0x3f))
-    p.append(0x80 | ((ch >> 6) & 0x3f))
-    p.append(0x80 | (ch & 0x3f))
-    return p
-
 #/* --- Latin-1 Codec ------------------------------------------------------ */
 
 def PyUnicode_DecodeLatin1(s, size, errors):
 
@@ -0,0 +1,108 @@
+use std::ops::Range;
+
+/// Strategy invoked by the codecs in this module when they hit input they
+/// cannot encode or decode.
+///
+/// An implementation either returns replacement data plus a position, or
+/// fails with [`ErrorHandler::Error`], which the codec propagates to its caller.
+pub trait ErrorHandler {
+    /// Error type returned when the handler refuses to recover.
+    type Error;
+    /// Owned or borrowed string buffer used for replacement text.
+    type StrBuf: AsRef<str>;
+    /// Owned or borrowed byte buffer used for replacement bytes / replaced input.
+    type BytesBuf: AsRef<[u8]>;
+    /// Called for a problem found while encoding.
+    ///
+    /// `byte_range` locates the offending span and `reason` is a short
+    /// human-readable description. On success returns the replacement (either
+    /// text or raw bytes) together with a `usize`.
+    // NOTE(review): the `usize` is presumably the position at which encoding
+    // should resume, mirroring `handle_decode_error`; no caller is visible in
+    // this file to confirm that.
+    fn handle_encode_error(
+        &self,
+        byte_range: Range<usize>,
+        reason: &str,
+    ) -> Result<(EncodeReplace<Self::StrBuf, Self::BytesBuf>, usize), Self::Error>;
+    /// Called for a problem found while decoding `data`.
+    ///
+    /// `byte_range` is the offending span within `data` and `reason` is a short
+    /// human-readable description. On success returns, in order:
+    ///
+    /// 1. replacement text to append to the decoded output,
+    /// 2. optionally a new input buffer that replaces `data` for the rest of
+    ///    the decode,
+    /// 3. the index (into the current input buffer) at which decoding restarts.
+    fn handle_decode_error(
+        &self,
+        data: &[u8],
+        byte_range: Range<usize>,
+        reason: &str,
+    ) -> Result<(Self::StrBuf, Option<Self::BytesBuf>, usize), Self::Error>;
+    /// Builds the error reported when a handler returns a restart index `i`
+    /// that lies outside the input buffer.
+    fn error_oob_restart(&self, i: usize) -> Self::Error;
+}
+/// Replacement produced by [`ErrorHandler::handle_encode_error`]: either text
+/// (`Str`) or raw bytes (`Bytes`).
+// NOTE(review): presumably `Str` is meant to be re-encoded by the codec while
+// `Bytes` is emitted verbatim; no consumer is visible here to confirm.
+pub enum EncodeReplace<S, B> {
+    /// Replacement given as a string buffer.
+    Str(S),
+    /// Replacement given as a byte buffer.
+    Bytes(B),
+}
+
+/// UTF-8 encoder and decoder driven by an [`ErrorHandler`].
+pub mod utf8 {
+    use super::*;
+
+    /// Encodes `s` as UTF-8 by copying its bytes into a new `Vec<u8>`.
+    ///
+    /// The error handler is never consulted (hence `_errors`): a `&str` is
+    /// already valid UTF-8, so this function always returns `Ok`.
+    pub fn encode<E: ErrorHandler>(s: &str, _errors: &E) -> Result<Vec<u8>, E::Error> {
+        Ok(s.as_bytes().to_vec())
+    }
+
+    /// Decodes `data` as UTF-8, delegating every malformed sequence to `errors`.
+    ///
+    /// Returns the decoded text and the index up to which the input was
+    /// consumed. If the handler substituted a new input buffer, the index
+    /// refers to that buffer.
+    ///
+    /// When `final_decode` is `false`, an incomplete multi-byte sequence at the
+    /// end of the input is not an error: decoding stops in front of it and the
+    /// returned index points at its first byte. When `final_decode` is `true`,
+    /// such a tail is reported to the handler as "unexpected end of data".
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by `errors.handle_decode_error`, and
+    /// returns `errors.error_oob_restart(restart)` if the handler asks to
+    /// restart at an index outside the input buffer.
+    pub fn decode<E: ErrorHandler>(
+        data: &[u8],
+        errors: &E,
+        final_decode: bool,
+    ) -> Result<(String, usize), E::Error> {
+        // Empty input: nothing decoded, nothing consumed.
+        if data.is_empty() {
+            return Ok((String::new(), 0));
+        }
+        // we need to coerce the lifetime to that of the function body rather than the
+        // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
+        let mut data = &*data;
+        // Holds a replacement input buffer handed back by the error handler;
+        // only initialised if a handler actually returns one.
+        let mut data_from_err: E::BytesBuf;
+        let mut out = String::with_capacity(data.len());
+        // Absolute index into `data` of the first byte of `remaining_data`.
+        let mut remaining_index = 0;
+        // The not-yet-decoded tail of `data`.
+        let mut remaining_data = data;
+        // Reports `$range` / `$reason` to the handler, appends its replacement
+        // text, optionally swaps in a new input buffer, repositions the cursor
+        // at the handler's restart index and then `continue`s the decode loop.
+        // Because of that `continue`, nothing after an invocation runs in the
+        // same iteration.
+        macro_rules! handle_error {
+            ($range:expr, $reason:expr) => {{
+                let (replace, new_data, restart) =
+                    errors.handle_decode_error(data, $range, $reason)?;
+                out.push_str(replace.as_ref());
+                if let Some(new_data) = new_data {
+                    data_from_err = new_data;
+                    data = data_from_err.as_ref();
+                }
+                // `restart` may be anything the handler chose; reject indexes
+                // past the end of the (possibly replaced) buffer.
+                remaining_data = data
+                    .get(restart..)
+                    .ok_or_else(|| errors.error_oob_restart(restart))?;
+                remaining_index = restart;
+                continue;
+            }};
+        }
+        loop {
+            match core::str::from_utf8(remaining_data) {
+                // The whole remainder is valid: append it and finish.
+                Ok(decoded) => {
+                    out.push_str(decoded);
+                    remaining_index += decoded.len();
+                    break;
+                }
+                Err(e) => {
+                    // Split the remainder into the valid prefix, everything from
+                    // the error onwards, and the first offending byte.
+                    let (valid_prefix, rest, first_err) = unsafe {
+                        let index = e.valid_up_to();
+                        // SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
+                        let valid =
+                            std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
+                        let rest = remaining_data.get_unchecked(index..);
+                        // SAFETY: if index didn't have something at it, this wouldn't be an error
+                        let first_err = *remaining_data.get_unchecked(index);
+                        (valid, rest, first_err)
+                    };
+                    out.push_str(valid_prefix);
+                    // Absolute position of the offending byte within `data`.
+                    let err_idx = remaining_index + e.valid_up_to();
+                    // Advance the cursor past the valid prefix so it sits on the
+                    // offending byte; this is the position reported to the caller
+                    // if one of the `break`s below is taken.
+                    remaining_data = rest;
+                    remaining_index += valid_prefix.len();
+                    // A continuation byte (0b10xx_xxxx) where a sequence should
+                    // start: report just that one byte.
+                    if (0x80..0xc0).contains(&first_err) {
+                        handle_error!(err_idx..err_idx + 1, "invalid start byte");
+                    }
+                    let err_len = match e.error_len() {
+                        Some(l) => l,
+                        // error_len() == None means unexpected eof
+                        None => {
+                            // Not the final chunk: leave the incomplete tail
+                            // unconsumed instead of treating it as an error.
+                            if !final_decode {
+                                break;
+                            }
+                            handle_error!(err_idx..data.len(), "unexpected end of data");
+                        }
+                    };
+                    // The remainder is exactly 0xED followed by 0xA0..=0xBF, i.e.
+                    // the first two bytes of a three-byte encoded surrogate code
+                    // point. On a non-final chunk, stop without consuming it.
+                    // NOTE(review): presumably so a later call can present the
+                    // complete sequence to the error handler — confirm.
+                    if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
+                        // truncated surrogate
+                        break;
+                    }
+                    // Every other malformed sequence is reported with the length
+                    // given by `error_len()`.
+                    handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
+                }
+            }
+        }
+        Ok((out, remaining_index))
+    }
+}
@@ -3,6 +3,7 @@
 pub mod borrow;
 pub mod boxvec;
 pub mod cmp;
+pub mod encodings;
 pub mod float_ops;
 pub mod hash;
 pub mod lock;