Skip to content

Commit d1a4812

Browse files
committed
Implement _codecs.utf_8_decode in rust
1 parent 58fb28a commit d1a4812

7 files changed

Lines changed: 239 additions & 257 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/_pycodecs.py

Lines changed: 0 additions & 251 deletions
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,6 @@ def escape_encode( obj, errors='strict'):
8585
v = s[1:-1]
8686
return v, len(v)
8787

88-
def utf_8_decode(data, errors='strict', final=False):
    """Decode UTF-8 encoded ``data`` and return ``(text, consumed)``.

    data   -- indexable sequence of byte values to decode.
    errors -- name of the error handler, forwarded unchanged to
              PyUnicode_DecodeUTF8Stateful.
    final  -- forwarded unchanged; when false, a truncated trailing
              sequence is left undecoded instead of being reported.

    ``text`` is the decoded string and ``consumed`` is the position the
    decoder stopped at, exactly as reported by PyUnicode_DecodeUTF8Stateful.
    """
    # The stateful decoder hands back the decoded characters as a sequence of
    # string fragments; join them into one str for the caller.
    res, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
    res = ''.join(res)
    return res, consumed
97-
9888
def raw_unicode_escape_decode( data, errors='strict'):
9989
"""None
10090
"""
@@ -324,13 +314,6 @@ def raw_unicode_escape_encode( obj, errors='strict'):
324314
res = bytes(res)
325315
return res, len(res)
326316

327-
def utf_8_encode(obj, errors='strict'):
    """Encode the string ``obj`` as UTF-8 and return ``(data, consumed)``.

    obj    -- the str to encode.
    errors -- name of the error handler, forwarded to PyUnicode_EncodeUTF8.

    ``data`` is a bytes object. ``consumed`` is the number of input
    characters that were encoded (``len(obj)``), which is what the codec
    protocol expects -- not the length of the encoded output.
    """
    res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
    return bytes(res), len(obj)
333-
334317
def utf_16_le_encode( obj, errors='strict'):
335318
"""None
336319
"""
@@ -882,240 +865,6 @@ def unicode_call_errorhandler(errors, encoding,
882865
else:
883866
raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
884867

885-
def PyUnicode_DecodeUTF8(s, size, errors):
    """Decode ``size`` bytes of ``s`` via the stateful decoder with ``final=False``.

    Returns whatever PyUnicode_DecodeUTF8Stateful returns for those arguments.
    """
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, False)
887-
888-
# Map a UTF-8 lead (prefix) byte to the length of the sequence it introduces.
# Zero means the byte is an illegal prefix.  See RFC 2279 for details.
utf8_code_length = (
    [1] * 128    # 0x00-0x7F: single-byte (ASCII)
    + [0] * 64   # 0x80-0xBF: illegal as a prefix byte
    + [2] * 32   # 0xC0-0xDF: two-byte sequences
    + [3] * 16   # 0xE0-0xEF: three-byte sequences
    + [4] * 8    # 0xF0-0xF7: four-byte sequences
    + [5] * 4    # 0xF8-0xFB: five-byte form
    + [6] * 2    # 0xFC-0xFD: six-byte form
    + [0] * 2    # 0xFE-0xFF: illegal as a prefix byte
)
908-
909-
def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
    """Decode the first ``size`` bytes of ``s`` as UTF-8.

    s      -- indexable sequence of byte values (ints).
    size   -- number of bytes of ``s`` to look at.
    errors -- error handler name, passed to unicode_call_errorhandler.
    final  -- when false, a sequence that is cut off by the end of the
              input stops decoding instead of being reported as an error.

    Returns ``(pieces, pos)``: ``pieces`` is a list of decoded string
    fragments (or ``''`` when ``size`` is 0) and ``pos`` is the input
    position at which decoding stopped.

    Every malformed sequence is routed through unicode_call_errorhandler,
    whose ``(replacement, new_position)`` result is appended and used as
    the position to resume from.
    """
    if size == 0:
        return '', 0

    # Smallest code point that may legitimately use an n-byte sequence;
    # anything below it is an over-long ("illegal") encoding.
    smallest = {2: 0x80, 3: 0x0800, 4: 0x10000}

    p = []
    pos = 0
    while pos < size:
        ch = s[pos]
        if ch < 0x80:
            # ASCII fast path.
            p.append(chr(ch))
            pos += 1
            continue

        n = utf8_code_length[ch]
        startinpos = pos
        if startinpos + n > size:
            if not final:
                # Truncated sequence: leave it for the next call.
                break
            # Report the truncation and resume where the handler says;
            # the sequence must not be inspected any further, since the
            # bytes it would need are not there.
            errmsg = "unexpected end of data"
            endinpos = size
        elif n == 0:
            errmsg = "unexpected code byte"
            endinpos = startinpos + 1
        elif n == 1:
            # Length 1 is only assigned to bytes below 0x80, which were
            # handled by the fast path above.
            errmsg = "internal error"
            endinpos = startinpos + 1
        elif n > 4:
            # Other sizes are only needed for UCS-4.
            errmsg = "unsupported Unicode code range"
            endinpos = startinpos + n
        elif any((s[k] & 0xc0) != 0x80 for k in range(pos + 1, pos + n)):
            # Every trailing byte must look like 10xxxxxx.
            errmsg = "invalid data"
            endinpos = startinpos + n
        else:
            # The lead byte contributes its low (7 - n) bits, each trailing
            # byte its low 6 bits.
            c = ch & (0xff >> (n + 1))
            for k in range(pos + 1, pos + n):
                c = (c << 6) | (s[k] & 0x3f)
            # Note: UTF-8 encodings of surrogates are considered legal
            # UTF-8 sequences here.
            if c < smallest[n] or c > 0x10ffff:
                errmsg = "illegal encoding"
                endinpos = startinpos + n
            else:
                p.append(chr(c))
                pos += n
                continue

        res = unicode_call_errorhandler(
            errors, "utf8", errmsg,
            s, startinpos, endinpos)
        p += res[0]
        pos = res[1]

    return p, pos
1071-
1072-
def PyUnicode_EncodeUTF8(s, size, errors):
    """Encode the first ``size`` characters of ``s`` as UTF-8.

    s      -- the str to encode.
    size   -- number of characters of ``s`` to encode; must be >= 0.
    errors -- accepted for signature compatibility; this function never
              consults it.

    Returns a bytearray.  A high surrogate immediately followed by a low
    surrogate is combined into one code point and emitted as four bytes;
    any other surrogate is emitted as an ordinary three-byte sequence.
    """
    assert(size >= 0)
    p = bytearray()
    i = 0
    while i < size:
        code = ord(s[i])
        i += 1
        if code < 0x80:
            # One byte: ASCII.
            p.append(code)
        elif code < 0x0800:
            # Two bytes: U+0080..U+07FF.
            p.append(0xc0 | (code >> 6))
            p.append(0x80 | (code & 0x3f))
        elif code < 0x10000:
            # Special case: a high surrogate that still has a successor.
            if 0xD800 <= code <= 0xDBFF and i != size:
                low = ord(s[i])
                # Check for a low surrogate and combine the two to form a
                # single UCS4 value.
                if 0xDC00 <= low <= 0xDFFF:
                    combined = ((code - 0xD800) << 10 | (low - 0xDC00)) + 0x10000
                    i += 1
                    p += encodeUCS4(combined)
                    continue
            # Three bytes: the rest of the BMP, including isolated surrogates.
            p.append(0xe0 | (code >> 12))
            p.append(0x80 | ((code >> 6) & 0x3f))
            p.append(0x80 | (code & 0x3f))
        else:
            p += encodeUCS4(code)
    return p
1109-
1110-
def encodeUCS4(ch):
    """Return the four-byte UTF-8 encoding of the code point ``ch``.

    ch -- integer code point; the result is only meaningful for values
          that need four bytes (0x10000 and above).

    Returns a new bytearray of length 4.
    """
    return bytearray((
        0xf0 | (ch >> 18),            # lead byte: 11110xxx
        0x80 | ((ch >> 12) & 0x3f),   # three trailing bytes: 10xxxxxx each
        0x80 | ((ch >> 6) & 0x3f),
        0x80 | (ch & 0x3f),
    ))
1118-
1119868
#/* --- Latin-1 Codec ------------------------------------------------------ */
1120869

1121870
def PyUnicode_DecodeLatin1(s, size, errors):

common/src/encodings.rs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
use std::ops::Range;
2+
3+
pub trait ErrorHandler {
4+
type Error;
5+
type StrBuf: AsRef<str>;
6+
type BytesBuf: AsRef<[u8]>;
7+
fn handle_encode_error(
8+
&self,
9+
byte_range: Range<usize>,
10+
reason: &str,
11+
) -> Result<(EncodeReplace<Self::StrBuf, Self::BytesBuf>, usize), Self::Error>;
12+
fn handle_decode_error(
13+
&self,
14+
data: &[u8],
15+
byte_range: Range<usize>,
16+
reason: &str,
17+
) -> Result<(Self::StrBuf, Option<Self::BytesBuf>, usize), Self::Error>;
18+
fn error_oob_restart(&self, i: usize) -> Self::Error;
19+
}
20+
/// Replacement produced by [`ErrorHandler::handle_encode_error`]: either
/// text or raw bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodeReplace<S, B> {
    /// Replacement given as text.
    Str(S),
    /// Replacement given as raw bytes.
    Bytes(B),
}
24+
25+
pub mod utf8 {
26+
use super::*;
27+
28+
pub fn encode<E: ErrorHandler>(s: &str, _errors: &E) -> Result<Vec<u8>, E::Error> {
29+
Ok(s.as_bytes().to_vec())
30+
}
31+
32+
pub fn decode<E: ErrorHandler>(
33+
data: &[u8],
34+
errors: &E,
35+
final_decode: bool,
36+
) -> Result<(String, usize), E::Error> {
37+
if data.is_empty() {
38+
return Ok((String::new(), 0));
39+
}
40+
// we need to coerce the lifetime to that of the function body rather than the
41+
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
42+
let mut data = &*data;
43+
let mut data_from_err: E::BytesBuf;
44+
let mut out = String::with_capacity(data.len());
45+
let mut remaining_index = 0;
46+
let mut remaining_data = data;
47+
macro_rules! handle_error {
48+
($range:expr, $reason:expr) => {{
49+
let (replace, new_data, restart) =
50+
errors.handle_decode_error(data, $range, $reason)?;
51+
out.push_str(replace.as_ref());
52+
if let Some(new_data) = new_data {
53+
data_from_err = new_data;
54+
data = data_from_err.as_ref();
55+
}
56+
remaining_data = data
57+
.get(restart..)
58+
.ok_or_else(|| errors.error_oob_restart(restart))?;
59+
remaining_index = restart;
60+
continue;
61+
}};
62+
}
63+
loop {
64+
match core::str::from_utf8(remaining_data) {
65+
Ok(decoded) => {
66+
out.push_str(decoded);
67+
remaining_index += decoded.len();
68+
break;
69+
}
70+
Err(e) => {
71+
let (valid_prefix, rest, first_err) = unsafe {
72+
let index = e.valid_up_to();
73+
// SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
74+
let valid =
75+
std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
76+
let rest = remaining_data.get_unchecked(index..);
77+
// SAFETY: if index didn't have something at it, this wouldn't be an error
78+
let first_err = *remaining_data.get_unchecked(index);
79+
(valid, rest, first_err)
80+
};
81+
out.push_str(valid_prefix);
82+
let err_idx = remaining_index + e.valid_up_to();
83+
remaining_data = rest;
84+
remaining_index += valid_prefix.len();
85+
if (0x80..0xc0).contains(&first_err) {
86+
handle_error!(err_idx..err_idx + 1, "invalid start byte");
87+
}
88+
let err_len = match e.error_len() {
89+
Some(l) => l,
90+
// error_len() == None means unexpected eof
91+
None => {
92+
if !final_decode {
93+
break;
94+
}
95+
handle_error!(err_idx..data.len(), "unexpected end of data");
96+
}
97+
};
98+
if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
99+
// truncated surrogate
100+
break;
101+
}
102+
handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
103+
}
104+
}
105+
}
106+
Ok((out, remaining_index))
107+
}
108+
}

common/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
pub mod borrow;
44
pub mod boxvec;
55
pub mod cmp;
6+
pub mod encodings;
67
pub mod float_ops;
78
pub mod hash;
89
pub mod lock;

0 commit comments

Comments
 (0)