Skip to content

Commit 1694bc7

Browse files
committed
py: Add stream reading of n unicode chars; unicode support by default.
With unicode enabled, this patch allows reading a fixed number of characters from text-mode streams; e.g. file.read(5) will read 5 unicode chars, which can be made up of more than 5 bytes. For an ASCII stream (i.e. no chars > 127) it only needs to do 1 read. If there are lots of non-ASCII chars in a stream, then it needs multiple reads of the underlying object. Adds a new test for this case. Enables unicode support by default on unix and stmhal ports.
1 parent 02bc882 commit 1694bc7

6 files changed

Lines changed: 103 additions & 4 deletions

File tree

py/stream.c

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,100 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
6767
nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported"));
6868
}
6969

70+
// What to do if sz < -1? Python docs don't specify this case.
71+
// CPython does a readall, but here we silently let negatives through,
72+
// and they will cause a MemoryError.
7073
mp_int_t sz;
7174
if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
7275
return stream_readall(args[0]);
7376
}
7477

7578
#if MICROPY_PY_BUILTINS_STR_UNICODE
7679
if (!o->type->stream_p->is_bytes) {
77-
mp_not_implemented("Reading from unicode text streams by character count");
80+
// We need to read sz number of unicode characters. Because we don't have any
81+
// buffering, and because the stream API can only read bytes, we must read here
82+
// in units of bytes and must never over read. If we want sz chars, then reading
83+
// sz bytes will never over-read, so we follow this approach, in a loop to keep
84+
// reading until we have exactly enough chars. This will be 1 read for text
85+
// with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
86+
// chars. For text with lots of non-ASCII chars, it'll be pretty inefficient
87+
// in time and memory.
88+
89+
vstr_t vstr;
90+
vstr_init(&vstr, sz);
91+
mp_uint_t more_bytes = sz;
92+
mp_uint_t last_buf_offset = 0;
93+
while (more_bytes > 0) {
94+
char *p = vstr_add_len(&vstr, more_bytes);
95+
if (p == NULL) {
96+
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory"));
97+
}
98+
int error;
99+
mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error);
100+
if (out_sz == -1) {
101+
vstr_cut_tail_bytes(&vstr, more_bytes);
102+
if (is_nonblocking_error(error)) {
103+
// With non-blocking streams, we read as much as we can.
104+
// If we read nothing, return None, just like read().
105+
// Otherwise, return data read so far.
106+
// TODO what if we have read only half a non-ASCII char?
107+
if (vstr.len == 0) {
108+
vstr_clear(&vstr);
109+
return mp_const_none;
110+
}
111+
break;
112+
}
113+
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error));
114+
}
115+
116+
if (out_sz == 0) {
117+
// Finish reading.
118+
// TODO what if we have read only half a non-ASCII char?
119+
vstr_cut_tail_bytes(&vstr, more_bytes);
120+
break;
121+
}
122+
123+
// count chars from bytes just read
124+
for (mp_uint_t off = last_buf_offset;;) {
125+
byte b = vstr.buf[off];
126+
int n;
127+
if (!UTF8_IS_NONASCII(b)) {
128+
// 1-byte ASCII char
129+
n = 1;
130+
} else if ((b & 0xe0) == 0xc0) {
131+
// 2-byte char
132+
n = 2;
133+
} else if ((b & 0xf0) == 0xe0) {
134+
// 3-byte char
135+
n = 3;
136+
} else if ((b & 0xf8) == 0xf0) {
137+
// 4-byte char
138+
n = 4;
139+
} else {
140+
// TODO
141+
n = 5;
142+
}
143+
if (off + n <= vstr.len) {
144+
// got a whole char in n bytes
145+
off += n;
146+
sz -= 1;
147+
last_buf_offset = off;
148+
if (off >= vstr.len) {
149+
more_bytes = sz;
150+
break;
151+
}
152+
} else {
153+
// didn't get a whole char, so work out how many extra bytes are needed for
154+
// this partial char, plus bytes for additional chars that we want
155+
more_bytes = (off + n - vstr.len) + (sz - 1);
156+
break;
157+
}
158+
}
159+
}
160+
161+
mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len);
162+
vstr_clear(&vstr);
163+
return ret;
78164
}
79165
#endif
80166

stmhal/mpconfigport.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
*/
4545
#define MICROPY_ENABLE_LFN (1)
4646
#define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */
47-
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
47+
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
4848
#define MICROPY_PY_BUILTINS_FROZENSET (1)
4949
#define MICROPY_PY_SYS_EXIT (1)
5050
#define MICROPY_PY_SYS_STDFILES (1)

tests/run-tests

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def main():
134134
if args.test_dirs is None:
135135
if pyb is None:
136136
# run PC tests
137-
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc')
137+
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode')
138138
else:
139139
# run pyboard tests
140140
test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm')

tests/unicode/data/utf-8_2.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
aαbβcγdδ

tests/unicode/file2.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# test reading a given number of characters
2+
3+
def do(mode):
4+
f = open('unicode/data/utf-8_2.txt', mode)
5+
print(f.read(1))
6+
print(f.read(1))
7+
print(f.read(2))
8+
print(f.read(4))
9+
f.close()
10+
11+
do('rb')
12+
do('rt')

unix/mpconfigport.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
#define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ)
4444
#define MICROPY_STREAMS_NON_BLOCK (1)
4545
#define MICROPY_OPT_COMPUTED_GOTO (1)
46-
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
46+
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
4747
#define MICROPY_PY_BUILTINS_FROZENSET (1)
4848
#define MICROPY_PY_SYS_EXIT (1)
4949
#define MICROPY_PY_SYS_PLATFORM "linux"

0 commit comments

Comments
 (0)