@@ -67,14 +67,100 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
6767 nlr_raise (mp_obj_new_exception_msg (& mp_type_OSError , "Operation not supported" ));
6868 }
6969
70+ // What to do if sz < -1? Python docs don't specify this case.
71+ // CPython does a readall, but here we silently let negatives through,
72+ // and they will cause a MemoryError.
7073 mp_int_t sz ;
7174 if (n_args == 1 || ((sz = mp_obj_get_int (args [1 ])) == -1 )) {
7275 return stream_readall (args [0 ]);
7376 }
7477
7578 #if MICROPY_PY_BUILTINS_STR_UNICODE
7679 if (!o -> type -> stream_p -> is_bytes ) {
77- mp_not_implemented ("Reading from unicode text streams by character count" );
80+ // We need to read sz number of unicode characters. Because we don't have any
81+ // buffering, and because the stream API can only read bytes, we must read here
82+ // in units of bytes and must never over-read. If we want sz chars, then reading
83+ // sz bytes will never over-read, so we follow this approach, in a loop to keep
84+ // reading until we have exactly enough chars. This will be 1 read for text
85+ // with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
86+ // chars. For text with lots of non-ASCII chars, it'll be pretty inefficient
87+ // in time and memory.
88+
89+ vstr_t vstr ;
90+ vstr_init (& vstr , sz );
91+ mp_uint_t more_bytes = sz ;
92+ mp_uint_t last_buf_offset = 0 ;
93+ while (more_bytes > 0 ) {
94+ char * p = vstr_add_len (& vstr , more_bytes );
95+ if (p == NULL ) {
96+ nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_MemoryError , "out of memory" ));
97+ }
98+ int error ;
99+ mp_int_t out_sz = o -> type -> stream_p -> read (o , p , more_bytes , & error );
100+ if (out_sz == -1 ) {
101+ vstr_cut_tail_bytes (& vstr , more_bytes );
102+ if (is_nonblocking_error (error )) {
103+ // With non-blocking streams, we read as much as we can.
104+ // If we read nothing, return None, just like read().
105+ // Otherwise, return data read so far.
106+ // TODO what if we have read only half a non-ASCII char?
107+ if (vstr .len == 0 ) {
108+ vstr_clear (& vstr );
109+ return mp_const_none ;
110+ }
111+ break ;
112+ }
113+ nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_OSError , "[Errno %d]" , error ));
114+ }
115+
116+ if (out_sz == 0 ) {
117+ // Finish reading.
118+ // TODO what if we have read only half a non-ASCII char?
119+ vstr_cut_tail_bytes (& vstr , more_bytes );
120+ break ;
121+ }
122+
123+ // count chars from bytes just read
124+ for (mp_uint_t off = last_buf_offset ;;) {
125+ byte b = vstr .buf [off ];
126+ int n ;
127+ if (!UTF8_IS_NONASCII (b )) {
128+ // 1-byte ASCII char
129+ n = 1 ;
130+ } else if ((b & 0xe0 ) == 0xc0 ) {
131+ // 2-byte char
132+ n = 2 ;
133+ } else if ((b & 0xf0 ) == 0xe0 ) {
134+ // 3-byte char
135+ n = 3 ;
136+ } else if ((b & 0xf8 ) == 0xf0 ) {
137+ // 4-byte char
138+ n = 4 ;
139+ } else {
140+ // TODO
141+ n = 5 ;
142+ }
143+ if (off + n <= vstr .len ) {
144+ // got a whole char in n bytes
145+ off += n ;
146+ sz -= 1 ;
147+ last_buf_offset = off ;
148+ if (off >= vstr .len ) {
149+ more_bytes = sz ;
150+ break ;
151+ }
152+ } else {
153+ // didn't get a whole char, so work out how many extra bytes are needed for
154+ // this partial char, plus bytes for additional chars that we want
155+ more_bytes = (off + n - vstr .len ) + (sz - 1 );
156+ break ;
157+ }
158+ }
159+ }
160+
161+ mp_obj_t ret = mp_obj_new_str_of_type (& mp_type_str , (byte * )vstr .buf , vstr .len );
162+ vstr_clear (& vstr );
163+ return ret ;
78164 }
79165 #endif
80166
0 commit comments