Skip to content

Commit deed087

Browse files
committed
py: str.split: handle non-default separator.
1 parent 36dd19a commit deed087

2 files changed

Lines changed: 78 additions & 25 deletions

File tree

py/objstr.c

Lines changed: 57 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ const mp_obj_t mp_const_empty_bytes;
3333
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
3434
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
3535
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
36+
STATIC void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
3637

3738
/******************************************************************************/
3839
/* str */
@@ -367,38 +368,71 @@ STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
367368
#define is_ws(c) ((c) == ' ' || (c) == '\t')
368369

369370
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
370-
int splits = -1;
371+
machine_int_t splits = -1;
371372
mp_obj_t sep = mp_const_none;
372373
if (n_args > 1) {
373374
sep = args[1];
374375
if (n_args > 2) {
375-
splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
376+
splits = mp_obj_get_int(args[2]);
376377
}
377378
}
378-
assert(sep == mp_const_none);
379-
(void)sep; // unused; to hush compiler warning
379+
380380
mp_obj_t res = mp_obj_new_list(0, NULL);
381381
GET_STR_DATA_LEN(args[0], s, len);
382382
const byte *top = s + len;
383-
const byte *start;
384-
385-
// Initial whitespace is not counted as split, so we pre-do it
386-
while (s < top && is_ws(*s)) s++;
387-
while (s < top && splits != 0) {
388-
start = s;
389-
while (s < top && !is_ws(*s)) s++;
390-
mp_obj_list_append(res, mp_obj_new_str(start, s - start, false));
391-
if (s >= top) {
392-
break;
393-
}
383+
384+
if (sep == mp_const_none) {
385+
// sep not given, so separate on whitespace
386+
387+
// Initial whitespace is not counted as split, so we pre-do it
394388
while (s < top && is_ws(*s)) s++;
395-
if (splits > 0) {
396-
splits--;
389+
while (s < top && splits != 0) {
390+
const byte *start = s;
391+
while (s < top && !is_ws(*s)) s++;
392+
mp_obj_list_append(res, mp_obj_new_str(start, s - start, false));
393+
if (s >= top) {
394+
break;
395+
}
396+
while (s < top && is_ws(*s)) s++;
397+
if (splits > 0) {
398+
splits--;
399+
}
397400
}
398-
}
399401

400-
if (s < top) {
401-
mp_obj_list_append(res, mp_obj_new_str(s, top - s, false));
402+
if (s < top) {
403+
mp_obj_list_append(res, mp_obj_new_str(s, top - s, false));
404+
}
405+
406+
} else {
407+
// sep given
408+
409+
uint sep_len;
410+
const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
411+
412+
if (sep_len == 0) {
413+
nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
414+
}
415+
416+
for (;;) {
417+
const byte *start = s;
418+
for (;;) {
419+
if (splits == 0 || s + sep_len > top) {
420+
s = top;
421+
break;
422+
} else if (memcmp(s, sep_str, sep_len) == 0) {
423+
break;
424+
}
425+
s++;
426+
}
427+
mp_obj_list_append(res, mp_obj_new_str(start, s - start, false));
428+
if (s >= top) {
429+
break;
430+
}
431+
s += sep_len;
432+
if (splits > 0) {
433+
splits--;
434+
}
435+
}
402436
}
403437

404438
return res;
@@ -1052,7 +1086,7 @@ STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, uint n_args, const mp_obj_t
10521086
}
10531087
pfenv_print_int(&pfenv_vstr, arg_as_int(arg), 1, 16, 'A', flags, fill, width);
10541088
break;
1055-
1089+
10561090
default:
10571091
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError,
10581092
"unsupported format character '%c' (0x%x) at index %d",
@@ -1191,8 +1225,7 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
11911225
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
11921226
assert(MP_OBJ_IS_STR(self_in));
11931227
if (!MP_OBJ_IS_STR(arg)) {
1194-
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
1195-
"Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
1228+
bad_implicit_conversion(arg);
11961229
}
11971230

11981231
GET_STR_DATA_LEN(self_in, str, str_len);
@@ -1365,8 +1398,7 @@ bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
13651398
}
13661399
}
13671400

1368-
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
1369-
void bad_implicit_conversion(mp_obj_t self_in) {
1401+
STATIC void bad_implicit_conversion(mp_obj_t self_in) {
13701402
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
13711403
}
13721404

tests/basics/string_split.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,28 @@
1+
# default separator (whitespace)
12
print("a b".split())
23
print(" a b ".split(None))
34
print(" a b ".split(None, 1))
45
print(" a b ".split(None, 2))
56
print(" a b c ".split(None, 1))
67
print(" a b c ".split(None, 0))
78
print(" a b c ".split(None, -1))
9+
10+
# empty separator should fail
11+
try:
12+
"abc".split('')
13+
except ValueError:
14+
print("ValueError")
15+
16+
# non-empty separator
17+
print("abc".split("a"))
18+
print("abc".split("b"))
19+
print("abc".split("c"))
20+
print("abc".split("z"))
21+
print("abc".split("ab"))
22+
print("abc".split("bc"))
23+
print("abc".split("abc"))
24+
print("abc".split("abcd"))
25+
print("abcabc".split("bc"))
26+
print("abcabc".split("bc", 0))
27+
print("abcabc".split("bc", 1))
28+
print("abcabc".split("bc", 2))

0 commit comments

Comments
 (0)