@@ -77,8 +77,83 @@ STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
7777}
7878MP_DEFINE_CONST_FUN_OBJ_2 (match_group_obj , match_group );
7979
80+ #if MICROPY_PY_URE_MATCH_GROUPS
81+
82+ STATIC mp_obj_t match_groups (mp_obj_t self_in ) {
83+ mp_obj_match_t * self = MP_OBJ_TO_PTR (self_in );
84+ if (self -> num_matches <= 1 ) {
85+ return mp_const_empty_tuple ;
86+ }
87+ mp_obj_tuple_t * groups = MP_OBJ_TO_PTR (mp_obj_new_tuple (self -> num_matches - 1 , NULL ));
88+ for (int i = 1 ; i < self -> num_matches ; ++ i ) {
89+ groups -> items [i - 1 ] = match_group (self_in , MP_OBJ_NEW_SMALL_INT (i ));
90+ }
91+ return MP_OBJ_FROM_PTR (groups );
92+ }
93+ MP_DEFINE_CONST_FUN_OBJ_1 (match_groups_obj , match_groups );
94+
95+ #endif
96+
97+ #if MICROPY_PY_URE_MATCH_SPAN_START_END
98+
99+ STATIC void match_span_helper (size_t n_args , const mp_obj_t * args , mp_obj_t span [2 ]) {
100+ mp_obj_match_t * self = MP_OBJ_TO_PTR (args [0 ]);
101+
102+ mp_int_t no = 0 ;
103+ if (n_args == 2 ) {
104+ no = mp_obj_get_int (args [1 ]);
105+ if (no < 0 || no >= self -> num_matches ) {
106+ nlr_raise (mp_obj_new_exception_arg1 (& mp_type_IndexError , args [1 ]));
107+ }
108+ }
109+
110+ mp_int_t s = -1 ;
111+ mp_int_t e = -1 ;
112+ const char * start = self -> caps [no * 2 ];
113+ if (start != NULL ) {
114+ // have a match for this group
115+ const char * begin = mp_obj_str_get_str (self -> str );
116+ s = start - begin ;
117+ e = self -> caps [no * 2 + 1 ] - begin ;
118+ }
119+
120+ span [0 ] = mp_obj_new_int (s );
121+ span [1 ] = mp_obj_new_int (e );
122+ }
123+
124+ STATIC mp_obj_t match_span (size_t n_args , const mp_obj_t * args ) {
125+ mp_obj_t span [2 ];
126+ match_span_helper (n_args , args , span );
127+ return mp_obj_new_tuple (2 , span );
128+ }
129+ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (match_span_obj , 1 , 2 , match_span );
130+
131+ STATIC mp_obj_t match_start (size_t n_args , const mp_obj_t * args ) {
132+ mp_obj_t span [2 ];
133+ match_span_helper (n_args , args , span );
134+ return span [0 ];
135+ }
136+ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (match_start_obj , 1 , 2 , match_start );
137+
138+ STATIC mp_obj_t match_end (size_t n_args , const mp_obj_t * args ) {
139+ mp_obj_t span [2 ];
140+ match_span_helper (n_args , args , span );
141+ return span [1 ];
142+ }
143+ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (match_end_obj , 1 , 2 , match_end );
144+
145+ #endif
146+
80147STATIC const mp_rom_map_elem_t match_locals_dict_table [] = {
81148 { MP_ROM_QSTR (MP_QSTR_group ), MP_ROM_PTR (& match_group_obj ) },
149+ #if MICROPY_PY_URE_MATCH_GROUPS
150+ { MP_ROM_QSTR (MP_QSTR_groups ), MP_ROM_PTR (& match_groups_obj ) },
151+ #endif
152+ #if MICROPY_PY_URE_MATCH_SPAN_START_END
153+ { MP_ROM_QSTR (MP_QSTR_span ), MP_ROM_PTR (& match_span_obj ) },
154+ { MP_ROM_QSTR (MP_QSTR_start ), MP_ROM_PTR (& match_start_obj ) },
155+ { MP_ROM_QSTR (MP_QSTR_end ), MP_ROM_PTR (& match_end_obj ) },
156+ #endif
82157};
83158
84159STATIC MP_DEFINE_CONST_DICT (match_locals_dict , match_locals_dict_table );
@@ -103,6 +178,35 @@ STATIC mp_obj_t ure_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
103178 size_t len ;
104179 subj .begin = mp_obj_str_get_data (args [1 ], & len );
105180 subj .end = subj .begin + len ;
181+ #if MICROPY_PY_URE_MATCH_SPAN_START_END
182+ if (n_args > 2 ) {
183+ const mp_obj_type_t * self_type = mp_obj_get_type (args [1 ]);
184+ mp_int_t str_len = MP_OBJ_SMALL_INT_VALUE (mp_obj_len_maybe (args [1 ]));
185+ const byte * begin = (const byte * )subj .begin ;
186+
187+ int pos = mp_obj_get_int (args [2 ]);
188+ if (pos >= str_len ) {
189+ return mp_const_none ;
190+ }
191+ if (pos < 0 ) {
192+ pos = 0 ;
193+ }
194+ const byte * pos_ptr = str_index_to_ptr (self_type , begin , len , MP_OBJ_NEW_SMALL_INT (pos ), true);
195+
196+ const byte * endpos_ptr = (const byte * )subj .end ;
197+ if (n_args > 3 ) {
198+ int endpos = mp_obj_get_int (args [3 ]);
199+ if (endpos <= pos ) {
200+ return mp_const_none ;
201+ }
202+ // Will cap to length
203+ endpos_ptr = str_index_to_ptr (self_type , begin , len , args [3 ], true);
204+ }
205+
206+ subj .begin = (const char * )pos_ptr ;
207+ subj .end = (const char * )endpos_ptr ;
208+ }
209+ #endif
106210 int caps_num = (self -> re .sub + 1 ) * 2 ;
107211 mp_obj_match_t * match = m_new_obj_var (mp_obj_match_t , char * , caps_num );
108212 // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
@@ -174,10 +278,127 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
174278}
175279MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (re_split_obj , 2 , 3 , re_split );
176280
281+ #if MICROPY_PY_URE_SUB
282+
283+ STATIC mp_obj_t re_sub_helper (mp_obj_t self_in , size_t n_args , const mp_obj_t * args ) {
284+ mp_obj_re_t * self = MP_OBJ_TO_PTR (self_in );
285+ mp_obj_t replace = args [1 ];
286+ mp_obj_t where = args [2 ];
287+ mp_int_t count = 0 ;
288+ if (n_args > 3 ) {
289+ count = mp_obj_get_int (args [3 ]);
290+ // Note: flags are currently ignored
291+ }
292+
293+ size_t where_len ;
294+ const char * where_str = mp_obj_str_get_data (where , & where_len );
295+ Subject subj ;
296+ subj .begin = where_str ;
297+ subj .end = subj .begin + where_len ;
298+ int caps_num = (self -> re .sub + 1 ) * 2 ;
299+
300+ vstr_t vstr_return ;
301+ vstr_return .buf = NULL ; // We'll init the vstr after the first match
302+ mp_obj_match_t * match = mp_local_alloc (sizeof (mp_obj_match_t ) + caps_num * sizeof (char * ));
303+ match -> base .type = & match_type ;
304+ match -> num_matches = caps_num / 2 ; // caps_num counts start and end pointers
305+ match -> str = where ;
306+
307+ for (;;) {
308+ // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
309+ memset ((char * )match -> caps , 0 , caps_num * sizeof (char * ));
310+ int res = re1_5_recursiveloopprog (& self -> re , & subj , match -> caps , caps_num , false);
311+
312+ // If we didn't have a match, or had an empty match, it's time to stop
313+ if (!res || match -> caps [0 ] == match -> caps [1 ]) {
314+ break ;
315+ }
316+
317+ // Initialise the vstr if it's not already
318+ if (vstr_return .buf == NULL ) {
319+ vstr_init (& vstr_return , match -> caps [0 ] - subj .begin );
320+ }
321+
322+ // Add pre-match string
323+ vstr_add_strn (& vstr_return , subj .begin , match -> caps [0 ] - subj .begin );
324+
325+ // Get replacement string
326+ const char * repl = mp_obj_str_get_str ((mp_obj_is_callable (replace ) ? mp_call_function_1 (replace , MP_OBJ_FROM_PTR (match )) : replace ));
327+
328+ // Append replacement string to result, substituting any regex groups
329+ while (* repl != '\0' ) {
330+ if (* repl == '\\' ) {
331+ ++ repl ;
332+ bool is_g_format = false;
333+ if (* repl == 'g' && repl [1 ] == '<' ) {
334+ // Group specified with syntax "\g<number>"
335+ repl += 2 ;
336+ is_g_format = true;
337+ }
338+
339+ if ('0' <= * repl && * repl <= '9' ) {
340+ // Group specified with syntax "\g<number>" or "\number"
341+ unsigned int match_no = 0 ;
342+ do {
343+ match_no = match_no * 10 + (* repl ++ - '0' );
344+ } while ('0' <= * repl && * repl <= '9' );
345+ if (is_g_format && * repl == '>' ) {
346+ ++ repl ;
347+ }
348+
349+ if (match_no >= (unsigned int )match -> num_matches ) {
350+ nlr_raise (mp_obj_new_exception_arg1 (& mp_type_IndexError , MP_OBJ_NEW_SMALL_INT (match_no )));
351+ }
352+
353+ const char * start_match = match -> caps [match_no * 2 ];
354+ if (start_match != NULL ) {
355+ // Add the substring matched by group
356+ const char * end_match = match -> caps [match_no * 2 + 1 ];
357+ vstr_add_strn (& vstr_return , start_match , end_match - start_match );
358+ }
359+ }
360+ } else {
361+ // Just add the current byte from the replacement string
362+ vstr_add_byte (& vstr_return , * repl ++ );
363+ }
364+ }
365+
366+ // Move start pointer to end of last match
367+ subj .begin = match -> caps [1 ];
368+
369+ // Stop substitutions if count was given and gets to 0
370+ if (count > 0 && -- count == 0 ) {
371+ break ;
372+ }
373+ }
374+
375+ mp_local_free (match );
376+
377+ if (vstr_return .buf == NULL ) {
378+ // Optimisation for case of no substitutions
379+ return where ;
380+ }
381+
382+ // Add post-match string
383+ vstr_add_strn (& vstr_return , subj .begin , subj .end - subj .begin );
384+
385+ return mp_obj_new_str_from_vstr (mp_obj_get_type (where ), & vstr_return );
386+ }
387+
388+ STATIC mp_obj_t re_sub (size_t n_args , const mp_obj_t * args ) {
389+ return re_sub_helper (args [0 ], n_args , args );
390+ }
391+ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (re_sub_obj , 3 , 5 , re_sub );
392+
393+ #endif
394+
177395STATIC const mp_rom_map_elem_t re_locals_dict_table [] = {
178396 { MP_ROM_QSTR (MP_QSTR_match ), MP_ROM_PTR (& re_match_obj ) },
179397 { MP_ROM_QSTR (MP_QSTR_search ), MP_ROM_PTR (& re_search_obj ) },
180398 { MP_ROM_QSTR (MP_QSTR_split ), MP_ROM_PTR (& re_split_obj ) },
399+ #if MICROPY_PY_URE_SUB
400+ { MP_ROM_QSTR (MP_QSTR_sub ), MP_ROM_PTR (& re_sub_obj ) },
401+ #endif
181402};
182403
183404STATIC MP_DEFINE_CONST_DICT (re_locals_dict , re_locals_dict_table );
@@ -232,11 +453,22 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
232453}
233454MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (mod_re_search_obj , 2 , 4 , mod_re_search );
234455
456+ #if MICROPY_PY_URE_SUB
457+ STATIC mp_obj_t mod_re_sub (size_t n_args , const mp_obj_t * args ) {
458+ mp_obj_t self = mod_re_compile (1 , args );
459+ return re_sub_helper (self , n_args , args );
460+ }
461+ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN (mod_re_sub_obj , 3 , 5 , mod_re_sub );
462+ #endif
463+
235464STATIC const mp_rom_map_elem_t mp_module_re_globals_table [] = {
236465 { MP_ROM_QSTR (MP_QSTR___name__ ), MP_ROM_QSTR (MP_QSTR_ure ) },
237466 { MP_ROM_QSTR (MP_QSTR_compile ), MP_ROM_PTR (& mod_re_compile_obj ) },
238467 { MP_ROM_QSTR (MP_QSTR_match ), MP_ROM_PTR (& mod_re_match_obj ) },
239468 { MP_ROM_QSTR (MP_QSTR_search ), MP_ROM_PTR (& mod_re_search_obj ) },
469+ #if MICROPY_PY_URE_SUB
470+ { MP_ROM_QSTR (MP_QSTR_sub ), MP_ROM_PTR (& mod_re_sub_obj ) },
471+ #endif
240472 { MP_ROM_QSTR (MP_QSTR_DEBUG ), MP_ROM_INT (FLAG_DEBUG ) },
241473};
242474
0 commit comments