Skip to content

Commit 7cce2f6

Browse files
committed
extmod/re1.5: Update to 0.8.
Contains implementation of ?: (non-capturing groups), ?? (non-greedy ?), as well as much improved robustness, and edge cases and error handling by Amir Plivatsky (@ampli).
1 parent 000a127 commit 7cce2f6

File tree

2 files changed

+91
-135
lines changed

2 files changed

+91
-135
lines changed

extmod/re1.5/compilecode.c

Lines changed: 88 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -4,203 +4,175 @@
44

55
#include "re1.5.h"
66

7-
static void insert_code(char *code, int at, int num, int *pc)
8-
{
9-
memmove(code + at + num, code + at, *pc - at);
10-
*pc += num;
11-
}
12-
7+
#define INSERT_CODE(at, num, pc) \
8+
((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
139
#define REL(at, to) (to - at - 2)
10+
#define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at))
11+
#define PC (prog->bytelen)
1412

15-
int re1_5_sizecode(const char *re)
16-
{
17-
int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
18-
19-
for (; *re; re++) {
20-
switch (*re) {
21-
case '\\':
22-
re++;
23-
default:
24-
pc += 2;
25-
break;
26-
case '+':
27-
// Skip entire "+?"
28-
if (re[1] == '?')
29-
re++;
30-
case '?':
31-
pc += 2;
32-
break;
33-
case '.':
34-
case '^':
35-
case '$':
36-
pc++;
37-
break;
38-
case '*':
39-
// Skip entire "*?"
40-
if (re[1] == '?')
41-
re++;
42-
case '|':
43-
case '(':
44-
pc += 4;
45-
break;
46-
case ')':
47-
break;
48-
case '[': {
49-
pc += 2;
50-
re++;
51-
if (*re == '^') re++;
52-
while (*re != ']') {
53-
if (!*re) return -1;
54-
if (re[1] == '-') {
55-
re += 2;
56-
}
57-
pc += 2;
58-
re++;
59-
}
60-
}
61-
}
62-
}
63-
64-
return pc;
65-
}
66-
67-
#define EMIT(at, byte) code[at] = byte
68-
69-
static const char *_compilecode(const char *re, ByteProg *prog)
13+
static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
7014
{
71-
char *code = prog->insts;
72-
int pc = prog->bytelen;
73-
int start = pc;
74-
int term = pc;
15+
char *code = sizecode ? NULL : prog->insts;
16+
int start = PC;
17+
int term = PC;
7518
int alt_label = 0;
7619

7720
for (; *re && *re != ')'; re++) {
7821
switch (*re) {
7922
case '\\':
8023
re++;
24+
if (!*re) return NULL; // Trailing backslash
8125
if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
82-
term = pc;
83-
EMIT(pc++, NamedClass);
84-
EMIT(pc++, *re);
26+
term = PC;
27+
EMIT(PC++, NamedClass);
28+
EMIT(PC++, *re);
8529
prog->len++;
8630
break;
8731
}
8832
default:
89-
term = pc;
90-
EMIT(pc++, Char);
91-
EMIT(pc++, *re);
33+
term = PC;
34+
EMIT(PC++, Char);
35+
EMIT(PC++, *re);
9236
prog->len++;
9337
break;
9438
case '.':
95-
term = pc;
96-
EMIT(pc++, Any);
39+
term = PC;
40+
EMIT(PC++, Any);
9741
prog->len++;
9842
break;
9943
case '[': {
10044
int cnt;
101-
term = pc;
45+
term = PC;
10246
re++;
10347
if (*re == '^') {
104-
EMIT(pc++, ClassNot);
48+
EMIT(PC++, ClassNot);
10549
re++;
10650
} else {
107-
EMIT(pc++, Class);
51+
EMIT(PC++, Class);
10852
}
109-
pc++; // Skip # of pair byte
53+
PC++; // Skip # of pair byte
11054
prog->len++;
11155
for (cnt = 0; *re != ']'; re++, cnt++) {
11256
if (!*re) return NULL;
113-
EMIT(pc++, *re);
57+
EMIT(PC++, *re);
11458
if (re[1] == '-') {
11559
re += 2;
11660
}
117-
EMIT(pc++, *re);
61+
EMIT(PC++, *re);
11862
}
11963
EMIT(term + 1, cnt);
12064
break;
12165
}
12266
case '(': {
123-
term = pc;
124-
int sub = ++prog->sub;
125-
126-
EMIT(pc++, Save);
127-
EMIT(pc++, 2 * sub);
128-
prog->len++;
67+
term = PC;
68+
int sub;
69+
int capture = re[1] != '?' || re[2] != ':';
70+
71+
if (capture) {
72+
sub = ++prog->sub;
73+
EMIT(PC++, Save);
74+
EMIT(PC++, 2 * sub);
75+
prog->len++;
76+
} else {
77+
re += 2;
78+
}
12979

130-
prog->bytelen = pc;
131-
re = _compilecode(re + 1, prog);
80+
re = _compilecode(re + 1, prog, sizecode);
13281
if (re == NULL || *re != ')') return NULL; // error, or no matching paren
133-
pc = prog->bytelen;
13482

135-
EMIT(pc++, Save);
136-
EMIT(pc++, 2 * sub + 1);
137-
prog->len++;
83+
if (capture) {
84+
EMIT(PC++, Save);
85+
EMIT(PC++, 2 * sub + 1);
86+
prog->len++;
87+
}
13888

13989
break;
14090
}
14191
case '?':
142-
if (pc == term) return NULL; // nothing to repeat
143-
insert_code(code, term, 2, &pc);
144-
EMIT(term, Split);
145-
EMIT(term + 1, REL(term, pc));
92+
if (PC == term) return NULL; // nothing to repeat
93+
INSERT_CODE(term, 2, PC);
94+
if (re[1] == '?') {
95+
EMIT(term, RSplit);
96+
re++;
97+
} else {
98+
EMIT(term, Split);
99+
}
100+
EMIT(term + 1, REL(term, PC));
146101
prog->len++;
102+
term = PC;
147103
break;
148104
case '*':
149-
if (pc == term) return NULL; // nothing to repeat
150-
insert_code(code, term, 2, &pc);
151-
EMIT(pc, Jmp);
152-
EMIT(pc + 1, REL(pc, term));
153-
pc += 2;
105+
if (PC == term) return NULL; // nothing to repeat
106+
INSERT_CODE(term, 2, PC);
107+
EMIT(PC, Jmp);
108+
EMIT(PC + 1, REL(PC, term));
109+
PC += 2;
154110
if (re[1] == '?') {
155111
EMIT(term, RSplit);
156112
re++;
157113
} else {
158114
EMIT(term, Split);
159115
}
160-
EMIT(term + 1, REL(term, pc));
116+
EMIT(term + 1, REL(term, PC));
161117
prog->len += 2;
118+
term = PC;
162119
break;
163120
case '+':
164-
if (pc == term) return NULL; // nothing to repeat
121+
if (PC == term) return NULL; // nothing to repeat
165122
if (re[1] == '?') {
166-
EMIT(pc, Split);
123+
EMIT(PC, Split);
167124
re++;
168125
} else {
169-
EMIT(pc, RSplit);
126+
EMIT(PC, RSplit);
170127
}
171-
EMIT(pc + 1, REL(pc, term));
172-
pc += 2;
128+
EMIT(PC + 1, REL(PC, term));
129+
PC += 2;
173130
prog->len++;
131+
term = PC;
174132
break;
175133
case '|':
176134
if (alt_label) {
177-
EMIT(alt_label, REL(alt_label, pc) + 1);
135+
EMIT(alt_label, REL(alt_label, PC) + 1);
178136
}
179-
insert_code(code, start, 2, &pc);
180-
EMIT(pc++, Jmp);
181-
alt_label = pc++;
137+
INSERT_CODE(start, 2, PC);
138+
EMIT(PC++, Jmp);
139+
alt_label = PC++;
182140
EMIT(start, Split);
183-
EMIT(start + 1, REL(start, pc));
141+
EMIT(start + 1, REL(start, PC));
184142
prog->len += 2;
143+
term = PC;
185144
break;
186145
case '^':
187-
EMIT(pc++, Bol);
146+
EMIT(PC++, Bol);
188147
prog->len++;
148+
term = PC;
189149
break;
190150
case '$':
191-
EMIT(pc++, Eol);
151+
EMIT(PC++, Eol);
192152
prog->len++;
153+
term = PC;
193154
break;
194155
}
195156
}
196157

197158
if (alt_label) {
198-
EMIT(alt_label, REL(alt_label, pc) + 1);
159+
EMIT(alt_label, REL(alt_label, PC) + 1);
199160
}
200-
prog->bytelen = pc;
201161
return re;
202162
}
203163

164+
int re1_5_sizecode(const char *re)
165+
{
166+
ByteProg dummyprog = {
167+
// Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
168+
.bytelen = 5 + NON_ANCHORED_PREFIX
169+
};
170+
171+
if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
172+
173+
return dummyprog.bytelen;
174+
}
175+
204176
int re1_5_compilecode(ByteProg *prog, const char *re)
205177
{
206178
prog->len = 0;
@@ -221,7 +193,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
221193
prog->insts[prog->bytelen++] = 0;
222194
prog->len++;
223195

224-
re = _compilecode(re, prog);
196+
re = _compilecode(re, prog, /*sizecode*/0);
225197
if (re == NULL || *re) return 1;
226198

227199
prog->insts[prog->bytelen++] = Save;
@@ -234,25 +206,6 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
234206
return 0;
235207
}
236208

237-
void
238-
cleanmarks(ByteProg *prog)
239-
{
240-
char *pc = prog->insts;
241-
char *end = pc + prog->bytelen;
242-
while (pc < end) {
243-
*pc &= 0x7f;
244-
switch (*pc) {
245-
case Jmp:
246-
case Split:
247-
case RSplit:
248-
case Save:
249-
case Char:
250-
pc++;
251-
}
252-
pc++;
253-
}
254-
}
255-
256209
#if 0
257210
int main(int argc, char *argv[])
258211
{

extmod/re1.5/dumpcode.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ void re1_5_dumpcode(ByteProg *prog)
4444
printf("\n");
4545
break;
4646
}
47+
case NamedClass:
48+
printf("namedclass %c\n", code[pc++]);
49+
break;
4750
case Match:
4851
printf("match\n");
4952
break;

0 commit comments

Comments
 (0)