44
55#include "re1.5.h"
66
7- static void insert_code (char * code , int at , int num , int * pc )
8- {
9- memmove (code + at + num , code + at , * pc - at );
10- * pc += num ;
11- }
12-
7+ #define INSERT_CODE (at , num , pc ) \
8+ ((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
139#define REL (at , to ) (to - at - 2)
10+ #define EMIT (at , byte ) (code ? (code[at] = byte) : (void)(at))
11+ #define PC (prog->bytelen)
1412
15- int re1_5_sizecode (const char * re )
16- {
17- int pc = 5 + NON_ANCHORED_PREFIX ; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
18-
19- for (; * re ; re ++ ) {
20- switch (* re ) {
21- case '\\' :
22- re ++ ;
23- default :
24- pc += 2 ;
25- break ;
26- case '+' :
27- // Skip entire "+?"
28- if (re [1 ] == '?' )
29- re ++ ;
30- case '?' :
31- pc += 2 ;
32- break ;
33- case '.' :
34- case '^' :
35- case '$' :
36- pc ++ ;
37- break ;
38- case '*' :
39- // Skip entire "*?"
40- if (re [1 ] == '?' )
41- re ++ ;
42- case '|' :
43- case '(' :
44- pc += 4 ;
45- break ;
46- case ')' :
47- break ;
48- case '[' : {
49- pc += 2 ;
50- re ++ ;
51- if (* re == '^' ) re ++ ;
52- while (* re != ']' ) {
53- if (!* re ) return -1 ;
54- if (re [1 ] == '-' ) {
55- re += 2 ;
56- }
57- pc += 2 ;
58- re ++ ;
59- }
60- }
61- }
62- }
63-
64- return pc ;
65- }
66-
67- #define EMIT (at , byte ) code[at] = byte
68-
69- static const char * _compilecode (const char * re , ByteProg * prog )
13+ static const char * _compilecode (const char * re , ByteProg * prog , int sizecode )
7014{
71- char * code = prog -> insts ;
72- int pc = prog -> bytelen ;
73- int start = pc ;
74- int term = pc ;
15+ char * code = sizecode ? NULL : prog -> insts ;
16+ int start = PC ;
17+ int term = PC ;
7518 int alt_label = 0 ;
7619
7720 for (; * re && * re != ')' ; re ++ ) {
7821 switch (* re ) {
7922 case '\\' :
8023 re ++ ;
24+ if (!* re ) return NULL ; // Trailing backslash
8125 if ((* re | 0x20 ) == 'd' || (* re | 0x20 ) == 's' || (* re | 0x20 ) == 'w' ) {
82- term = pc ;
83- EMIT (pc ++ , NamedClass );
84- EMIT (pc ++ , * re );
26+ term = PC ;
27+ EMIT (PC ++ , NamedClass );
28+ EMIT (PC ++ , * re );
8529 prog -> len ++ ;
8630 break ;
8731 }
8832 default :
89- term = pc ;
90- EMIT (pc ++ , Char );
91- EMIT (pc ++ , * re );
33+ term = PC ;
34+ EMIT (PC ++ , Char );
35+ EMIT (PC ++ , * re );
9236 prog -> len ++ ;
9337 break ;
9438 case '.' :
95- term = pc ;
96- EMIT (pc ++ , Any );
39+ term = PC ;
40+ EMIT (PC ++ , Any );
9741 prog -> len ++ ;
9842 break ;
9943 case '[' : {
10044 int cnt ;
101- term = pc ;
45+ term = PC ;
10246 re ++ ;
10347 if (* re == '^' ) {
104- EMIT (pc ++ , ClassNot );
48+ EMIT (PC ++ , ClassNot );
10549 re ++ ;
10650 } else {
107- EMIT (pc ++ , Class );
51+ EMIT (PC ++ , Class );
10852 }
109- pc ++ ; // Skip # of pair byte
53+ PC ++ ; // Skip # of pair byte
11054 prog -> len ++ ;
11155 for (cnt = 0 ; * re != ']' ; re ++ , cnt ++ ) {
11256 if (!* re ) return NULL ;
113- EMIT (pc ++ , * re );
57+ EMIT (PC ++ , * re );
11458 if (re [1 ] == '-' ) {
11559 re += 2 ;
11660 }
117- EMIT (pc ++ , * re );
61+ EMIT (PC ++ , * re );
11862 }
11963 EMIT (term + 1 , cnt );
12064 break ;
12165 }
12266 case '(' : {
123- term = pc ;
124- int sub = ++ prog -> sub ;
125-
126- EMIT (pc ++ , Save );
127- EMIT (pc ++ , 2 * sub );
128- prog -> len ++ ;
67+ term = PC ;
68+ int sub ;
69+ int capture = re [1 ] != '?' || re [2 ] != ':' ;
70+
71+ if (capture ) {
72+ sub = ++ prog -> sub ;
73+ EMIT (PC ++ , Save );
74+ EMIT (PC ++ , 2 * sub );
75+ prog -> len ++ ;
76+ } else {
77+ re += 2 ;
78+ }
12979
130- prog -> bytelen = pc ;
131- re = _compilecode (re + 1 , prog );
80+ re = _compilecode (re + 1 , prog , sizecode );
13281 if (re == NULL || * re != ')' ) return NULL ; // error, or no matching paren
133- pc = prog -> bytelen ;
13482
135- EMIT (pc ++ , Save );
136- EMIT (pc ++ , 2 * sub + 1 );
137- prog -> len ++ ;
83+ if (capture ) {
84+ EMIT (PC ++ , Save );
85+ EMIT (PC ++ , 2 * sub + 1 );
86+ prog -> len ++ ;
87+ }
13888
13989 break ;
14090 }
14191 case '?' :
142- if (pc == term ) return NULL ; // nothing to repeat
143- insert_code (code , term , 2 , & pc );
144- EMIT (term , Split );
145- EMIT (term + 1 , REL (term , pc ));
92+ if (PC == term ) return NULL ; // nothing to repeat
93+ INSERT_CODE (term , 2 , PC );
94+ if (re [1 ] == '?' ) {
95+ EMIT (term , RSplit );
96+ re ++ ;
97+ } else {
98+ EMIT (term , Split );
99+ }
100+ EMIT (term + 1 , REL (term , PC ));
146101 prog -> len ++ ;
102+ term = PC ;
147103 break ;
148104 case '*' :
149- if (pc == term ) return NULL ; // nothing to repeat
150- insert_code ( code , term , 2 , & pc );
151- EMIT (pc , Jmp );
152- EMIT (pc + 1 , REL (pc , term ));
153- pc += 2 ;
105+ if (PC == term ) return NULL ; // nothing to repeat
106+ INSERT_CODE ( term , 2 , PC );
107+ EMIT (PC , Jmp );
108+ EMIT (PC + 1 , REL (PC , term ));
109+ PC += 2 ;
154110 if (re [1 ] == '?' ) {
155111 EMIT (term , RSplit );
156112 re ++ ;
157113 } else {
158114 EMIT (term , Split );
159115 }
160- EMIT (term + 1 , REL (term , pc ));
116+ EMIT (term + 1 , REL (term , PC ));
161117 prog -> len += 2 ;
118+ term = PC ;
162119 break ;
163120 case '+' :
164- if (pc == term ) return NULL ; // nothing to repeat
121+ if (PC == term ) return NULL ; // nothing to repeat
165122 if (re [1 ] == '?' ) {
166- EMIT (pc , Split );
123+ EMIT (PC , Split );
167124 re ++ ;
168125 } else {
169- EMIT (pc , RSplit );
126+ EMIT (PC , RSplit );
170127 }
171- EMIT (pc + 1 , REL (pc , term ));
172- pc += 2 ;
128+ EMIT (PC + 1 , REL (PC , term ));
129+ PC += 2 ;
173130 prog -> len ++ ;
131+ term = PC ;
174132 break ;
175133 case '|' :
176134 if (alt_label ) {
177- EMIT (alt_label , REL (alt_label , pc ) + 1 );
135+ EMIT (alt_label , REL (alt_label , PC ) + 1 );
178136 }
179- insert_code ( code , start , 2 , & pc );
180- EMIT (pc ++ , Jmp );
181- alt_label = pc ++ ;
137+ INSERT_CODE ( start , 2 , PC );
138+ EMIT (PC ++ , Jmp );
139+ alt_label = PC ++ ;
182140 EMIT (start , Split );
183- EMIT (start + 1 , REL (start , pc ));
141+ EMIT (start + 1 , REL (start , PC ));
184142 prog -> len += 2 ;
143+ term = PC ;
185144 break ;
186145 case '^' :
187- EMIT (pc ++ , Bol );
146+ EMIT (PC ++ , Bol );
188147 prog -> len ++ ;
148+ term = PC ;
189149 break ;
190150 case '$' :
191- EMIT (pc ++ , Eol );
151+ EMIT (PC ++ , Eol );
192152 prog -> len ++ ;
153+ term = PC ;
193154 break ;
194155 }
195156 }
196157
197158 if (alt_label ) {
198- EMIT (alt_label , REL (alt_label , pc ) + 1 );
159+ EMIT (alt_label , REL (alt_label , PC ) + 1 );
199160 }
200- prog -> bytelen = pc ;
201161 return re ;
202162}
203163
164+ int re1_5_sizecode (const char * re )
165+ {
166+ ByteProg dummyprog = {
167+ // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
168+ .bytelen = 5 + NON_ANCHORED_PREFIX
169+ };
170+
171+ if (_compilecode (re , & dummyprog , /*sizecode*/ 1 ) == NULL ) return -1 ;
172+
173+ return dummyprog .bytelen ;
174+ }
175+
204176int re1_5_compilecode (ByteProg * prog , const char * re )
205177{
206178 prog -> len = 0 ;
@@ -221,7 +193,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
221193 prog -> insts [prog -> bytelen ++ ] = 0 ;
222194 prog -> len ++ ;
223195
224- re = _compilecode (re , prog );
196+ re = _compilecode (re , prog , /*sizecode*/ 0 );
225197 if (re == NULL || * re ) return 1 ;
226198
227199 prog -> insts [prog -> bytelen ++ ] = Save ;
@@ -234,25 +206,6 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
234206 return 0 ;
235207}
236208
237- void
238- cleanmarks (ByteProg * prog )
239- {
240- char * pc = prog -> insts ;
241- char * end = pc + prog -> bytelen ;
242- while (pc < end ) {
243- * pc &= 0x7f ;
244- switch (* pc ) {
245- case Jmp :
246- case Split :
247- case RSplit :
248- case Save :
249- case Char :
250- pc ++ ;
251- }
252- pc ++ ;
253- }
254- }
255-
256209#if 0
257210int main (int argc , char * argv [])
258211{
0 commit comments