@@ -121,8 +121,57 @@ deprecated string mode_from_mode_object(Value obj) {
121121abstract class RegexString extends Expr {
/** Restricts this class to expressions that are either `Bytes` or `Unicode`. */
RegexString() {
  this instanceof Bytes
  or
  this instanceof Unicode
}
123123
/** Gets the text of this expression, viewed as either a `Bytes` or a `Unicode`. */
override string toString() {
  this.(Unicode).getText() = result
  or
  this.(Bytes).getText() = result
}
129+
/**
 * Gets `true` if the unescaped `[` at position `pos` actually marks the start of a
 * character set, and `false` if it does not.
 * There is no result when `pos` is not the position of an unescaped `[`.
 */
boolean char_set_start(int pos) {
  exists(int index |
    // `pos` holds the `index`th unescaped bracket, and that bracket is a `[`
    this.char_set_delimiter(index, pos) = true and
    (
      // if a '[' is first in the string (among brackets), it starts a char set
      index = 1 and result = true
      or
      // if the previous bracket is not a closing bracket, this '[' does not start a char set
      index > 1 and
      not this.char_set_delimiter(index - 1, _) = false and
      result = false
      or
      exists(int p1 |
        // if it is preceded by a closing bracket, it starts a char set...
        this.char_set_delimiter(index - 1, p1) = false and
        if
          // ...unless that closing bracket comes directly after an opening bracket
          // (optionally with a `^` in between)
          exists(int p2 |
            p1 = p2 + 1
            or
            this.getChar(p2 + 1) = "^" and
            p1 = p2 + 2
          |
            this.char_set_delimiter(index - 2, p2) = true
          )
        then
          // in that case this '[' starts a char set exactly when the earlier '[' did not
          exists(int p2 | this.char_set_delimiter(index - 2, p2) = true |
            result = this.char_set_start(p2).booleanNot()
          )
        else result = true
      )
    )
  )
}
161+
/**
 * Gets `true` if the `index`th unescaped square bracket of this regex (ordered by
 * position), found at `pos`, is a `[`, and `false` if it is a `]`.
 */
boolean char_set_delimiter(int index, int pos) {
  pos = rank[index](int bracket_pos | this.nonEscapedCharAt(bracket_pos) in ["[", "]"]) and
  exists(string bracket | bracket = this.nonEscapedCharAt(pos) |
    bracket = "[" and result = true
    or
    bracket = "]" and result = false
  )
}
171+
172+ /** Holds if a character set starts between `start` and `end`. */
124173 predicate char_set_start ( int start , int end ) {
125- this .nonEscapedCharAt ( start ) = "[" and
174+ this .char_set_start ( start ) = true and
126175 (
127176 this .getChar ( start + 1 ) = "^" and end = start + 2
128177 or
@@ -143,23 +192,80 @@ abstract class RegexString extends Expr {
143192 )
144193 }
145194
/**
 * Holds if the `index`th token (ordered by start position) of the character set
 * starting at `charset_start` is found between `token_start` and `token_end`.
 * This is an indexed version of `char_set_token/3`.
 */
private predicate char_set_token(int charset_start, int index, int token_start, int token_end) {
  this.char_set_token(charset_start, token_start, token_end) and
  token_start =
    rank[index](int tok_start, int tok_end |
      this.char_set_token(charset_start, tok_start, tok_end)
    |
      tok_start
    )
}
201+
/**
 * Holds if a token of the character set starting at `charset_start` is found between
 * `start` and `end`. A token is either an escape sequence or a single unescaped
 * character (which may be a `-`).
 *
 * The first token begins where `char_set_start/2` places the beginning of the set's
 * contents; every later token begins where an earlier token ends.
 */
private predicate char_set_token(int charset_start, int start, int end) {
  // an escape sequence, as the first token or as a later one
  (
    this.char_set_start(charset_start, start)
    or
    this.char_set_token(charset_start, _, start)
  ) and
  this.escapedCharacter(start, end)
  or
  // a single unescaped character; a `]` is only accepted as the first token
  exists(this.nonEscapedCharAt(start)) and
  end = start + 1 and
  (
    this.char_set_start(charset_start, start)
    or
    this.char_set_token(charset_start, _, start) and
    not this.getChar(start) = "]"
  )
}
220+
/**
 * Holds if the character set starting at `charset_start` has a child between
 * `start` and `end`, where a child is either a complete character range or a
 * single token that is not covered by any range of that set.
 */
predicate char_set_child(int charset_start, int start, int end) {
  // a whole range is one child
  this.charRange(charset_start, start, _, _, end)
  or
  // a token is a child unless some range of the same set spans it
  this.char_set_token(charset_start, start, end) and
  not exists(int lo, int hi | this.charRange(charset_start, lo, _, _, hi) |
    lo <= start and end <= hi
  )
}
235+
/**
 * Holds if the character set starting at `charset_start` contains a character range
 * whose lower bound is the token between `start` and `lower_end` and whose upper
 * bound is the token between `upper_start` and `end`.
 *
 * The upper bound is a token that `charRangeEnd` marks as `true`; the lower bound is
 * the token two positions before it.
 */
predicate charRange(int charset_start, int start, int lower_end, int upper_start, int end) {
  exists(int upper_index, int lower_index |
    lower_index = upper_index - 2 and
    this.char_set_token(charset_start, lower_index, start, lower_end) and
    this.char_set_token(charset_start, upper_index, upper_start, end) and
    this.charRangeEnd(charset_start, upper_index) = true
  )
}
248+
/**
 * Gets `true` if the `index`th token of the character set starting at
 * `charset_start` is the upper bound of a character range, and `false` otherwise.
 */
private boolean charRangeEnd(int charset_start, int index) {
  this.char_set_token(charset_start, index, _, _) and
  (
    // the first two tokens never end a range
    index in [1, 2] and result = false
    or
    index > 2 and
    (
      if
        // the token just before this one is an unescaped `-`
        exists(int dash_pos |
          this.char_set_token(charset_start, index - 1, dash_pos, _) and
          this.nonEscapedCharAt(dash_pos) = "-"
        )
      then
        // ends a range only if neither of the two preceding tokens ends one
        result =
          this.charRangeEnd(charset_start, index - 2)
              .booleanOr(this.charRangeEnd(charset_start, index - 1))
              .booleanNot()
      else result = false
    )
  )
}
165271
@@ -184,14 +290,14 @@ abstract class RegexString extends Expr {
184290
/**
 * Gets the character at offset `i` of this regex's text, provided that `i` does not
 * lie inside any span reported by `escapedCharacter`.
 */
string nonEscapedCharAt(int i) {
  not exists(int esc_start, int esc_end | this.escapedCharacter(esc_start, esc_end) |
    esc_start <= i and i < esc_end
  ) and
  result = this.getText().charAt(i)
}
189295
/** Holds if there is an unescaped `|` at offset `i`. */
private predicate isOptionDivider(int i) {
  "|" = this.nonEscapedCharAt(i)
}
191297
/** Holds if there is an unescaped `)` at offset `i` that is not inside a character set. */
private predicate isGroupEnd(int i) {
  not this.inCharSet(i) and
  this.nonEscapedCharAt(i) = ")"
}
193299
/** Holds if there is an unescaped `(` at offset `i` that is not inside a character set. */
private predicate isGroupStart(int i) {
  not this.inCharSet(i) and
  this.nonEscapedCharAt(i) = "("
}
195301
196302 predicate failedToParse ( int i ) {
197303 exists ( this .getChar ( i ) ) and
@@ -219,14 +325,18 @@ abstract class RegexString extends Expr {
219325 */
220326 predicate escapedCharacter ( int start , int end ) {
221327 this .escapingChar ( start ) and
222- not exists ( this .getText ( ) . substring ( start + 1 , end + 1 ) . toInt ( ) ) and
328+ not this .numbered_backreference ( start , _ , _ ) and
223329 (
224330 // hex value \xhh
225331 this .getChar ( start + 1 ) = "x" and end = start + 4
226332 or
227333 // octal value \ooo
228334 end in [ start + 2 .. start + 4 ] and
229- exists ( this .getText ( ) .substring ( start + 1 , end ) .toInt ( ) )
335+ this .getText ( ) .substring ( start + 1 , end ) .toInt ( ) >= 0 and
336+ not (
337+ end < start + 4 and
338+ exists ( this .getText ( ) .substring ( start + 1 , end + 1 ) .toInt ( ) )
339+ )
230340 or
231341 // 16-bit hex value \uhhhh
232342 this .getChar ( start + 1 ) = "u" and end = start + 6
@@ -238,11 +348,13 @@ abstract class RegexString extends Expr {
238348 or
239349 // escape not handled above, update when adding a new case
240350 not this .getChar ( start + 1 ) in [ "x" , "u" , "U" , "N" ] and
351+ not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
241352 end = start + 2
242353 )
243354 }
244355
/**
 * Holds if `index` is inside a character set, that is, it lies in
 * `[x + 1 .. y - 2]` for some span `x`, `y` reported by `charSet`.
 */
predicate inCharSet(int index) {
  exists(int set_start, int set_end | this.charSet(set_start, set_end) |
    index in [set_start + 1 .. set_end - 2]
  )
}
248360
@@ -262,7 +374,7 @@ abstract class RegexString extends Expr {
262374 or
263375 start = z - 2
264376 or
265- start > y and start < z - 2 and not c = "-"
377+ start > y and start < z - 2 and not this . charRange ( _ , _ , start , end , _ )
266378 )
267379 or
268380 not this .inCharSet ( start ) and
@@ -281,7 +393,8 @@ abstract class RegexString extends Expr {
281393 or
282394 this .escapedCharacter ( start , end )
283395 ) and
284- not exists ( int x , int y | this .group_start ( x , y ) and x <= start and y >= end )
396+ not exists ( int x , int y | this .group_start ( x , y ) and x <= start and y >= end ) and
397+ not exists ( int x , int y | this .backreference ( x , y ) and x <= start and y >= end )
285398 }
286399
287400 predicate normalCharacter ( int start , int end ) {
@@ -326,12 +439,13 @@ abstract class RegexString extends Expr {
326439 or
327440 this .negativeAssertionGroup ( start , end )
328441 or
329- positiveLookaheadAssertionGroup ( start , end )
442+ this . positiveLookaheadAssertionGroup ( start , end )
330443 or
331444 this .positiveLookbehindAssertionGroup ( start , end )
332445 }
333446
334- private predicate emptyGroup ( int start , int end ) {
447+ /** Holds if an empty group is found between `start` and `end`. */
448+ predicate emptyGroup ( int start , int end ) {
335449 exists ( int endm1 | end = endm1 + 1 |
336450 this .group_start ( start , endm1 ) and
337451 this .isGroupEnd ( endm1 )
@@ -364,13 +478,29 @@ abstract class RegexString extends Expr {
364478 )
365479 }
366480
367- private predicate positiveLookaheadAssertionGroup ( int start , int end ) {
/**
 * Holds if a negative lookahead group is found between `start` and `end`:
 * `negative_lookahead_assertion_start` matches at `start` and `groupContents`
 * spans from there to `end`.
 */
predicate negativeLookaheadAssertionGroup(int start, int end) {
  exists(int contents_start |
    this.negative_lookahead_assertion_start(start, contents_start) and
    this.groupContents(start, end, contents_start, _)
  )
}
487+
/**
 * Holds if a negative lookbehind group is found between `start` and `end`:
 * `negative_lookbehind_assertion_start` matches at `start` and `groupContents`
 * spans from there to `end`.
 */
predicate negativeLookbehindAssertionGroup(int start, int end) {
  exists(int contents_start |
    this.negative_lookbehind_assertion_start(start, contents_start) and
    this.groupContents(start, end, contents_start, _)
  )
}
494+
/**
 * Holds if a positive lookahead group is found between `start` and `end`:
 * `lookahead_assertion_start` matches at `start` and `groupContents` spans
 * from there to `end`.
 */
predicate positiveLookaheadAssertionGroup(int start, int end) {
  exists(int contents_start |
    this.lookahead_assertion_start(start, contents_start) and
    this.groupContents(start, end, contents_start, _)
  )
}
372501
373- private predicate positiveLookbehindAssertionGroup ( int start , int end ) {
502+ /** Holds if a positive lookbehind is found between `start` and `end` */
503+ predicate positiveLookbehindAssertionGroup ( int start , int end ) {
374504 exists ( int in_start | this .lookbehind_assertion_start ( start , in_start ) |
375505 this .groupContents ( start , end , in_start , _)
376506 )
@@ -429,6 +559,8 @@ abstract class RegexString extends Expr {
429559 this .getChar ( start + 1 ) = "?" and
430560 this .getChar ( start + 2 ) = "P" and
431561 this .getChar ( start + 3 ) = "=" and
562+ // Should this be looking for unescaped ")"?
563+ // TODO: test this
432564 end = min ( int i | i > start + 4 and this .getChar ( i ) = "?" )
433565 }
434566
@@ -519,6 +651,7 @@ abstract class RegexString extends Expr {
519651
520652 private predicate numbered_backreference ( int start , int end , int value ) {
521653 this .escapingChar ( start ) and
654+ not this .getChar ( start + 1 ) = "0" and
522655 exists ( string text , string svalue , int len |
523656 end = start + len and
524657 text = this .getText ( ) and
@@ -527,7 +660,7 @@ abstract class RegexString extends Expr {
527660 svalue = text .substring ( start + 1 , start + len ) and
528661 value = svalue .toInt ( ) and
529662 not exists ( text .substring ( start + 1 , start + len + 1 ) .toInt ( ) ) and
530- value != 0
663+ value > 0
531664 )
532665 }
533666
@@ -551,6 +684,8 @@ abstract class RegexString extends Expr {
551684 this .group ( start , end )
552685 or
553686 this .charSet ( start , end )
687+ or
688+ this .backreference ( start , end )
554689 }
555690
556691 private predicate qualifier ( int start , int end , boolean maybe_empty ) {
0 commit comments