Skip to content

Commit d3302c3

Browse files
committed
JS: Fix offsets in regexes parsed from strings with escapes
1 parent 2901b5e commit d3302c3

5 files changed

Lines changed: 98 additions & 30 deletions

File tree

javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -517,13 +517,81 @@ public Label visit(Literal nd, Context c) {
517517

518518
trapwriter.addTuple("literals", valueString, source, key);
519519
if (nd.isRegExp()) {
520-
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), nd, false);
520+
OffsetTranslation offsets = new OffsetTranslation();
521+
offsets.set(0, 1); // skip the initial '/'
522+
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
521523
} else if (nd.isStringLiteral()) {
522-
regexpExtractor.extract(valueString, nd, true);
524+
regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
523525
}
524526
return key;
525527
}
526528

529+
/**
530+
* Builds a translation from offsets in a string value back to its original raw literal text
531+
* (including quotes).
532+
*
533+
* <p>This is not a 1:1 mapping since escape sequences take up more characters in the raw
534+
* literal than in the resulting string value. This mapping includes the surrounding quotes.
535+
*
536+
* <p>For example: for the raw literal value <code>'x\.y'</code> (quotes included), the <code>y
537+
* </code> at index 2 in <code>x.y</code> maps to index 4 in the raw literal.
538+
*/
539+
public OffsetTranslation makeStringLiteralOffsets(String rawLiteral) {
540+
OffsetTranslation offsets = new OffsetTranslation();
541+
offsets.set(0, 1); // Skip the initial quote
542+
// Invariant: raw character at 'pos' corresponds to decoded character at 'pos - delta'
543+
int pos = 1;
544+
int delta = 1;
545+
while (pos < rawLiteral.length() - 1) {
546+
if (rawLiteral.charAt(pos) != '\\') {
547+
++pos;
548+
continue;
549+
}
550+
final int length; // Length of the escape sequence, including slash.
551+
int outputLength = 1; // Number of characters the sequence expands to.
552+
char ch = rawLiteral.charAt(pos + 1);
553+
if ('0' <= ch && ch <= '7') {
554+
// Octal escape: \NNN (always measured as three digits, so shorter forms such as \0 are over-counted)
555+
length = 4;
556+
} else if (ch == 'x') {
557+
// Hex escape: \xNN
558+
length = 4;
559+
} else if (ch == 'u' && pos + 2 < rawLiteral.length()) {
560+
if (rawLiteral.charAt(pos + 2) == '{') {
561+
// Variable-length unicode escape: \u{N...}
562+
// Scan for the ending '}'
563+
int firstDigit = pos + 3;
564+
int end = firstDigit;
565+
while (end < rawLiteral.length() && rawLiteral.charAt(end) != '}') {
566+
++end;
567+
}
568+
int numDigits = end - firstDigit;
569+
if (numDigits > 4) {
570+
outputLength = 2; // Encoded as a surrogate pair
571+
}
572+
++end; // Include '}' character
573+
length = end - pos;
574+
} else {
575+
// Fixed-length unicode escape: \uNNNN
576+
length = 6;
577+
}
578+
} else {
579+
// Simple escape: \n or similar.
580+
length = 2;
581+
}
582+
int end = pos + length;
583+
if (end > rawLiteral.length()) {
584+
end = rawLiteral.length();
585+
}
586+
int outputPos = pos - delta;
587+
// Map the next character to the adjusted offset.
588+
offsets.set(outputPos + outputLength, end);
589+
delta += length - outputLength;
590+
pos = end;
591+
}
592+
return offsets;
593+
}
594+
527595
@Override
528596
public Label visit(MemberExpression nd, Context c) {
529597
Label key = super.visit(nd, c);

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,11 @@ private Label extractTerm(RegExpTerm term, Label parent, int idx) {
120120
}
121121

122122
public void emitLocation(SourceElement term, Label lbl) {
123+
int col = literalStart.getColumn();
123124
int sl, sc, el, ec;
124125
sl = el = literalStart.getLine();
125-
// the offset table accounts for the position on the line and for skipping the initial '/'
126-
sc = offsets.get(term.getLoc().getStart().getColumn());
127-
ec = offsets.get(term.getLoc().getEnd().getColumn());
126+
sc = col + offsets.get(term.getLoc().getStart().getColumn());
127+
ec = col + offsets.get(term.getLoc().getEnd().getColumn());
128128
sc += 1; // convert to 1-based
129129
ec += 1; // convert to 1-based
130130
ec -= 1; // convert to inclusive
@@ -346,16 +346,16 @@ public void visit(CharacterClassRange nd) {
346346
}
347347
}
348348

349-
public void extract(String src, Node parent, boolean isSpeculativeParsing) {
349+
public void extract(
350+
String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
350351
Result res = parser.parse(src);
351352

352353
if (isSpeculativeParsing && res.getErrors().size() > 0) {
353354
return;
354355
}
355356

356357
this.literalStart = parent.getLoc().getStart();
357-
offsets = new OffsetTranslation();
358-
offsets.set(0, literalStart.getColumn() + 1); // add 1 to skip the leading '/' or quote
358+
this.offsets = offsets;
359359
RegExpTerm ast = res.getAST();
360360
new V().visit(ast, trapwriter.localID(parent), 0);
361361

javascript/extractor/tests/encoding/output/trap/surrogates.js.trap

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,8 @@ exprContainers(#20075,#20001)
221221
literals("?","'\ud800'",#20075)
222222
#20076=*
223223
regexpterm(#20076,14,#20075,0,"?")
224-
#20077=@"loc,{#10000},2,2,2,2"
225-
locations_default(#20077,#10000,2,2,2,2)
224+
#20077=@"loc,{#10000},2,2,2,7"
225+
locations_default(#20077,#10000,2,2,2,7)
226226
hasLocation(#20076,#20077)
227227
regexpConstValue(#20076,"?")
228228
#20078=*
@@ -237,8 +237,8 @@ exprContainers(#20079,#20001)
237237
literals("foo?","'foo\ud800'",#20079)
238238
#20080=*
239239
regexpterm(#20080,14,#20079,0,"foo?")
240-
#20081=@"loc,{#10000},3,2,3,5"
241-
locations_default(#20081,#10000,3,2,3,5)
240+
#20081=@"loc,{#10000},3,2,3,10"
241+
locations_default(#20081,#10000,3,2,3,10)
242242
hasLocation(#20080,#20081)
243243
regexpConstValue(#20080,"foo?")
244244
#20082=*
@@ -253,8 +253,8 @@ exprContainers(#20083,#20001)
253253
literals("?bar","'\ud800bar'",#20083)
254254
#20084=*
255255
regexpterm(#20084,14,#20083,0,"?bar")
256-
#20085=@"loc,{#10000},4,2,4,5"
257-
locations_default(#20085,#10000,4,2,4,5)
256+
#20085=@"loc,{#10000},4,2,4,10"
257+
locations_default(#20085,#10000,4,2,4,10)
258258
hasLocation(#20084,#20085)
259259
regexpConstValue(#20084,"?bar")
260260
#20086=*
@@ -269,8 +269,8 @@ exprContainers(#20087,#20001)
269269
literals("foo?bar","'foo\ud800bar'",#20087)
270270
#20088=*
271271
regexpterm(#20088,14,#20087,0,"foo?bar")
272-
#20089=@"loc,{#10000},5,2,5,8"
273-
locations_default(#20089,#10000,5,2,5,8)
272+
#20089=@"loc,{#10000},5,2,5,13"
273+
locations_default(#20089,#10000,5,2,5,13)
274274
hasLocation(#20088,#20089)
275275
regexpConstValue(#20088,"foo?bar")
276276
#20090=*
@@ -388,8 +388,8 @@ exprContainers(#20121,#20001)
388388
literals("??","'\udc00\ud800'",#20121)
389389
#20122=*
390390
regexpterm(#20122,14,#20121,0,"??")
391-
#20123=@"loc,{#10000},11,2,11,3"
392-
locations_default(#20123,#10000,11,2,11,3)
391+
#20123=@"loc,{#10000},11,2,11,13"
392+
locations_default(#20123,#10000,11,2,11,13)
393393
hasLocation(#20122,#20123)
394394
regexpConstValue(#20122,"??")
395395
#20124=*
@@ -404,8 +404,8 @@ exprContainers(#20125,#20001)
404404
literals("𝌆","'\uD834\uDF06'",#20125)
405405
#20126=*
406406
regexpterm(#20126,14,#20125,0,"𝌆")
407-
#20127=@"loc,{#10000},13,2,13,3"
408-
locations_default(#20127,#10000,13,2,13,3)
407+
#20127=@"loc,{#10000},13,2,13,13"
408+
locations_default(#20127,#10000,13,2,13,13)
409409
hasLocation(#20126,#20127)
410410
regexpConstValue(#20126,"𝌆")
411411
#20128=*

javascript/extractor/tests/exprs/output/trap/primaries.js.trap

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -988,8 +988,8 @@ literals("'what?'
988988
#20353=*
989989
regexpterm(#20353,1,#20352,0,"'what?'
990990
")
991-
#20354=@"loc,{#10000},12,2,12,9"
992-
locations_default(#20354,#10000,12,2,12,9)
991+
#20354=@"loc,{#10000},12,2,12,12"
992+
locations_default(#20354,#10000,12,2,12,12)
993993
hasLocation(#20353,#20354)
994994
#20355=*
995995
regexpterm(#20355,14,#20353,0,"'wha")
@@ -1012,8 +1012,8 @@ regexpConstValue(#20359,"t")
10121012
#20361=*
10131013
regexpterm(#20361,14,#20353,2,"'
10141014
")
1015-
#20362=@"loc,{#10000},12,8,12,9"
1016-
locations_default(#20362,#10000,12,8,12,9)
1015+
#20362=@"loc,{#10000},12,8,12,12"
1016+
locations_default(#20362,#10000,12,8,12,12)
10171017
hasLocation(#20361,#20362)
10181018
regexpConstValue(#20361,"'
10191019
")
@@ -1031,8 +1031,8 @@ literals("""why?""
10311031
#20365=*
10321032
regexpterm(#20365,1,#20364,0,"""why?""
10331033
")
1034-
#20366=@"loc,{#10000},13,2,13,8"
1035-
locations_default(#20366,#10000,13,2,13,8)
1034+
#20366=@"loc,{#10000},13,2,13,9"
1035+
locations_default(#20366,#10000,13,2,13,9)
10361036
hasLocation(#20365,#20366)
10371037
#20367=*
10381038
regexpterm(#20367,14,#20365,0,"""wh")
@@ -1055,8 +1055,8 @@ regexpConstValue(#20371,"y")
10551055
#20373=*
10561056
regexpterm(#20373,14,#20365,2,"""
10571057
")
1058-
#20374=@"loc,{#10000},13,7,13,8"
1059-
locations_default(#20374,#10000,13,7,13,8)
1058+
#20374=@"loc,{#10000},13,7,13,9"
1059+
locations_default(#20374,#10000,13,7,13,9)
10601060
hasLocation(#20373,#20374)
10611061
regexpConstValue(#20373,"""
10621062
")

javascript/extractor/tests/html/output/trap/tst.html.trap

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,8 @@ exprContainers(#20090,#20074)
302302
literals("I said don't click!","'I said don\'t click!'",#20090)
303303
#20091=*
304304
regexpterm(#20091,14,#20090,0,"I said don't click!")
305-
#20092=@"loc,{#10000},12,30,12,48"
306-
locations_default(#20092,#10000,12,30,12,48)
305+
#20092=@"loc,{#10000},12,30,12,49"
306+
locations_default(#20092,#10000,12,30,12,49)
307307
hasLocation(#20091,#20092)
308308
regexpConstValue(#20091,"I said don't click!")
309309
#20093=*

0 commit comments

Comments
 (0)