Skip to content

Commit 8666bc1

Browse files
committed
JS: Extract placeholders in HTML
1 parent b1ce3d1 commit 8666bc1

8 files changed

Lines changed: 948 additions & 25 deletions

File tree

javascript/extractor/src/com/semmle/js/extractor/HTMLExtractor.java

Lines changed: 104 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ public void handleElement(Element elt, HtmlPopulator.Context context) {
8181
source,
8282
content.getBegin(),
8383
isTypeScript,
84-
elt,
85-
context);
84+
context.getNodeLabel(elt));
8685
}
8786
}
8887
} else {
@@ -93,6 +92,14 @@ public void handleElement(Element elt, HtmlPopulator.Context context) {
9392
// ignore empty attributes
9493
if (attr.getValue() == null || attr.getValue().isEmpty()) continue;
9594

95+
extractTemplateTags(
96+
textualExtractor,
97+
scopeManager,
98+
attr.getSource(),
99+
attr.getBegin(),
100+
attr.getEnd(),
101+
() -> context.getNodeLabel(attr));
102+
96103
String source = attr.getValue();
97104
int valueStart = attr.getValueSegment().getBegin();
98105
if (JS_ATTRIBUTE.matcher(attr.getName()).matches()) {
@@ -104,8 +111,7 @@ public void handleElement(Element elt, HtmlPopulator.Context context) {
104111
source,
105112
valueStart,
106113
false /* isTypeScript */,
107-
attr,
108-
context);
114+
context.getNodeLabel(attr));
109115
} else if (isAngularTemplateAttributeName(attr.getName())) {
110116
// For an attribute *ngFor="let var of EXPR", start parsing at EXPR
111117
int offset = 0;
@@ -125,8 +131,7 @@ public void handleElement(Element elt, HtmlPopulator.Context context) {
125131
source,
126132
valueStart + offset,
127133
false /* isTypeScript */,
128-
attr,
129-
context);
134+
context.getNodeLabel(attr));
130135
} else if (source.startsWith("javascript:")) {
131136
source = source.substring(11);
132137
extractSnippet(
@@ -137,13 +142,19 @@ public void handleElement(Element elt, HtmlPopulator.Context context) {
137142
source,
138143
valueStart + 11,
139144
false /* isTypeScript */,
140-
attr,
141-
context);
145+
context.getNodeLabel(attr));
142146
}
143147
}
144148
}
145149
}
146150

151+
@Override
152+
public void handleText(
153+
Source src, int textBegin, int textEnd, Label parentLabel, boolean isCData) {
154+
extractTemplateTags(
155+
textualExtractor, scopeManager, src, textBegin, textEnd, () -> parentLabel);
156+
}
157+
147158
@Override
148159
public boolean shouldExtractAttributes(Element element) {
149160
Attributes attributes = element.getAttributes();
@@ -294,8 +305,7 @@ private void extractSnippet(
294305
String source,
295306
int offset,
296307
boolean isTypeScript,
297-
Segment parentHtmlNode,
298-
HtmlPopulator.Context context) {
308+
Label parentLabel) {
299309
TrapWriter trapWriter = textualExtractor.getTrapwriter();
300310
LocationManager locationManager = textualExtractor.getLocationManager();
301311
// JavaScript AST extraction does not currently support source maps, so just set
@@ -330,7 +340,7 @@ private void extractSnippet(
330340
scriptLocationManager.getFileLabel(),
331341
scriptLocationManager.getStartLine(),
332342
scriptLocationManager.getStartColumn());
333-
emitTopLevelXmlNodeBinding(parentHtmlNode, topLevelLabel, context, trapWriter);
343+
emitTopLevelXmlNodeBinding(parentLabel, topLevelLabel, trapWriter);
334344
// Note: LoC info is accounted for later, so not added here.
335345
return;
336346
}
@@ -347,7 +357,7 @@ private void extractSnippet(
347357
Pair<Label, LoCInfo> result = extractor.extract(tx, source, toplevelKind, scopeManager);
348358
Label toplevelLabel = result.fst();
349359
if (toplevelLabel != null) { // can be null when script ends up being parsed as JSON
350-
emitTopLevelXmlNodeBinding(parentHtmlNode, toplevelLabel, context, trapWriter);
360+
emitTopLevelXmlNodeBinding(parentLabel, toplevelLabel, trapWriter);
351361
}
352362
locInfo.add(result.snd());
353363
} catch (ParseError e) {
@@ -356,8 +366,88 @@ private void extractSnippet(
356366
}
357367
}
358368

359-
private void emitTopLevelXmlNodeBinding(Segment parentHtmlNode, Label topLevelLabel, HtmlPopulator.Context context, TrapWriter writer) {
360-
Label htmlNodeLabel = context.getNodeLabel(parentHtmlNode);
369+
private void emitTopLevelXmlNodeBinding(
370+
Label htmlNodeLabel, Label topLevelLabel, TrapWriter writer) {
361371
writer.addTuple("toplevel_parent_xml_node", topLevelLabel, htmlNodeLabel);
362372
}
373+
374+
// Regex fragments for the supported placeholder syntaxes; the trailing comment on each line shows
// an example tag. Every fragment contains exactly one capture group, which holds the tag body
// (matched lazily, so a tag ends at the first closing delimiter).
// The negative lookaheads keep the "{{" form from matching the start of a "{{{" tag, and reject
// "{%" followed by ">" and "<%" followed by one of % < > } as tag openers.
private static final String MUSTACHE_TAG_DOUBLE = "\\{\\{(?!\\{)(.*?)\\}\\}"; // {{ x }}
375+
private static final String MUSTACHE_TAG_TRIPLE = "\\{\\{\\{(.*?)\\}\\}\\}"; // {{{ x }}}
376+
private static final String MUSTACHE_TAG_PERCENT = "\\{%(?!>)(.*?)%\\}"; // {% x %}
377+
private static final String EJS_TAG = "<%(?![%<>}])[-=]?(.*?)[_-]?%>"; // <% x %>
378+
379+
/** Pattern for a template tag whose contents should be parsed as an expression */
380+
private static final Pattern TEMPLATE_EXPR_OPENING_TAG = Pattern.compile("^(?:\\{\\{\\{?|<%[-=])"); // {{, {{{, <%=, <%-
381+
382+
// Combined pattern built from the four fragments above. DOTALL makes "." match line terminators,
// so a tag body may span several lines.
// NOTE(review): StringUtil.glue presumably joins the fragments with "|" into one alternation --
// confirm against StringUtil.
private static final Pattern TEMPLATE_TAGS =
383+
Pattern.compile(
384+
StringUtil.glue(
385+
"|", MUSTACHE_TAG_DOUBLE, MUSTACHE_TAG_TRIPLE, MUSTACHE_TAG_PERCENT, EJS_TAG),
386+
Pattern.DOTALL);
387+
388+
private void extractTemplateTags(
389+
TextualExtractor textualExtractor,
390+
ScopeManager scopeManager,
391+
Source root,
392+
int start,
393+
int end,
394+
Supplier<Label> parentLabel) {
395+
if (isEmbedded) return; // Do not extract template tags for HTML snippets embedded in a JS file
396+
397+
LocationManager locationManager = textualExtractor.getLocationManager();
398+
TrapWriter trapwriter = textualExtractor.getTrapwriter();
399+
Matcher m = TEMPLATE_TAGS.matcher(textualExtractor.getSource()).region(start, end);
400+
while (m.find()) {
401+
int startOffset = m.start();
402+
int endOffset = m.end();
403+
if (endOffset - startOffset > 10_000) {
404+
// Do not extract long template strings as they're likely to be FP matches and
405+
// unlikely to be parsed correctly.
406+
continue;
407+
}
408+
409+
// Emit an entity for the template tag
410+
Label lbl = trapwriter.freshLabel();
411+
String rawText = m.group();
412+
trapwriter.addTuple("template_placeholder_tag_info", lbl, parentLabel.get(), rawText);
413+
414+
// Emit location
415+
Position startPos = textualExtractor.getSourceMap().getStart(startOffset);
416+
Position endPos = textualExtractor.getSourceMap().getEnd(endOffset - 1);
417+
int endColumn = endPos.getColumn() - 1; // Convert to inclusive end position (still 1-based)
418+
locationManager.emitFileLocation(
419+
lbl, startPos.getLine(), startPos.getColumn(), endPos.getLine(), endColumn);
420+
421+
// Parse the contents as a template expression, if the delimiter expects an expression.
422+
Matcher delimMatcher = TEMPLATE_EXPR_OPENING_TAG.matcher(rawText);
423+
if (delimMatcher.find()) {
424+
// The body of the template tag is stored in the first capture group of each pattern
425+
int bodyGroup = getNonNullCaptureGroup(m);
426+
if (bodyGroup != -1) {
427+
extractSnippet(
428+
TopLevelKind.ANGULAR_TEMPLATE,
429+
config.withSourceType(SourceType.ANGULAR_TEMPLATE),
430+
scopeManager,
431+
textualExtractor,
432+
m.group(bodyGroup),
433+
m.start(bodyGroup),
434+
false /* isTypeScript */,
435+
lbl);
436+
}
437+
}
438+
}
439+
}
440+
441+
/**
442+
* Returns the index of the first capture group that captured something
443+
* (apart from group zero which is the whole match).
444+
*/
445+
private static int getNonNullCaptureGroup(Matcher m) {
446+
for (int i = 1; i <= m.groupCount(); ++i) {
447+
if (m.group(i) != null) {
448+
return i;
449+
}
450+
}
451+
return -1;
452+
}
363453
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<body>
4+
<h1>{{title}}</h1>
5+
{{{subtitle_html}}}
6+
<p><%- body_html %></p>
7+
<p><%= footer %></p>
8+
<script>
9+
var data1 = {{{ user_data1 }}};
10+
var data2 = {{ user_data2 | json | safe }};
11+
var data3 = <%- user_data3 %>;
12+
</script>
13+
</body>
14+
</html>

0 commit comments

Comments
 (0)