Skip to content

Commit fe10a90

Browse files
committed
add a bad-tag-filter query for Python and JavaScript
1 parent b4963c7 commit fe10a90

19 files changed

Lines changed: 581 additions & 4 deletions

File tree

config/identical-files.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,5 +461,9 @@
461461
"ReDoS Polynomial Python/JS": [
462462
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
463463
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
464+
],
465+
"BadTagFilterQuery Python/JS": [
466+
"javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
467+
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll"
464468
]
465469
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
lgtm,codescanning
2+
* A new query, `js/bad-tag-filter`, has been added to the query suite,
3+
highlighting regular expressions that only match a subset of the HTML tags
4+
it is supposed to match.
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/**
2+
* Provides precicates for reasoning about bad tag filter vulnerabilities.
3+
*/
4+
5+
import performance.ReDoSUtil
6+
7+
/**
8+
* A module for determining if a regexp matches a given string.
9+
*/
10+
private module RegexpMatching {
11+
/**
12+
* A class to test whether a regular expression matches a string.
13+
* Override this class and extend `toTest` to configure which strings should be tested for acceptance by this regular expression.
14+
* The result can afterwards be read from the `matches` predicate.
15+
*/
16+
abstract class MatchedRegExp extends RegExpTerm {
17+
MatchedRegExp() { this.isRootTerm() }
18+
19+
/**
20+
* Holds if it should be tested whether this regular expression matches `str`.
21+
*
22+
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
23+
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
24+
* but if `ignorePrefix` is true, it will only match "foo".
25+
*/
26+
abstract predicate toTest(string str, boolean ignorePrefix);
27+
28+
/**
29+
* Gets a state a regular expression is in after matching the `i`th char in `str`.
30+
* The regular expression is modelled as a non-determistic finite automaton,
31+
* the regular expression can therefore be in multiple states after matching a character.
32+
*/
33+
private State getAState(int i, string str, boolean ignorePrefix) {
34+
i = -1 and
35+
this.toTest(str, ignorePrefix) and
36+
result.getRepr().getRootTerm() = this and
37+
isStartState(result)
38+
or
39+
exists(State prev |
40+
prev = getAState(i - 1, str, ignorePrefix) and
41+
deltaClosed(prev, getAnInputSymbolMatching(str.charAt(i)), result) and
42+
not (
43+
ignorePrefix = true and
44+
isStartState(prev) and
45+
isStartState(result)
46+
)
47+
)
48+
}
49+
50+
/**
51+
* Holds if `regexp` matches `str`.
52+
*/
53+
predicate matches(string str) {
54+
exists(State state | state = getAState(str.length() - 1, str, _) |
55+
epsilonSucc*(state) = Accept(_)
56+
)
57+
}
58+
}
59+
60+
/**
61+
* Holds if `state` is a start state.
62+
*/
63+
private predicate isStartState(State state) {
64+
state = mkMatch(any(RegExpRoot r)) and
65+
not exists(RegExpCaret car | car.getRootTerm() = state.getRepr().getRootTerm())
66+
or
67+
exists(RegExpCaret car | state = after(car))
68+
}
69+
}
70+
71+
/**
72+
* A class to test whether a regular expression matches certain HTML tags.
73+
*/
74+
class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
75+
HTMLMatchingRegExp() {
76+
// the regexp must mention "<" and ">" explicitly.
77+
forall(string angleBracket | angleBracket = ["<", ">"] |
78+
any(RegExpConstant term | term.getValue().regexpMatch(".*" + angleBracket + ".*"))
79+
.getRootTerm() = this
80+
)
81+
}
82+
83+
override predicate toTest(string str, boolean ignorePrefix) {
84+
ignorePrefix = true and
85+
str =
86+
[
87+
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
88+
"<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
89+
"<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
90+
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
91+
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
92+
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
93+
]
94+
}
95+
}
96+
97+
/**
98+
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
99+
*
100+
* When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::toTest`.
101+
*/
102+
predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
103+
regexp.matches("<!-- foo -->") and
104+
not regexp.matches("<!-- foo --!>") and
105+
not regexp.matches("<!- foo ->") and
106+
not regexp.matches("<foo>") and
107+
not regexp.matches("<script>") and
108+
msg = "This regular expression only matches --> and not --!> as a HTML comment end tag."
109+
or
110+
regexp.matches("<!-- foo -->") and
111+
not regexp.matches("<!-- foo\n -->") and
112+
not regexp.matches("<!- foo ->") and
113+
not regexp.matches("<foo>") and
114+
not regexp.matches("<script>") and
115+
msg = "This regular expression does not match comments containing newlines."
116+
or
117+
regexp.matches("<script>foo</script>") and
118+
regexp.matches("<script src=\"foo\"></script>") and
119+
not regexp.matches("<foo ></foo>") and
120+
(
121+
not regexp.matches("<script \n>foo</script>") and
122+
msg = "This regular expression matches <script></script>, but not <script \\n></script>"
123+
or
124+
not regexp.matches("<script >foo\n</script>") and
125+
msg = "This regular expression matches <script>foo</script>, but not <script >foo\\n</script>"
126+
)
127+
or
128+
regexp.matches("<script src=\"foo\"></script>") and
129+
not regexp.matches("<script src='foo'></script>") and
130+
not regexp.matches("<foo>") and
131+
msg = "This regular expression does not match script tags where the attribute uses single-quotes."
132+
or
133+
regexp.matches("<script src='foo'></script>") and
134+
not regexp.matches("<script src=\"foo\"></script>") and
135+
not regexp.matches("<foo>") and
136+
msg = "This regular expression does not match script tags where the attribute uses double-quotes."
137+
or
138+
regexp.matches("<script src='foo'></script>") and
139+
not regexp.matches("<script\tsrc='foo'></script>") and
140+
not regexp.matches("<foo>") and
141+
not regexp.matches("<foo src=\"foo\"></foo>") and
142+
msg = "This regular expression does not match script tags tabs are used between attributes."
143+
or
144+
regexp.matches("<script>foo</script>") and
145+
not RegExpFlags::isIgnoreCase(regexp) and
146+
not regexp.matches("<foo>") and
147+
not regexp.matches("<foo ></foo>") and
148+
(
149+
not regexp.matches("<SCRIPT>foo</SCRIPT>") and
150+
msg = "This regular expression does not match upper case <SCRIPT> tags."
151+
or
152+
not regexp.matches("<sCrIpT>foo</ScRiPt>") and
153+
regexp.matches("<SCRIPT>foo</SCRIPT>") and
154+
msg = "This regular expression does not match mixed case <sCrIpT> tags."
155+
)
156+
or
157+
regexp.matches("<script src=\"foo\"></script>") and
158+
not regexp.matches("<foo>") and
159+
not regexp.matches("<foo ></foo>") and
160+
(
161+
not regexp.matches("<script src=\"foo\">foo</script >") or
162+
not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") or
163+
not regexp.matches("<script src=\"foo\">foo</script\t\n bar>")
164+
) and
165+
msg =
166+
"This regular expression does not match script end tags containing spaces, tabs or newlines."
167+
}

javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
544544
/**
545545
* Gets a state the NFA may be in after matching `t`.
546546
*/
547-
private State after(RegExpTerm t) {
547+
State after(RegExpTerm t) {
548548
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
549549
or
550550
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) {
673673
/**
674674
* A state in the NFA.
675675
*/
676-
private newtype TState =
676+
newtype TState =
677677
/**
678678
* A state representing that the NFA is about to match a term.
679679
* `i` is used to index into multi-char literals.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
Parsing HTML using regular expressions is impossible, however it is possible to match
9+
single HTML tags. However, if the regexp is not written well it might be easy
10+
to circumvent the regexp, which can lead to XSS or other security issues.
11+
</p>
12+
<p>
13+
Many of these mistakes are caused by browsers being very forgiving when it comes to
14+
HTML parsing. Browser will often render invalid HTML with parser errors.
15+
The regular expressions matching tags must recognize tags containing these parser errors.
16+
</p>
17+
</overview>
18+
19+
<recommendation>
20+
<p>
21+
Use a (well-tested) sanitization or parser library if at all possible. These libraries are much more
22+
likely to handle corner cases correctly than a custom implementation.
23+
</p>
24+
25+
<p>
26+
Otherwise, make sure to look into the corner cases that exist in HTML.
27+
For example that HTML comments can end with <code>--!&gt;</code>, and that HTML tag names can contain
28+
upper case characters.
29+
</p>
30+
</recommendation>
31+
32+
<example>
33+
<p>
34+
For example, assume we want to write a function that filters out all <code>&lt;script&gt;</code> tags.
35+
Such a function might be written like below:
36+
</p>
37+
38+
<sample src="examples/BadTagFilter.js" />
39+
40+
<p>
41+
This sanitizer is very close to getting it right.
42+
However, browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
43+
This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by
44+
the function, but <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
45+
</p>
46+
</example>
47+
48+
<references>
49+
<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
50+
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
51+
<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
52+
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
53+
</references>
54+
</qhelp>
55+
56+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/**
2+
* @name Bad HTML filtering regexp
3+
* @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
4+
* @kind problem
5+
* @problem.severity warning
6+
* @security-severity 7.8
7+
* @precision high
8+
* @id js/bad-tag-filter
9+
* @tags correctness
10+
* security
11+
* external/cwe/cwe-116
12+
* external/cwe/cwe-020
13+
*/
14+
15+
import semmle.javascript.security.BadTagFilterQuery
16+
17+
from HTMLMatchingRegExp regexp, string msg
18+
where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
19+
select regexp, msg
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
function filterScript(html) {
2+
var scriptRegex = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
3+
var match;
4+
while ((match = scriptRegex.exec(html)) !== null) {
5+
html = html.replace(match[0], match[1]);
6+
}
7+
return html;
8+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
| tst.js:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression matches <script></script>, but not <script \\n></script> |
2+
| tst.js:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags containing spaces, tabs or newlines. |
3+
| tst.js:5:6:5:14 | <!--.*--> | This regular expression only matches --> and not --!> as a HTML comment end tag. |
4+
| tst.js:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
5+
| tst.js:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
6+
| tst.js:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>foo</script>, but not <script >foo\\n</script> |
7+
| tst.js:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
8+
| tst.js:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
9+
| tst.js:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags tabs are used between attributes. |
10+
| tst.js:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
11+
| tst.js:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
12+
| tst.js:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags containing spaces, tabs or newlines. |
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Security/CWE-116/BadTagFilter.ql
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
var filters = [
2+
/<script.*?>.*?<\/script>/i, // NOT OK - doesn't match newlines or `</script >`
3+
/<script.*?>.*?<\/script>/is, // NOT OK - doesn't match `</script >`
4+
/<script.*?>.*?<\/script[^>]*>/is, // OK
5+
/<!--.*-->/is, // NOT OK - misses --!> endings
6+
/<!--.*--!?>/is, // OK
7+
/<!--.*--!?>/i, // NOT OK, does not match newlines
8+
/<script.*?>(.|\s)*?<\/script[^>]*>/i, // NOT OK - doesn't match inside the script tag
9+
/<script[^>]*?>.*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the content
10+
/<script(\s|\w|=|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match single quotes for attribute values
11+
/<script(\s|\w|=|')*?>.*?<\/script[^>]*>/is, // NOT OK - does not match double quotes for attribute values
12+
/<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match tabs between attributes
13+
/<script.*?>.*?<\/script[^>]*>/s, // NOT OK - does not match uppercase SCRIPT tags
14+
/<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/s, // NOT OK - does not match mixed case script tags
15+
/<script[^>]*?>[\s\S]*?<\/script.*>/i, // NOT OK - doesn't match newlines in the end tag
16+
/<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, // OK
17+
]
18+
19+
doFilters(filters)

0 commit comments

Comments
 (0)