github
diff --git a/‎config/identical-files.json‎
Lines changed: 4 additions & 0 deletions b/‎config/identical-files.json‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎javascript/change-notes/2021-08-26-bad-tag-filter.md‎
Lines changed: 4 additions & 0 deletions b/‎javascript/change-notes/2021-08-26-bad-tag-filter.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll‎
Lines changed: 167 additions & 0 deletions b/‎javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll‎
Lines changed: 167 additions & 0 deletions
diff --git a/‎javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll‎
Lines changed: 2 additions & 2 deletions b/‎javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎javascript/ql/src/Security/CWE-116/BadTagFilter.qhelp‎
Lines changed: 56 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-116/BadTagFilter.qhelp‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-116/BadTagFilter.ql‎
Lines changed: 19 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-116/BadTagFilter.ql‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-116/examples/BadTagFilter.js‎
Lines changed: 8 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-116/examples/BadTagFilter.js‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected‎
Lines changed: 12 additions & 0 deletions b/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref‎
Lines changed: 1 addition & 0 deletions b/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js‎
Lines changed: 19 additions & 0 deletions b/‎javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js‎
Lines changed: 19 additions & 0 deletions
@@ -461,5 +461,9 @@
   "ReDoS Polynomial Python/JS": [
     "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
     "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
+  ],
+  "BadTagFilterQuery Python/JS": [
+    "javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
+    "python/ql/lib/semmle/python/security/BadTagFilterQuery.qll"
   ]
 }
@@ -0,0 +1,4 @@
+lgtm,codescanning
+* A new query, `js/bad-tag-filter`, has been added to the query suite,
+  highlighting regular expressions that only match a subset of the HTML tags
+  they are supposed to match.
@@ -0,0 +1,167 @@
+/**
+ * Provides predicates for reasoning about bad tag filter vulnerabilities.
+ */
+
+import performance.ReDoSUtil
+
+/**
+ * A module for determining if a regexp matches a given string.
+ */
+private module RegexpMatching {
+  /**
+   * A class to test whether a regular expression matches a string.
+   * Extend this class and override `toTest` to configure which strings should be tested for acceptance by this regular expression.
+   * The result can afterwards be read from the `matches` predicate.
+   */
+  abstract class MatchedRegExp extends RegExpTerm {
+    MatchedRegExp() { this.isRootTerm() }
+
+    /**
+     * Holds if it should be tested whether this regular expression matches `str`.
+     *
+     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+     * but if `ignorePrefix` is true, it will only match "foo".
+     */
+    abstract predicate toTest(string str, boolean ignorePrefix);
+
+    /**
+     * Gets a state a regular expression is in after matching the `i`th char in `str`.
+     * The regular expression is modelled as a non-deterministic finite automaton,
+     * the regular expression can therefore be in multiple states after matching a character.
+     */
+    private State getAState(int i, string str, boolean ignorePrefix) {
+      i = -1 and
+      this.toTest(str, ignorePrefix) and
+      result.getRepr().getRootTerm() = this and
+      isStartState(result)
+      or
+      exists(State prev |
+        prev = getAState(i - 1, str, ignorePrefix) and
+        deltaClosed(prev, getAnInputSymbolMatching(str.charAt(i)), result) and
+        not (
+          ignorePrefix = true and
+          isStartState(prev) and
+          isStartState(result)
+        )
+      )
+    }
+
+    /**
+     * Holds if this regular expression matches `str`.
+     */
+    predicate matches(string str) {
+      exists(State state | state = getAState(str.length() - 1, str, _) |
+        epsilonSucc*(state) = Accept(_)
+      )
+    }
+  }
+
+  /**
+   * Holds if `state` is a start state.
+   */
+  private predicate isStartState(State state) {
+    state = mkMatch(any(RegExpRoot r)) and
+    not exists(RegExpCaret car | car.getRootTerm() = state.getRepr().getRootTerm())
+    or
+    exists(RegExpCaret car | state = after(car))
+  }
+}
+
+/**
+ * A class to test whether a regular expression matches certain HTML tags.
+ */
+class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
+  HTMLMatchingRegExp() {
+    // the regexp must mention "<" and ">" explicitly.
+    forall(string angleBracket | angleBracket = ["<", ">"] |
+      any(RegExpConstant term | term.getValue().regexpMatch(".*" + angleBracket + ".*"))
+          .getRootTerm() = this
+    )
+  }
+
+  override predicate toTest(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str =
+      [
+        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
+        "<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
+        "<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
+        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
+        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
+        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
+      ]
+  }
+}
+
+/**
+ * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
+ *
+ * When adding a new case to this predicate, make sure the test strings used in `matches(..)` calls are present in `HTMLMatchingRegExp::toTest`.
+ */
+predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
+  regexp.matches("<!-- foo -->") and
+  not regexp.matches("<!-- foo --!>") and
+  not regexp.matches("<!- foo ->") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<script>") and
+  msg = "This regular expression only matches -->  and not --!> as a HTML comment end tag."
+  or
+  regexp.matches("<!-- foo -->") and
+  not regexp.matches("<!-- foo\n -->") and
+  not regexp.matches("<!- foo ->") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<script>") and
+  msg = "This regular expression does not match comments containing newlines."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script \n>foo</script>") and
+    msg = "This regular expression matches <script></script>, but not <script \\n></script>"
+    or
+    not regexp.matches("<script >foo\n</script>") and
+    msg = "This regular expression matches <script>foo</script>, but not <script >foo\\n</script>"
+  )
+  or
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses single-quotes."
+  or
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses double-quotes."
+  or
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script\tsrc='foo'></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo src=\"foo\"></foo>") and
+  msg = "This regular expression does not match script tags tabs are used between attributes."
+  or
+  regexp.matches("<script>foo</script>") and
+  not RegExpFlags::isIgnoreCase(regexp) and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match upper case <SCRIPT> tags."
+    or
+    not regexp.matches("<sCrIpT>foo</ScRiPt>") and
+    regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match mixed case <sCrIpT> tags."
+  )
+  or
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script src=\"foo\">foo</script >") or
+    not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") or
+    not regexp.matches("<script src=\"foo\">foo</script\t\n bar>")
+  ) and
+  msg =
+    "This regular expression does not match script end tags containing spaces, tabs or newlines."
+}
@@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
  * Gets a state the NFA may be in after matching `t`.
  */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
   exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
   or
   exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) {
 /**
  * A state in the NFA.
  */
-private newtype TState =
+newtype TState =
   /**
    * A state representing that the NFA is about to match a term.
    * `i` is used to index into multi-char literals.
 
@@ -0,0 +1,56 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+Parsing general HTML using regular expressions is impossible, but it is possible to match
+single HTML tags. However, if the regular expression is not written well, it might be easy
+to circumvent it, which can lead to cross-site scripting (XSS) or other security issues.
+</p>
+<p>
+Many of these mistakes are caused by browsers being very forgiving when it comes to
+HTML parsing. Browsers will often render invalid HTML containing parser errors.
+The regular expressions matching tags must recognize tags containing these parser errors.
+</p>
+</overview>
+
+<recommendation>
+<p>
+Use a (well-tested) sanitization or parser library if at all possible. These libraries are much more
+likely to handle corner cases correctly than a custom implementation.
+</p>
+
+<p>
+Otherwise, make sure to look into the corner cases that exist in HTML.
+For example, HTML comments can end with <code>--!&gt;</code>, and HTML tag names can contain
+upper case characters.
+</p>
+</recommendation>
+
+<example>
+<p>
+For example, assume we want to write a function that filters out all <code>&lt;script&gt;</code> tags.
+Such a function might be written like below: 
+</p>
+
+<sample src="examples/BadTagFilter.js" />
+
+<p>
+This sanitizer is very close to getting it right. 
+However, browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
+This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by 
+the function, but <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
+</p>
+</example>
+
+<references>
+<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
+<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
+</references>
+</qhelp>
+
+
@@ -0,0 +1,19 @@
+/**
+ * @name Bad HTML filtering regexp
+ * @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id js/bad-tag-filter
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-116
+ *       external/cwe/cwe-020
+ */
+
+import semmle.javascript.security.BadTagFilterQuery
+
+from HTMLMatchingRegExp regexp, string msg
+where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
+select regexp, msg
@@ -0,0 +1,8 @@
+function filterScript(html) {
+    var scriptRegex = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
+    var match;
+    while ((match = scriptRegex.exec(html)) !== null) {
+        html = html.replace(match[0], match[1]);
+    }
+    return html;
+}
@@ -0,0 +1,12 @@
+| tst.js:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression matches <script></script>, but not <script \\n></script> |
+| tst.js:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags containing spaces, tabs or newlines. |
+| tst.js:5:6:5:14 | <!--.*--> | This regular expression only matches -->  and not --!> as a HTML comment end tag. |
+| tst.js:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
+| tst.js:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
+| tst.js:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>foo</script>, but not <script >foo\\n</script> |
+| tst.js:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
+| tst.js:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
+| tst.js:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags tabs are used between attributes. |
+| tst.js:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
+| tst.js:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
+| tst.js:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags containing spaces, tabs or newlines. |
@@ -0,0 +1 @@
+Security/CWE-116/BadTagFilter.ql
@@ -0,0 +1,19 @@
+var filters = [
+    /<script.*?>.*?<\/script>/i, // NOT OK - doesn't match newlines or `</script >`
+    /<script.*?>.*?<\/script>/is, // NOT OK - doesn't match `</script >`
+    /<script.*?>.*?<\/script[^>]*>/is, // OK
+    /<!--.*-->/is, // NOT OK - misses --!> endings
+    /<!--.*--!?>/is, // OK
+    /<!--.*--!?>/i, // NOT OK, does not match newlines
+    /<script.*?>(.|\s)*?<\/script[^>]*>/i, // NOT OK - doesn't match inside the script tag
+    /<script[^>]*?>.*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the content
+    /<script(\s|\w|=|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match single quotes for attribute values
+    /<script(\s|\w|=|')*?>.*?<\/script[^>]*>/is, // NOT OK - does not match double quotes for attribute values
+    /<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match tabs between attributes
+    /<script.*?>.*?<\/script[^>]*>/s, // NOT OK - does not match uppercase SCRIPT tags
+    /<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/s, // NOT OK - does not match mixed case script tags
+    /<script[^>]*?>[\s\S]*?<\/script.*>/i, // NOT OK - doesn't match newlines in the end tag
+    /<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, // OK
+]
+
+doFilters(filters)
Original file line number	Diff line number	Diff line change
`@@ -461,5 +461,9 @@`
`461`	`461`	`"ReDoS Polynomial Python/JS": [`
`462`	`462`	`"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",`
`463`	`463`	`"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"`
	`464`	`+ ],`
	`465`	`+ "BadTagFilterQuery Python/JS": [`
	`466`	`+ "javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",`
	`467`	`+ "python/ql/lib/semmle/python/security/BadTagFilterQuery.qll"`
`464`	`468`	`]`
`465`	`469`	`}`