JS: add query js/regex/missing-regexp-anchor

Esben Sparre Andreasen · Esben Sparre Andreasen · commit 0fa73b8331e1 · 2019-06-03T08:29:52.000+02:00
diff --git a/change-notes/1.21/analysis-javascript.md b/change-notes/1.21/analysis-javascript.md
@@ -27,6 +27,7 @@
 
 | **Query**                                     | **Tags**                                             | **Purpose**                                                                                                                                                                 |
 |-----------------------------------------------|------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Missing regular expression anchor (`js/regex/missing-regexp-anchor`) | correctness, security, external/cwe/cwe-20 | Highlights regular expression patterns that may be missing an anchor, indicating a possible violation of [CWE-20](https://cwe.mitre.org/data/definitions/20.html). Results are not shown on LGTM by default. |
 | Prototype pollution (`js/prototype-pollution`)    | security, external/cwe-250, external/cwe-400 | Highlights code that allows an attacker to modify a built-in prototype object through an unsanitized recursive merge function. The results are shown on LGTM by default. |
 
 ## Changes to existing queries
diff --git a/javascript/config/suites/javascript/security b/javascript/config/suites/javascript/security
@@ -3,6 +3,7 @@
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteHostnameRegExp.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
++ semmlecode-javascript-queries/Security/CWE-020/MissingRegExpAnchor.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-022/ZipSlip.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-078/CommandInjection.ql: /Security/CWE/CWE-078
diff --git a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.qhelp b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.qhelp
@@ -0,0 +1,77 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted input with regular expressions is a
+			common technique.  However, it is error prone to match untrusted input
+			against regular expressions without anchors such as <code>^</code> or
+			<code>$</code>.  Malicious input can bypass such security checks by
+			embedding one of the allowed patterns in an unexpected location.
+
+		</p>
+
+		<p>
+
+			Even if the matching is not done in a security-critical
+			context, it may still cause undesirable behaviors when the regular
+			expression matches accidentally.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Use anchors to ensure that regular expressions match at
+			the expected locations.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains, and not some malicious site.
+
+		</p>
+
+		<sample src="examples/MissingRegExpAnchor_BAD.js"/>
+
+		<p>
+
+			The check with the regular expression match is, however, easy to bypass, for example
+			by embedding <code>example.com</code> in the path component:
+			<code>http://evil-example.net/example.com</code>, or in the query
+			string component: <code>http://evil-example.net/?x=example.com</code>.
+
+			Address these shortcomings by using anchors in the regular expression instead:
+
+		</p>
+
+		<sample src="examples/MissingRegExpAnchor_GOOD.js"/>
+
+		<p>
+
+			A related mistake is to write a regular expression with
+			multiple alternatives, but to only include an anchor for one of the
+			alternatives. As an example, the regular expression
+			<code>/^www\.example\.com|beta\.example\.com/</code> will match the host
+			<code>evil.beta.example.com</code> because the regular expression is parsed
+			as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>.
+
+		</p>
+	</example>
+
+	<references>
+		<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">Regular Expressions</a></li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a></li>
+	</references>
+</qhelp>
diff --git a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
@@ -0,0 +1,86 @@
+/**
+ * @name Missing regular expression anchor
+ * @description Regular expressions without anchors can be vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @precision medium
+ * @id js/regex/missing-regexp-anchor
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import javascript
+
+/**
+ * Holds if the pattern of `src` is a sequence of `|`-separated alternatives
+ * where only the first alternative is anchored with `^`, or only the last one
+ * is anchored with `$`, indicating a precedence mistake explained by `msg`.
+ *
+ * The canonical example of such a mistake is: `^a|b|c`, which is
+ * parsed as `(^a)|(b)|(c)`.
+ */
+predicate isAnInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
+  exists(string str, string maybeGroupedStr, string regex, string anchorPart, string posString, string escapedDot |
+    // an escaped dot (`.` preceded by one or two backslashes), for example `/\./` or `new RegExp('\\.')`
+    escapedDot = "\\\\\\\\?[.]" and
+    // a string that is mostly free from special regular expression symbols
+    str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
+    // the string may be wrapped in parentheses
+    maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
+    (
+      // a problematic pattern: `^a|b|...|x`; capture group 1 is the anchored first alternative
+      regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+" and
+      posString = "beginning"
+      or
+      // a problematic pattern: `a|b|...|x$`; capture group 1 is the anchored last alternative
+      regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)" and
+      posString = "end"
+    ) and
+    anchorPart = src.getPattern().regexpCapture(regex, 1) and
+    anchorPart.regexpMatch("(?i).*[a-z].*") and // the anchored alternative must contain at least one letter
+    msg = "The alternative '" + anchorPart + "' uses an anchor to match from the " + posString +
+        " of a string, but the other alternatives of this regular expression do not use anchors."
+  )
+}
+
+/**
+ * Holds if the pattern of `src` looks like a URL or host name that ends in a common
+ * top-level domain and contains no `^` or `$`, indicating a mistake explained by `msg`.
+ */
+predicate isAnInterestingUnanchoredRegExpString(RegExpPatternSource src, string msg) {
+  exists(string pattern | pattern = src.getPattern() |
+    // protocol and subdomain characters, perhaps with some regex characters mixed in, followed by a known TLD and an optional suffix starting with one of `/#?():`
+    pattern
+        .regexpMatch("(?i)[():|?a-z0-9-\\\\./]+[.]" + RegExpPatterns::commonTLD() +
+            "([/#?():]\\S*)?") and
+    // without any anchors: neither `^` nor `$` occurs anywhere in the pattern
+    pattern.regexpMatch("[^$^]+") and
+    // that is not used for capture (`exec`/`match` with a property read on the call) or for `replace`
+    not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
+      name = "exec" and
+      mcn = src.getARegExpObject().getAMethodCall() and
+      exists(mcn.getAPropertyRead())
+      or
+      exists(DataFlow::Node arg |
+        arg = mcn.getArgument(0) and
+        (
+          src.getARegExpObject().flowsTo(arg) or
+          src.(StringRegExpPatternSource).getAUse() = arg
+        )
+      |
+        name = "replace"
+        or
+        name = "match" and exists(mcn.getAPropertyRead())
+      )
+    ) and
+    msg = "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
+  )
+}
+
+from DataFlow::Node patternSource, string message
+where
+  isAnInterestingSemiAnchoredRegExpString(patternSource, message) // e.g. `^a|b`
+  or
+  isAnInterestingUnanchoredRegExpString(patternSource, message) // e.g. `https?://good.com`
+select patternSource, message
diff --git a/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_BAD.js b/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_BAD.js
@@ -0,0 +1,7 @@
+app.get('/some/path', function(req, res) {
+    let url = req.param("url");
+    // BAD: the host of `url` may be controlled by an attacker
+	if (url.match(/https?:\/\/www\.example\.com\//)) {
+        res.redirect(url);
+    }
+});
diff --git a/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_GOOD.js b/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_GOOD.js
@@ -0,0 +1,7 @@
+app.get('/some/path', function(req, res) {
+    let url = req.param("url");
+    // GOOD: the host of `url` can not be controlled by an attacker
+	if (url.match(/^https?:\/\/www\.example\.com\//)) {
+        res.redirect(url);
+    }
+});
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
@@ -22,3 +22,6 @@
 | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | here |
 | tst-IncompleteHostnameRegExp.js:48:41:48:68 | '^https ... e\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | here |
 | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
+| tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | This regular expression has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | here |
+| tst-SemiAnchoredRegExp.js:64:13:64:34 | '^good. ... er.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:64:13:64:34 | '^good. ... er.com' | here |
+| tst-SemiAnchoredRegExp.js:65:13:65:36 | '^good\\ ... r\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:65:13:65:36 | '^good\\ ... r\\.com' | here |
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/MissingRegExpAnchor.expected b/javascript/ql/test/query-tests/Security/CWE-020/MissingRegExpAnchor.expected
@@ -0,0 +1,49 @@
+| tst-SemiAnchoredRegExp.js:3:2:3:7 | /^a\|b/ | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:6:2:6:9 | /^a\|b\|c/ | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:12:2:12:9 | /^a\|(b)/ | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:14:2:14:11 | /^(a)\|(b)/ | The alternative '^(a)' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:17:2:17:7 | /a\|b$/ | The alternative 'b$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:20:2:20:9 | /a\|b\|c$/ | The alternative 'c$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:26:2:26:9 | /(a)\|b$/ | The alternative 'b$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:28:2:28:11 | /(a)\|(b)$/ | The alternative '(b)$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | The alternative '^good.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:31:2:31:25 | /^good\\ ... r\\.com/ | The alternative '^good\\.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:32:2:32:27 | /^good\\ ... \\\\.com/ | The alternative '^good\\\\.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:37:13:37:18 | "^a\|b" | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:40:13:40:20 | "^a\|b\|c" | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:46:13:46:20 | "^a\|(b)" | The alternative '^a' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:48:13:48:22 | "^(a)\|(b)" | The alternative '^(a)' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:51:13:51:18 | "a\|b$" | The alternative 'b$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:54:13:54:20 | "a\|b\|c$" | The alternative 'c$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:60:13:60:20 | "(a)\|b$" | The alternative 'b$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:62:13:62:22 | "(a)\|(b)$" | The alternative '(b)$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:64:13:64:34 | '^good. ... er.com' | The alternative '^good.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:65:13:65:36 | '^good\\ ... r\\.com' | The alternative '^good.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:66:13:66:38 | '^good\\ ... \\\\.com' | The alternative '^good\\.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:75:2:75:27 | /(\\.xxx ... .zzz)$/ | The alternative '(\\.zzz)$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:77:2:77:23 | /\\.xxx\| ... zzz$/ig | The alternative '\\.zzz$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:78:2:78:19 | /\\.xxx\|\\.yyy\|zzz$/ | The alternative 'zzz$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:81:2:81:28 | /^(xxx  ...  yyy)/i | The alternative '^(xxx yyy zzz)' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:83:2:83:24 | /^(xxx: ... (zzz:)/ | The alternative '^(xxx:)' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:84:2:84:23 | /^(xxx? ... zzz\\/)/ | The alternative '^(xxx?:)' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:85:2:85:16 | /^@media\|@page/ | The alternative '^@media' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:87:2:87:21 | /^click\|mouse\|touch/ | The alternative '^click' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:88:2:88:43 | /^http: ... r\\.com/ | The alternative '^http://good\\.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:89:2:89:47 | /^https ... r\\.com/ | The alternative '^https?://good\\.com' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:90:2:90:55 | /^mouse ... ragend/ | The alternative '^mouse' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:91:2:91:14 | /^xxx:\|yyy:/i | The alternative '^xxx:' uses an anchor to match from the beginning of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-SemiAnchoredRegExp.js:92:2:92:18 | /_xxx\|_yyy\|_zzz$/ | The alternative '_zzz$' uses an anchor to match from the end of a string, but the other alternatives of this regular expression do not use anchors. |
+| tst-UnanchoredUrlRegExp.js:3:43:3:61 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:4:54:4:72 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:10:2:10:22 | /https? ... od.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:11:13:11:31 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:13:44:13:62 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:15:13:15:31 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:19:43:19:62 | "https?://good.com/" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:20:43:20:66 | "https? ... m:8080" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:23:3:23:21 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:24:3:24:23 | /https? ... od.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:25:14:25:32 | "https?://good.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:35:2:35:32 | /https? ... 0-9]+)/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:49:11:49:51 | /youtub ... -_]+)/i | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| tst-UnanchoredUrlRegExp.js:77:11:77:32 | /vimeo\\ ... 0-9]+)/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/MissingRegExpAnchor.qlref b/javascript/ql/test/query-tests/Security/CWE-020/MissingRegExpAnchor.qlref
@@ -0,0 +1 @@
+Security/CWE-020/MissingRegExpAnchor.ql
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/tst-SemiAnchoredRegExp.js b/javascript/ql/test/query-tests/Security/CWE-020/tst-SemiAnchoredRegExp.js
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/tst-UnanchoredUrlRegExp.js b/javascript/ql/test/query-tests/Security/CWE-020/tst-UnanchoredUrlRegExp.js

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Security/CWE-020/MissingRegExpAnchor.ql`