JS: add query js/incomplete-url-regexp

Esben Sparre Andreasen · Esben Sparre Andreasen · commit 52ca696ff440 · 2018-12-10T22:20:29.000+01:00
diff --git a/javascript/config/suites/javascript/security b/javascript/config/suites/javascript/security
@@ -1,5 +1,6 @@
 + semmlecode-javascript-queries/DOM/TargetBlank.ql: /Security/CWE/CWE-200
 + semmlecode-javascript-queries/Electron/EnablingNodeIntegration.ql: /Security/CWE/CWE-094
++ semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlRegExp.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteUrlRegExp.qhelp b/javascript/ql/src/Security/CWE-020/IncompleteUrlRegExp.qhelp
@@ -0,0 +1,69 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted URLs is an important technique for
+			preventing attacks such as request forgeries and malicious
+			redirections. Usually, this is done by checking that the host of a URL
+			is in a set of allowed hosts.
+
+        </p>
+
+        <p>
+
+	        If a regular expression implements such a check, it is
+	        easy to accidentally make the check too permissive by not escaping the
+	        <code>.</code> meta-characters appropriately.
+
+	        Even if the check is not used in a security-critical
+	        context, the incomplete check may still cause undesirable behaviors
+	        when the check succeeds accidentally.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Escape all meta-characters appropriately when constructing
+			regular expressions for security checks, and pay special attention
+			to the <code>.</code> meta-character.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains.
+
+		</p>
+
+		<sample src="examples/IncompleteUrlRegExp.js"/>
+
+		<p>
+
+			The check is however easy to bypass because the unescaped
+			<code>.</code> allows for any character before
+			<code>example.com</code>, effectively allowing the redirect to go to
+			an attacker-controlled domain such as <code>wwwXexample.com</code>.
+
+			Address this vulnerability by escaping <code>.</code>
+			appropriately: <code>let regex = /(www|beta|)\.example\.com/</code>.
+
+		</p>
+
+	</example>
+
+	<references>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a></li>
+	</references>
+</qhelp>
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteUrlRegExp.ql b/javascript/ql/src/Security/CWE-020/IncompleteUrlRegExp.ql
@@ -0,0 +1,70 @@
+/**
+ * @name Incomplete URL regular expression
+ * @description Security checks on URLs using regular expressions are sometimes vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity error
+ * @precision high
+ * @id js/incomplete-url-regexp
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import javascript
+import semmle.javascript.security.dataflow.RegExpInjection
+
+module IncompleteUrlRegExpTracking {
+
+  /**
+   * A taint tracking configuration for constant strings that look like incomplete host name patterns and flow to a `RegExpInjection::Sink`.
+   */
+  class Configuration extends TaintTracking::Configuration {
+    Configuration() { this = "IncompleteUrlRegExpTracking" }
+
+    override
+    predicate isSource(DataFlow::Node source) { // sources: constant strings accepted by `isIncompleteHostNameRegExpPattern`
+      isIncompleteHostNameRegExpPattern(source.asExpr().(ConstantString).getStringValue(), _)
+    }
+
+    override
+    predicate isSink(DataFlow::Node sink) { // sinks: reuses the sink definition of the RegExpInjection library
+      sink instanceof RegExpInjection::Sink
+    }
+
+  }
+
+}
+
+/**
+ * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
+ * and `pattern` contains a subtle mistake (an unescaped `.`) that allows it to match unexpected hosts.
+ */
+bindingset[pattern]
+predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
+  hostPart = pattern.regexpCapture(
+    "(?i).*" +
+    // Either:
+    // - an unescaped and repeated `.`, followed by anything
+    // - an unescaped single `.`
+    "(?:(?<!\\\\)[.][+*].*?|(?<!\\\\)[.])" +
+    // a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
+    "([():|?a-z0-9-]+(\\\\)?[.](com|org|edu|gov|uk|net))" +
+    ".*", 1) // `hostPart` is capture group 1: the host text that follows the unescaped `.`
+}
+
+from Expr e, string pattern, string intendedHost
+where
+      (
+        e.(RegExpLiteral).getValue() = pattern or // `e` is a regular expression literal, or ...
+        exists (IncompleteUrlRegExpTracking::Configuration cfg |
+          cfg.hasFlow(e.flow(), _) and // ... `e` flows to some sink of the tracking configuration
+          e.mayHaveStringValue(pattern)
+        )
+      ) and
+      isIncompleteHostNameRegExpPattern(pattern, intendedHost)
+      and
+      // ignore patterns with a non-capturing group `(?:...)` after the TLD
+      not pattern.regexpMatch("(?i).*[.](com|org|edu|gov|uk|net).*[(][?]:.*[)].*")
+
+
+select e, "This regular expression has an unescaped '.', which means that '" + intendedHost + "' might not match the intended host of a matched URL."
diff --git a/javascript/ql/src/Security/CWE-020/examples/IncompleteUrlRegExp.js b/javascript/ql/src/Security/CWE-020/examples/IncompleteUrlRegExp.js
@@ -0,0 +1,9 @@
+app.get('/some/path', function(req, res) {
+    let url = req.param('url'),
+        host = urlLib.parse(url).host;
+    // BAD: the unescaped `.` matches any character, so a host such as `wwwXexample.com` passes the check
+    let regex = /(www|beta|).example.com/;
+    if (host.match(regex)) {
+        res.redirect(url);
+    }
+});
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteUrlRegExp.expected b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteUrlRegExp.expected
@@ -0,0 +1,24 @@
+| tst-IncompleteUrlRegExp.js:3:2:3:28 | /http:\\ ... le.com/ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:5:2:5:28 | /http:\\ ... le.net/ | This regular expression has an unescaped '.', which means that 'example.net' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:6:2:6:42 | /http:\\ ... b).com/ | This regular expression has an unescaped '.', which means that '(example-a\|example-b).com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:7:2:7:30 | /http:\\ ... le.com/ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:9:2:9:39 | /http:\\ ... le.com/ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:11:13:11:37 | "http:/ ... le.com" | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:12:10:12:34 | "http:/ ... le.com" | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:15:22:15:46 | "http:/ ... le.com" | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:17:13:17:31 | `test.example.com$` | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:17:14:17:30 | test.example.com$ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:19:17:19:34 | 'test.example.com' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:22:27:22:44 | 'test.example.com' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:28:22:28:39 | 'test.example.com' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:36:2:36:37 | /(.+\\.( ... \\.com)/ | This regular expression has an unescaped '.', which means that '(?:example-a\|example-b)\\.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:37:2:37:54 | /^(http ... =$\|\\/)/ | This regular expression has an unescaped '.', which means that ')?example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:39:2:39:34 | /\\(http ... m\\/\\)/g | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:40:2:40:29 | /https? ... le.com/ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:41:13:41:68 | '^http: ... e\\.com' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:41:41:41:68 | '^https ... e\\.com' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:42:13:42:61 | 'http[s ... \\/(.+)' | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:43:2:43:33 | /^https ... e.com$/ | This regular expression has an unescaped '.', which means that 'example.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:44:9:44:100 | 'protos ... ernal)' | This regular expression has an unescaped '.', which means that 'example-b.com' might not match the intended host of a matched URL. |
+| tst-IncompleteUrlRegExp.js:46:2:46:26 | /exampl ... le.com/ | This regular expression has an unescaped '.', which means that 'dev\|example.com' might not match the intended host of a matched URL. |
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteUrlRegExp.qlref b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteUrlRegExp.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteUrlRegExp.ql
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteUrlRegExp.js b/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteUrlRegExp.js
@@ -0,0 +1,47 @@
+(function() {
+	/http:\/\/example.com/; // OK
+	/http:\/\/test.example.com/; // NOT OK
+	/http:\/\/test\\.example.com/; // OK
+	/http:\/\/test.example.net/; // NOT OK
+	/http:\/\/test.(example-a|example-b).com/; // NOT OK
+	/http:\/\/(.+)\\.example.com/; // NOT OK
+	/http:\/\/(\\.+)\\.example.com/; // OK
+	/http:\/\/(?:.+)\\.test\\.example.com/; // NOT OK
+	/http:\/\/test.example.com\/(?:.*)/; // OK
+	new RegExp("http://test.example.com"); // NOT OK
+	s.match("http://test.example.com"); // NOT OK
+
+	function id(e) { return e; }
+	new RegExp(id(id(id("http://test.example.com")))); // NOT OK
+
+	new RegExp(`test.example.com$`); // NOT OK
+
+	let hostname = 'test.example.com'; // NOT OK
+	new RegExp(`${hostname}$`);
+
+	let domain = { hostname: 'test.example.com' }; // NOT OK
+	new RegExp(domain.hostname);
+
+	function convert(domain) {
+		return new RegExp(domain.hostname);
+	}
+	convert({ hostname: 'test.example.com' }); // NOT OK
+
+	let domains = [ { hostname: 'test.example.com' } ];  // NOT OK, but not yet supported
+	function convert(domain) {
+		return new RegExp(domain.hostname);
+	}
+	domains.map(d => convert(d));
+
+	/(.+\.(?:example-a|example-b)\.com)/; // NOT OK
+	/^(https?:)?\/\/((service|www).)?example.com(?=$|\/)/; // NOT OK
+	/^(http|https):\/\/www.example.com\/p\/f\//; // NOT OK
+	/\(http:\/\/sub.example.com\/\)/g; // NOT OK
+	/https?:\/\/api.example.com/; // NOT OK
+	new RegExp('^http://localhost:8000|' + '^https?://.+\.example\.com'); // NOT OK
+	new RegExp('http[s]?:\/\/?sub1\.sub2\.example\.com\/f\/(.+)'); // NOT OK
+	/^https:\/\/[a-z]*.example.com$/; // NOT OK
+	RegExp('protos?://(localhost|.+.example.net|.+.example-a.com|.+.example-b.com|.+.example.internal)'); // NOT OK
+
+	/example.dev|example.com/; // OK, but still flagged
+});

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Security/CWE-020/IncompleteUrlRegExp.ql`