tutorial0
diff --git a/‎change-notes/1.23/analysis-javascript.md
Lines changed: 1 addition & 1 deletion b/‎change-notes/1.23/analysis-javascript.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎javascript/config/suites/javascript/security
Lines changed: 1 addition & 0 deletions b/‎javascript/config/suites/javascript/security
Lines changed: 1 addition & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
Lines changed: 4 additions & 1 deletion b/‎javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
Lines changed: 4 additions & 1 deletion
diff --git a/‎javascript/ql/src/Security/CWE-020/UselessRegExpCharacterEscape.qhelp
Lines changed: 86 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-020/UselessRegExpCharacterEscape.qhelp
Lines changed: 86 additions & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-020/UselessRegExpCharacterEscape.ql
Lines changed: 156 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-020/UselessRegExpCharacterEscape.ql
Lines changed: 156 additions & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-020/examples/UselessRegExpCharacterEscape_bad_1.js
Lines changed: 2 additions & 0 deletions b/‎javascript/ql/src/Security/CWE-020/examples/UselessRegExpCharacterEscape_bad_1.js
Lines changed: 2 additions & 0 deletions
diff --git a/‎javascript/ql/src/semmle/javascript/CharacterEscapes.qll
Lines changed: 97 additions & 0 deletions b/‎javascript/ql/src/semmle/javascript/CharacterEscapes.qll
Lines changed: 97 additions & 0 deletions
diff --git a/‎javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
Lines changed: 2 additions & 6 deletions b/‎javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
Lines changed: 2 additions & 6 deletions
@@ -18,7 +18,7 @@
 | Loop bound injection (`js/loop-bound-injection`)                          | security, external/cwe/cwe-834                                      | Highlights loops where a user-controlled object with an arbitrary .length value can trick the server to loop indefinitely. Results are not shown on LGTM by default. |
 | Suspicious method name (`js/suspicious-method-name-declaration`)          | correctness, typescript, methods                                  | Highlights suspiciously named methods where the developer likely meant to write a constructor or function. Results are shown on LGTM by default. |
 | Use of returnless function (`js/use-of-returnless-function`)              | maintainability, correctness                                      | Highlights calls where the return value is used, but the callee never returns a value. Results are shown on LGTM by default. |
-
+| Useless regular expression character escape (`js/useless-regexp-character-escape`) | correctness, security, external/cwe/cwe-20 | Highlights regular expression strings with useless character escapes, indicating a possible violation of [CWE-20](https://cwe.mitre.org/data/definitions/20.html). Results are shown on LGTM by default. |
 
 ## Changes to existing queries
 
 
@@ -4,6 +4,7 @@
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/MissingRegExpAnchor.ql: /Security/CWE/CWE-020
++ semmlecode-javascript-queries/Security/CWE-020/UselessRegExpCharacterEscape.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-022/ZipSlip.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-078/CommandInjection.ql: /Security/CWE/CWE-078
 
@@ -11,6 +11,7 @@
  */
 
 import javascript
+import semmle.javascript.CharacterEscapes
 
 /**
  * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
@@ -40,7 +41,9 @@ where
     )
   ) and
   // ignore patterns with capture groups after the TLD
-  not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
+  not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*") and
+  // avoid double reporting
+  not CharacterEscapes::hasALikelyRegExpPatternMistake(re)
 select re,
   "This " + kind + " has an unescaped '.' before '" + hostPart +
     "', so it might match more hosts than expected.", aux, "here"
@@ -0,0 +1,86 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			When a character in a string literal or regular expression
+			literal is preceded by a backslash, it is interpreted as part of an
+			escape sequence. For example, the escape sequence <code>\n</code> in a
+			string literal corresponds to a single <code>newline</code> character,
+			and not the <code>\</code> and <code>n</code> characters.
+
+			However, not all characters change meaning when used in an
+			escape sequence. In this case, the backslash just makes the character
+			appear to mean something else, and the backslash actually has no
+			effect. For example, the escape sequence <code>\k</code> in a string
+			literal just means <code>k</code>.
+
+			Such superfluous escape sequences are usually benign, and
+			do not change the behavior of the program.
+
+		</p>
+
+		<p>
+
+			The set of characters that change meaning when in escape
+			sequences is different for regular expression literals and string
+			literals.
+
+			This can be problematic when a regular expression literal
+			is turned into a regular expression that is built from one or more
+			string literals. The problem occurs when a regular expression escape
+			sequence loses its special meaning in a string literal.
+
+		</p>
+
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Ensure that the right number of backslashes is used when
+			escaping characters in strings, template literals and regular
+			expressions.
+
+			Pay special attention to the number of backslashes when
+			rewriting a regular expression as a string literal.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a string is
+			<code>"my-marker"</code>, possibly surrounded by white space:
+
+		</p>
+
+		<sample src="examples/UselessRegExpCharacterEscape_bad_1.js"/>
+
+		<p>
+
+			However, the check does not work properly for white space
+			as the two <code>\s</code> occurrences are semantically equivalent to
+			just <code>s</code>, meaning that the check will succeed for strings
+			like <code>"smy-markers"</code> instead of <code>" my-marker
+			"</code>.
+
+			Address these shortcomings by either using a regular
+			expression literal (<code>/(^\s*)my-marker(\s*$)/</code>), or by
+			adding extra backslashes
+			(<code>'(^\\s*)my-marker(\\s*$)'</code>).
+
+		</p>
+
+	</example>
+
+	<references>
+		<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#Escaping">Regular expression escape notation</a></li>
+		<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#Escape_notation">String escape notation</a></li>
+	</references>
+</qhelp>
@@ -0,0 +1,156 @@
+/**
+ * @name Useless regular-expression character escape
+ * @description Prepending a backslash to an ordinary character in a string
+ *              does not have any effect, and may make regular expressions constructed from this string
+ *              behave unexpectedly.
+ * @kind problem
+ * @problem.severity error
+ * @precision high
+ * @id js/useless-regexp-character-escape
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import javascript
+import semmle.javascript.CharacterEscapes::CharacterEscapes
+
+newtype TRegExpPatternMistake =
+  /**
+   * A character escape mistake in regular expression string `src`
+   * for the character `char` at raw-value index `index` of `rawStringNode`,
+   * explained by `mistake`.
+   */
+  TIdentityEscapeInStringMistake(
+    RegExpPatternSource src, string char, string mistake, ASTNode rawStringNode, int index
+  ) {
+    char = getALikelyRegExpPatternMistake(src, mistake, rawStringNode, index)
+  } or
+  /**
+   * A backslash-escaped 'b' at raw-value index `index` of `rawStringNode` in
+   * the regular expression string `src`, indicating intent to use the
+   * word-boundary assertion '\b'.
+   */
+  TBackspaceInStringMistake(RegExpPatternSource src, ASTNode rawStringNode, int index) {
+    exists(string raw, string cooked |
+      exists(StringLiteral lit | lit = rawStringNode |
+        rawStringNode = src.asExpr() and
+        raw = lit.getRawValue() and
+        cooked = lit.getStringValue()
+      )
+      or
+      exists(TemplateElement elem | elem = rawStringNode |
+        rawStringNode = src.asExpr().(TemplateLiteral).getAnElement() and
+        raw = elem.getRawValue() and
+        cooked = elem.getStringValue()
+      )
+    |
+      "b" = getAnEscapedCharacter(raw, index) and
+      // skip strings whose cooked value is a single character, i.e. exactly the backspace `\b`
+      cooked.length() != 1
+    )
+  }
+
+/**
+ * A character escape mistake in a regular expression string.
+ *
+ * Implementation note: the main purpose of this class is to associate an
+ * exact character location with an alert message, in the name of
+ * user-friendly alerts. The implementation can be simplified
+ * significantly by only using the enclosing string location as the alert
+ * location.
+ */
+class RegExpPatternMistake extends TRegExpPatternMistake {
+  /**
+   * Holds if this element is at the specified location.
+   * The location spans column `startcolumn` of line `startline` to
+   * column `endcolumn` of line `endline` in file `filepath`.
+   * For more information, see
+   * [Locations](https://help.semmle.com/QL/learn-ql/ql/locations.html).
+   */
+  predicate hasLocationInfo(
+    string filepath, int startline, int startcolumn, int endline, int endcolumn
+  ) {
+    exists(int srcStartcolumn, int srcEndcolumn, int index |
+      index = getIndex() and
+      getRawStringNode()
+          .getLocation()
+          .hasLocationInfo(filepath, startline, srcStartcolumn, endline, srcEndcolumn)
+    |
+      (
+        if startline = endline
+        then startcolumn = srcStartcolumn + index - 1 and endcolumn = srcStartcolumn + index
+        else (
+          startcolumn = srcStartcolumn and endcolumn = srcEndcolumn
+        )
+      )
+    )
+  }
+
+  /** Gets a textual representation of this element. */
+  string toString() { result = getMessage() }
+
+  abstract ASTNode getRawStringNode();
+
+  abstract RegExpPatternSource getSrc();
+
+  abstract int getIndex();
+
+  abstract string getMessage();
+}
+
+/**
+ * An identity-escaped character that indicates programmer intent to
+ * do something special in a regular expression.
+ */
+class IdentityEscapeInStringMistake extends RegExpPatternMistake, TIdentityEscapeInStringMistake {
+  RegExpPatternSource src;
+
+  string char;
+
+  string mistake;
+
+  int index;
+
+  ASTNode rawStringNode;
+
+  IdentityEscapeInStringMistake() {
+    this = TIdentityEscapeInStringMistake(src, char, mistake, rawStringNode, index)
+  }
+
+  override string getMessage() {
+    result = "'\\" + char + "' is equivalent to just '" + char + "', so the sequence " + mistake
+  }
+
+  override int getIndex() { result = index }
+
+  override RegExpPatternSource getSrc() { result = src }
+
+  override ASTNode getRawStringNode() { result = rawStringNode }
+}
+
+/**
+ * A backspace written as '\b' in a regular expression string, indicating
+ * programmer intent to use the word-boundary assertion '\b'.
+ */
+class BackspaceInStringMistake extends RegExpPatternMistake, TBackspaceInStringMistake {
+  RegExpPatternSource src;
+
+  int index;
+
+  ASTNode rawStringNode;
+
+  BackspaceInStringMistake() { this = TBackspaceInStringMistake(src, rawStringNode, index) }
+
+  override string getMessage() { result = "'\\b' is a backspace, and not a word-boundary assertion" }
+
+  override int getIndex() { result = index }
+
+  override RegExpPatternSource getSrc() { result = src }
+
+  override ASTNode getRawStringNode() { result = rawStringNode }
+}
+
+from RegExpPatternMistake mistake
+select mistake, "The escape sequence " + mistake.getMessage() + " when it is used in a $@.",
+  mistake.getSrc().getAParse(), "regular expression"
@@ -0,0 +1,2 @@
+let regex = new RegExp('(^\s*)my-marker(\s*$)'),
+    isMyMarkerText = regex.test(text);
@@ -0,0 +1,97 @@
+/**
+ * Provides classes for reasoning about character escapes in literals.
+ */
+
+import javascript
+
+module CharacterEscapes {
+  /**
+   * Provides sets of characters (as strings) with specific string/regexp characteristics.
+   */
+  private module Sets {
+    string sharedEscapeChars() { result = "fnrtvux0\\" }
+
+    string regexpAssertionChars() { result = "bB" }
+
+    string regexpCharClassChars() { result = "cdDpPsSwW" }
+
+    string regexpBackreferenceChars() { result = "123456789k" }
+
+    string regexpMetaChars() { result = "^$*+?.()|{}[]-" }
+  }
+
+  /**
+   * Gets the character at index `i` of `raw`, provided it is preceded by an odd number of backslashes.
+   */
+  bindingset[raw]
+  string getAnEscapedCharacter(string raw, int i) {
+    result = raw.regexpFind("(?<=(^|[^\\\\])\\\\(\\\\{2}){0,10}).", _, i)
+  }
+
+  /**
+   * Holds if `n` is delimited by `delim` and contains `rawStringNode` with the raw string value `raw`.
+   */
+  private predicate hasRawStringAndQuote(
+    DataFlow::ValueNode n, string delim, ASTNode rawStringNode, string raw
+  ) {
+    rawStringNode = n.asExpr() and
+    raw = rawStringNode.(StringLiteral).getRawValue() and
+    delim = raw.charAt(0)
+    or
+    rawStringNode = n.asExpr().(TemplateLiteral).getAnElement() and
+    raw = rawStringNode.(TemplateElement).getRawValue() and
+    delim = "`"
+    or
+    rawStringNode = n.asExpr() and
+    raw = rawStringNode.(RegExpLiteral).getRawValue() and
+    delim = "/"
+  }
+
+  /**
+   * Gets a character in `n` that is preceded by a single useless backslash.
+   *
+   * The character is the `i`th character of `rawStringNode`'s raw string value.
+   */
+  string getAnIdentityEscapedCharacter(DataFlow::Node n, ASTNode rawStringNode, int i) {
+    exists(string delim, string raw, string additionalEscapeChars |
+      hasRawStringAndQuote(n, delim, rawStringNode, raw) and
+      if rawStringNode instanceof RegExpLiteral
+      then
+        additionalEscapeChars = Sets::regexpMetaChars() + Sets::regexpAssertionChars() + Sets::regexpCharClassChars() +
+            Sets::regexpBackreferenceChars()
+      else additionalEscapeChars = "b"
+    |
+      result = getAnEscapedCharacter(raw, i) and
+      not result = (Sets::sharedEscapeChars() + delim + additionalEscapeChars).charAt(_)
+    )
+  }
+
+  /**
+   * Holds if `src` likely contains a regular expression mistake, to be reported by `js/useless-regexp-character-escape`.
+   */
+  predicate hasALikelyRegExpPatternMistake(RegExpPatternSource src) {
+    exists(getALikelyRegExpPatternMistake(src, _, _, _))
+  }
+
+  /**
+   * Gets a character in `src` that is preceded by a single useless backslash, resulting in a likely regular expression mistake explained by `mistake`.
+   *
+   * The character is the `i`th character of the raw string value of `rawStringNode`.
+   */
+  string getALikelyRegExpPatternMistake(
+    RegExpPatternSource src, string mistake, ASTNode rawStringNode, int i
+  ) {
+    result = getAnIdentityEscapedCharacter(src, rawStringNode, i) and
+    (
+      result = Sets::regexpAssertionChars().charAt(_) and mistake = "is not an assertion"
+      or
+      result = Sets::regexpCharClassChars().charAt(_) and mistake = "is not a character class"
+      or
+      result = Sets::regexpBackreferenceChars().charAt(_) and mistake = "is not a backreference"
+      or
+      // conservative formulation: we do not know in general if the sequence is enclosed in a character class `[...]`
+      result = Sets::regexpMetaChars().charAt(_) and
+      mistake = "may still represent a meta-character"
+    )
+  }
+}
@@ -14,15 +14,11 @@
 | tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | here |
 | tst-IncompleteHostnameRegExp.js:39:2:39:33 | /^(http ... om\\/)/g | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:39:2:39:33 | /^(http ... om\\/)/g | here |
 | tst-IncompleteHostnameRegExp.js:40:2:40:30 | /^https ... le.com/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:40:2:40:30 | /^https ... le.com/ | here |
-| tst-IncompleteHostnameRegExp.js:41:13:41:68 | '^http: ... e\\.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:41:13:41:68 | '^http: ... e\\.com' | here |
-| tst-IncompleteHostnameRegExp.js:41:41:41:68 | '^https ... e\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:41:13:41:68 | '^http: ... e\\.com' | here |
-| tst-IncompleteHostnameRegExp.js:42:13:42:62 | '^http[ ... \\/(.+)' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:42:13:42:62 | '^http[ ... \\/(.+)' | here |
 | tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | here |
 | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
 | tst-IncompleteHostnameRegExp.js:46:2:46:29 | /^(exam ... e.com)/ | This regular expression has an unescaped '.' before 'dev\|example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:46:2:46:29 | /^(exam ... e.com)/ | here |
-| tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | here |
-| tst-IncompleteHostnameRegExp.js:48:41:48:68 | '^https ... e\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... e\\.com' | here |
+| tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... \\\\.com' | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... \\\\.com' | here |
+| tst-IncompleteHostnameRegExp.js:48:41:48:68 | '^https ... \\\\.com' | This string, which is used as a regular expression $@, has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:68 | '^http: ... \\\\.com' | here |
 | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
 | tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | This regular expression has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | here |
 | tst-SemiAnchoredRegExp.js:66:13:66:34 | '^good. ... er.com' | This regular expression has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:66:13:66:34 | '^good. ... er.com' | here |
-| tst-SemiAnchoredRegExp.js:67:13:67:36 | '^good\\ ... r\\.com' | This regular expression has an unescaped '.' before 'com\|better.com', so it might match more hosts than expected. | tst-SemiAnchoredRegExp.js:67:13:67:36 | '^good\\ ... r\\.com' | here |
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+let regex = new RegExp('(^\s*)my-marker(\s*$)'),`
	`2`	`+ isMyMarkerText = regex.test(text);`