@@ -968,4 +968,92 @@ module RegExp {
/** Holds if `flags` includes the `s` flag or is the unknown flag `?`. */
bindingset[flags]
predicate maybeDotAll(string flags) { isDotAll(flags) or flags = unknownFlag() }
971+
/**
 * Holds if `term` is anchored on both ends.
 *
 * This is the case when `term` is a sequence whose first child is `^` and whose
 * last child is `$`, a group with a fully anchored child, or an alternation in
 * which every disjunct is fully anchored.
 */
predicate isFullyAnchoredTerm(RegExpTerm term) {
  // A sequence of the form `^...$`.
  term.(RegExpSequence).getChild(0) instanceof RegExpCaret and
  term.(RegExpSequence).getLastChild() instanceof RegExpDollar
  or
  // A group wrapping a fully anchored term.
  exists(RegExpGroup group | group = term | isFullyAnchoredTerm(group.getAChild()))
  or
  // An alternation: all `getNumChild()` disjuncts must be fully anchored.
  isFullyAnchoredAlt(term, term.getNumChild())
}
983+
/**
 * Holds if the disjuncts of `term` at indices `0` through `i - 1`
 * (that is, its first `i` disjuncts) are all fully anchored.
 */
private predicate isFullyAnchoredAlt(RegExpAlt term, int i) {
  // Base case: just the first disjunct.
  i = 1 and
  isFullyAnchoredTerm(term.getChild(0))
  or
  // Inductive step: the `i`th disjunct is anchored, and so are the `i - 1` before it.
  isFullyAnchoredTerm(term.getChild(i - 1)) and
  isFullyAnchoredAlt(term, i - 1)
}
991+
/**
 * Holds if `term` matches any character except for explicitly listed exceptions.
 *
 * For example, holds for `.`, `[^<>]`, or `\W`, but not for `[a-z]`, `\w`, or `[^\W\S]`.
 */
predicate isWildcardLike(RegExpTerm term) {
  // `.`
  term instanceof RegExpDot
  or
  // An upper-case character class escape such as `\W`.
  term.(RegExpCharacterClassEscape).getValue().isUppercase()
  or
  exists(RegExpCharacterClass cls | cls = term |
    // An inverted class with no upper-case escape among its children, such as `[^a-z]`.
    cls.isInverted() and
    not cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
    or
    // A non-inverted class with an upper-case escape among its children, such as `[\W]`.
    not cls.isInverted() and
    cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
  )
}
1012+
/**
 * Holds if the root term `term` is a generic sanitizer for strings that match it
 * (when `outcome` is true) or for strings that do not match it (when `outcome` is false).
 *
 * Specifically, whitelisting regexps such as `^(foo|bar)$` sanitize matches in the true case.
 * Inverted character classes such as `[^a-z]` or `\W` sanitize matches in the false case.
 */
predicate isGenericRegExpSanitizer(RegExpTerm term, boolean outcome) {
  term.isRootTerm() and
  (
    // Fully anchored, and neither `term` nor any of its descendants is wildcard-like.
    outcome = true and
    isFullyAnchoredTerm(term) and
    not exists(RegExpTerm descendant | descendant = term.getAChild*() |
      isWildcardLike(descendant)
    )
    or
    // `term` (or a child of it, if `term` is a group) is wildcard-like itself
    // or is an alternation with a wildcard-like disjunct.
    outcome = false and
    exists(RegExpTerm root, RegExpTerm candidate |
      (root = term or root = term.(RegExpGroup).getAChild()) and
      (candidate = root or candidate = root.(RegExpAlt).getAChild()) and
      isWildcardLike(candidate)
    )
  )
}
1039+
/**
 * Gets the AST of a regular expression object that can flow to `node`.
 *
 * The result is the term of a `RegExpCreationNode` that has a reference flowing to `node`.
 */
RegExpTerm getRegExpObjectFromNode(DataFlow::Node node) {
  exists(DataFlow::RegExpCreationNode creation |
    result = creation.getRegExpTerm() and
    creation.getAReference().flowsTo(node)
  )
}
1049+
/**
 * Gets the AST of a regular expression that can flow to `node`.
 *
 * This covers string literals interpreted as regular expressions
 * as well as `RegExp` objects.
 */
RegExpTerm getRegExpFromNode(DataFlow::Node node) {
  // `node` is itself a string literal that parses as a regular expression.
  result = node.asExpr().(StringLiteral).asRegExp()
  or
  // A `RegExp` object flows to `node`.
  result = getRegExpObjectFromNode(node)
}
9711059}
0 commit comments