|
2 | 2 |
|
3 | 3 | import python |
4 | 4 | private import semmle.python.regex |
| 5 | +private import codeql.regex.nfa.NfaUtils as NfaUtils |
| 6 | +private import codeql.regex.RegexTreeView |
| 7 | +// Export the implementation both under the name `RegexTreeView` and directly in the top-level scope. |
| 8 | +import Impl as RegexTreeView |
5 | 9 | import Impl |
6 | 10 |
|
7 | 11 | /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ |
@@ -52,8 +56,34 @@ private newtype TRegExpParent = |
52 | 56 | /** A back reference */ |
53 | 57 | TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) } |
54 | 58 |
|
| 59 | +pragma[nomagic] |
| 60 | +private int seqChildEnd(Regex re, int start, int end, int i) { |
| 61 | + result = seqChild(re, start, end, i).getEnd() |
| 62 | +} |
| 63 | + |
| 64 | +// moved out so we can use it in the charpred |
| 65 | +private RegExpTerm seqChild(Regex re, int start, int end, int i) { |
| 66 | + re.sequence(start, end) and |
| 67 | + ( |
| 68 | + i = 0 and |
| 69 | + result.getRegex() = re and |
| 70 | + result.getStart() = start and |
| 71 | + exists(int itemEnd | |
| 72 | + re.item(start, itemEnd) and |
| 73 | + result.getEnd() = itemEnd |
| 74 | + ) |
| 75 | + or |
| 76 | + i > 0 and |
| 77 | + result.getRegex() = re and |
| 78 | + exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | |
| 79 | + result.getStart() = itemStart and |
| 80 | + re.item(itemStart, result.getEnd()) |
| 81 | + ) |
| 82 | + ) |
| 83 | +} |
| 84 | + |
55 | 85 | /** An implementation that satisfies the RegexTreeView signature. */ |
56 | | -module Impl { |
| 86 | +module Impl implements RegexTreeViewSig { |
57 | 87 | /** |
58 | 88 | * An element containing a regular expression term, that is, either |
59 | 89 | * a string literal (parsed as a regular expression) |
@@ -391,32 +421,6 @@ module Impl { |
391 | 421 | override string getPrimaryQLClass() { result = "RegExpSequence" } |
392 | 422 | } |
393 | 423 |
|
394 | | - pragma[nomagic] |
395 | | - private int seqChildEnd(Regex re, int start, int end, int i) { |
396 | | - result = seqChild(re, start, end, i).getEnd() |
397 | | - } |
398 | | - |
399 | | - // moved out so we can use it in the charpred |
400 | | - private RegExpTerm seqChild(Regex re, int start, int end, int i) { |
401 | | - re.sequence(start, end) and |
402 | | - ( |
403 | | - i = 0 and |
404 | | - result.getRegex() = re and |
405 | | - result.getStart() = start and |
406 | | - exists(int itemEnd | |
407 | | - re.item(start, itemEnd) and |
408 | | - result.getEnd() = itemEnd |
409 | | - ) |
410 | | - or |
411 | | - i > 0 and |
412 | | - result.getRegex() = re and |
413 | | - exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | |
414 | | - result.getStart() = itemStart and |
415 | | - re.item(itemStart, result.getEnd()) |
416 | | - ) |
417 | | - ) |
418 | | - } |
419 | | - |
420 | 424 | /** |
421 | 425 | * An alternative term, that is, a term of the form `a|b`. |
422 | 426 | * |
@@ -1030,4 +1034,62 @@ module Impl { |
1030 | 1034 |
|
1031 | 1035 | override string getPrimaryQLClass() { result = "RegExpBackRef" } |
1032 | 1036 | } |
| 1037 | + |
| 1038 | + class Top = RegExpParent; |
| 1039 | + |
| 1040 | + /** |
| 1041 | + * Holds if `term` is an escape class representing e.g. `\d`. |
| 1042 | + * `clazz` is which character class it represents, e.g. "d" for `\d`. |
| 1043 | + */ |
| 1044 | + predicate isEscapeClass(RegExpTerm term, string clazz) { |
| 1045 | + exists(RegExpCharacterClassEscape escape | term = escape | escape.getValue() = clazz) |
| 1046 | + } |
| 1047 | + |
| 1048 | + /** |
| 1049 | + * Holds if `term` is a possessive quantifier. |
| 1050 | + * This never holds for Python, but the predicate is used by the shared library. NOTE(review): the original rationale was that Python's regexes do not support possessive quantifiers; Python 3.11 added them, so confirm whether this should be revisited. |
| 1051 | + */ |
| 1052 | + predicate isPossessive(RegExpQuantifier term) { none() } |
| 1053 | + |
| 1054 | + /** |
| 1055 | + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. |
| 1056 | + * Not yet implemented for Python, so this currently holds for every term. |
| 1057 | + */ |
| 1058 | + predicate matchesAnyPrefix(RegExpTerm term) { any() } |
| 1059 | + |
| 1060 | + /** |
| 1061 | + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. |
| 1062 | + * Not yet implemented for Python, so this currently holds for every term. |
| 1063 | + */ |
| 1064 | + predicate matchesAnySuffix(RegExpTerm term) { any() } |
| 1065 | + |
| 1066 | + /** |
| 1067 | + * Holds if the regular expression should not be considered. |
| 1068 | + * |
| 1069 | + * We make the pragmatic performance optimization to ignore regular expressions in files |
| 1070 | + * that do not belong to the project code (such as installed dependencies). |
| 1071 | + */ |
| 1072 | + predicate isExcluded(RegExpParent parent) { |
| 1073 | + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) |
| 1074 | + or |
| 1075 | + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so |
| 1076 | + // we explicitly exclude regexes with more than 10 such occurrences. |
| 1077 | + count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 |
| 1078 | + } |
| 1079 | + |
| 1080 | + /** |
| 1081 | + * Holds if `root` has the `i` flag for case-insensitive matching. |
| 1082 | + */ |
| 1083 | + predicate isIgnoreCase(RegExpTerm root) { |
| 1084 | + root.isRootTerm() and |
| 1085 | + root.getLiteral().isIgnoreCase() |
| 1086 | + } |
| 1087 | + |
| 1088 | + /** |
| 1089 | + * Holds if `root` has the `s` flag (DOTALL), which makes `.` also match newline characters. |
| 1090 | + */ |
| 1091 | + predicate isDotAll(RegExpTerm root) { |
| 1092 | + root.isRootTerm() and |
| 1093 | + root.getLiteral().isDotAll() |
| 1094 | + } |
1033 | 1095 | } |
0 commit comments