@@ -62,7 +62,7 @@ abstract class RegexString extends StringLiteral {
6262
6363 /**
6464 * Helper predicate for `quote`.
65- * Holds if the char at `pos` is the one-based `index`th occourence of a quote delimiter (`\Q` or `\E`)
65+ * Holds if the char at `pos` is the one-based `index`th occurrence of a quote delimiter (`\Q` or `\E`)
6666 * Result is `true` for `\Q` and `false` for `\E`.
6767 */
6868 private boolean quoteDelimiter ( int index , int pos ) {
@@ -73,7 +73,7 @@ abstract class RegexString extends StringLiteral {
7373 /** Holds if a quoted sequence is found between `start` and `end` */
7474 predicate quote ( int start , int end ) { this .quote ( start , end , _, _) }
7575
76- /** Holds if a quoted sequence is found between `start` and `end`, with ontent found between `inner_start` and `inner_end`. */
76+ /** Holds if a quoted sequence is found between `start` and `end`, with content found between `inner_start` and `inner_end`. */
7777 predicate quote ( int start , int end , int inner_start , int inner_end ) {
7878 exists ( int index |
7979 this .quoteDelimiter ( index , start ) = true and
@@ -98,7 +98,7 @@ abstract class RegexString extends StringLiteral {
9898 }
9999
100100 /**
101- * A control sequence, `\cx`
101+ * Holds if there is a control sequence, `\cx`, between `start` and `end`.
102102 * `x` may be any ascii character including special characters.
103103 */
104104 predicate controlEscape ( int start , int end ) {
@@ -107,6 +107,65 @@ abstract class RegexString extends StringLiteral {
107107 end = start + 3
108108 }
109109
110+ pragma [ inline]
111+ private predicate isOctal ( int index ) { this .getChar ( index ) = [ 0 .. 7 ] .toString ( ) } // holds if the character at `index` is one of the digits 0-7
112+
113+ /** Holds if an escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}), is found between `start` and `end`. */
114+ private predicate escapedBraces ( int start , int end ) {
115+ this .escapingChar ( start ) and
116+ this .getChar ( start + 1 ) = [ "N" , "p" , "P" , "x" ] and
117+ this .getChar ( start + 2 ) = "{" and
118+ end = min ( int i | start + 2 < i and this .getChar ( i - 1 ) = "}" ) // `end` is one past the first `}` after the opening brace
119+ }
120+
121+ /**
122+ * Holds if an escaped character is found between `start` and `end`.
123+ * Escaped characters include hex values, octal values and named escapes,
124+ * but exclude backreferences.
125+ */
126+ predicate escapedCharacter ( int start , int end ) {
127+ this .escapingChar ( start ) and
128+ not this .backreference ( start , _) and
129+ (
130+ // hex value \xhh
131+ this .getChar ( start + 1 ) = "x" and
132+ this .getChar ( start + 2 ) != "{" and
133+ end = start + 4
134+ or
135+ // octal value \0o, \0oo, or \0ooo. Max of 0377, so a third octal digit is only consumed when the first one is 0-3.
136+ this .getChar ( start + 1 ) = "0" and
137+ this .isOctal ( start + 2 ) and
138+ (
139+ if this .isOctal ( start + 3 )
140+ then
141+ if this .isOctal ( start + 4 ) and this .getChar ( start + 2 ) in [ "0" , "1" , "2" , "3" ]
142+ then end = start + 5
143+ else end = start + 4
144+ else end = start + 3
145+ )
146+ or
147+ // 16-bit hex value \uhhhh
148+ this .getChar ( start + 1 ) = "u" and end = start + 6
149+ or
150+ this .escapedBraces ( start , end ) // escapes with braces: \N{...}, \p{...}, \P{...}, \x{...}
151+ or
152+ // Boundary matchers \b, \b{g}
153+ this .getChar ( start + 1 ) = "b" and
154+ (
155+ if this .getText ( ) .substring ( start + 2 , start + 5 ) = "{g}"
156+ then end = start + 5
157+ else end = start + 2
158+ )
159+ or
160+ this .controlEscape ( start , end ) // control sequence \cx
161+ or
162+ // escape not handled above, update when adding a new case
163+ not this .getChar ( start + 1 ) in [ "x" , "0" , "u" , "p" , "P" , "N" , "b" , "c" ] and
164+ not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and // skips characters that parse as an integer (presumably digits — confirm)
165+ end = start + 2
166+ )
167+ }
168+
110169 private string nonEscapedCharAt ( int i ) {
111170 result = this .getChar ( i ) and
112171 not exists ( int x , int y | this .escapedCharacter ( x , y ) and i in [ x .. y - 1 ] ) and
@@ -128,7 +187,7 @@ abstract class RegexString extends StringLiteral {
128187
129188 /**
130189 * Holds if the character at `pos` starts a character set delimiter.
131- * Result is 1 for `[` and 0 for `]`.
190+ * Result is 1 for `[` and -1 for `]`.
132191 */
133192 private int charSetDelimiter ( int pos ) {
134193 result = 1 and this .charSetStart0 ( pos , _)
@@ -145,17 +204,14 @@ abstract class RegexString extends StringLiteral {
145204 pos = rank [ index ] ( int p | exists ( this .charSetDelimiter ( p ) ) )
146205 }
147206
148- bindingset [ x]
149- private int max_zero ( int x ) { result = max ( [ x , 0 ] ) }
150-
151207 /**
152208 * Gets the nesting depth of character classes after position `pos`,
153209 * where `pos` is the position of the `index`th character set delimiter. The depth never drops below zero.
154210 */
155211 private int charSetDepth ( int index , int pos ) {
156- index = 1 and result = max_zero ( charSetDelimiter ( index , pos ) )
212+ index = 1 and result = 0 . maximum ( this . charSetDelimiter ( index , pos ) )
157213 or
158- result = max_zero ( charSetDelimiter ( index , pos ) + charSetDepth ( index - 1 , _) )
214+ result = 0 . maximum ( this . charSetDelimiter ( index , pos ) + this . charSetDepth ( index - 1 , _) )
159215 }
160216
161217 /** Holds if a top-level character set starts between `start` and `end`. */
@@ -209,26 +265,10 @@ abstract class RegexString extends StringLiteral {
209265
210266 /** An indexed version of `charSetToken/3`: holds if the `index`th token, ordered by start position, of the character set starting at `charset_start` is found between `token_start` and `token_end`. */
211267 private predicate charSetToken ( int charset_start , int index , int token_start , int token_end ) {
212- token_start =
213- rank [ index ] ( int start , int end | this .charSetToken ( charset_start , start , end ) | start ) and
268+ token_start = rank [ index ] ( int start | this .charSetToken ( charset_start , start , _) | start ) and
214269 this .charSetToken ( charset_start , token_start , token_end )
215270 }
216271
217- /**
218- * Holds if the character set starting at `charset_start` contains either
219- * a character or a range found between `start` and `end`.
220- */
221- predicate charSetChild ( int charset_start , int start , int end ) {
222- this .charSetToken ( charset_start , start , end ) and
223- not exists ( int range_start , int range_end |
224- this .charRange ( charset_start , range_start , _, _, range_end ) and
225- range_start <= start and
226- range_end >= end
227- )
228- or
229- this .charRange ( charset_start , start , _, _, end )
230- }
231-
232272 /**
233273 * Helper predicate for `charRange`.
234274 * We can determine where character ranges end by a left to right sweep.
@@ -272,63 +312,19 @@ abstract class RegexString extends StringLiteral {
272312 )
273313 }
274314
275- pragma [ inline]
276- private predicate isOctal ( int index ) { this .getChar ( index ) = [ 0 .. 7 ] .toString ( ) }
277-
278- /** An escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}) */
279- private predicate escapedBraces ( int start , int end ) {
280- this .escapingChar ( start ) and
281- this .getChar ( start + 1 ) = [ "N" , "p" , "P" , "x" ] and
282- this .getChar ( start + 2 ) = "{" and
283- end = min ( int i | start + 2 < i and this .getChar ( i - 1 ) = "}" )
284- }
285-
286315 /**
287- * Holds if an escaped character is found between `start` and `end`.
288- * Escaped characters include hex values, octal values and named escapes,
289- * but excludes backreferences.
316+ * Holds if the character set starting at `charset_start` contains either
317+ * a character or a range found between `start` and `end`.
290318 */
291- predicate escapedCharacter ( int start , int end ) {
292- this .escapingChar ( start ) and
293- not this .backreference ( start , _) and
294- (
295- // hex value \xhh
296- this .getChar ( start + 1 ) = "x" and
297- this .getChar ( start + 2 ) != "{" and
298- end = start + 4
299- or
300- // octal value \0o, \0oo, or \0ooo. Max of 0377.
301- this .getChar ( start + 1 ) = "0" and
302- this .isOctal ( start + 2 ) and
303- (
304- if this .isOctal ( start + 3 )
305- then
306- if this .isOctal ( start + 4 ) and this .getChar ( start + 2 ) in [ "0" , "1" , "2" , "3" ]
307- then end = start + 5
308- else end = start + 4
309- else end = start + 3
310- )
311- or
312- // 16-bit hex value \uhhhh
313- this .getChar ( start + 1 ) = "u" and end = start + 6
314- or
315- this .escapedBraces ( start , end )
316- or
317- // Boundary matchers \b, \b{g}
318- this .getChar ( start + 1 ) = "b" and
319- (
320- if this .getText ( ) .substring ( start + 2 , start + 5 ) = "{g}"
321- then end = start + 5
322- else end = start + 2
323- )
324- or
325- this .controlEscape ( start , end )
326- or
327- // escape not handled above, update when adding a new case
328- not this .getChar ( start + 1 ) in [ "x" , "0" , "u" , "p" , "P" , "N" , "b" , "c" ] and
329- not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
330- end = start + 2
319+ predicate charSetChild ( int charset_start , int start , int end ) {
320+ this .charSetToken ( charset_start , start , end ) and
321+ not exists ( int range_start , int range_end |
322+ this .charRange ( charset_start , range_start , _, _, range_end ) and
323+ range_start <= start and
324+ range_end >= end
331325 )
326+ or
327+ this .charRange ( charset_start , start , _, _, end )
332328 }
333329
334330 /** Holds if `index` is inside a character set. */
@@ -871,9 +867,9 @@ abstract class RegexString extends StringLiteral {
871867 * Holds if a character is represented between `start` and `end` in the source literal.
872868 */
873869 private predicate sourceCharacter ( int start , int end ) {
874- sourceEscapedCharacter ( start , end )
870+ this . sourceEscapedCharacter ( start , end )
875871 or
876- sourceNonEscapedCharacter ( start ) and
872+ this . sourceNonEscapedCharacter ( start ) and
877873 end = start + 1 // a non-escaped character spans a single position
878874 }
879875
@@ -885,8 +881,8 @@ abstract class RegexString extends StringLiteral {
885881 */
886882 predicate sourceCharacter ( int pos , int start , int end ) {
887883 exists ( this .getChar ( pos ) ) and // `pos` must be a valid character position
888- sourceCharacter ( start , end ) and
889- start = rank [ pos + 2 ] ( int s | sourceCharacter ( s , _) )
884+ this . sourceCharacter ( start , end ) and
885+ start = rank [ pos + 2 ] ( int s | this . sourceCharacter ( s , _) ) // NOTE(review): the `+ 2` presumably covers 1-based rank plus the literal's opening quote — confirm
890886 }
891887}
892888
0 commit comments