Skip to content

Commit fe6de2f

Browse files
committed
Added support for character class union in regex processing
1 parent 1e05f32 commit fe6de2f

File tree

6 files changed

+325
-257
lines changed

6 files changed

+325
-257
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package com.semmle.js.ast.regexp;
2+
3+
import com.semmle.js.ast.SourceLocation;
4+
import java.util.List;
5+
6+
public class CharacterClassUnion extends RegExpTerm {
7+
private final List<RegExpTerm> union;
8+
9+
public CharacterClassUnion(SourceLocation loc, List<RegExpTerm> union) {
10+
super(loc, "CharacterClassUnion");
11+
this.union = union;
12+
}
13+
14+
@Override
15+
public void accept(Visitor v) {
16+
v.visit(this);
17+
}
18+
19+
public List<RegExpTerm> getUnion() {
20+
return union;
21+
}
22+
}

javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,6 @@ public interface Visitor {
6767
public void visit(CharacterClassIntersection nd);
6868

6969
public void visit(CharacterClassSubtraction nd);
70+
71+
public void visit(CharacterClassUnion nd);
7072
}

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1414
import com.semmle.js.ast.regexp.CharacterClassRange;
1515
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
16+
import com.semmle.js.ast.regexp.CharacterClassUnion;
1617
import com.semmle.js.ast.regexp.Constant;
1718
import com.semmle.js.ast.regexp.ControlEscape;
1819
import com.semmle.js.ast.regexp.ControlLetter;
@@ -98,6 +99,7 @@ public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
9899
termkinds.put("CharacterClassQuotedString", 28);
99100
termkinds.put("CharacterClassIntersection", 29);
100101
termkinds.put("CharacterClassSubtraction", 30);
102+
termkinds.put("CharacterClassUnion", 31);
101103
}
102104

103105
private static final String[] errmsgs =
@@ -372,6 +374,14 @@ public void visit(CharacterClassSubtraction nd) {
372374
for (RegExpTerm element : nd.getSubtraction())
373375
visit(element, lbl, i++);
374376
}
377+
378+
@Override
379+
public void visit(CharacterClassUnion nd) {
380+
Label lbl = extractTerm(nd, parent, idx);
381+
int i = 0;
382+
for (RegExpTerm element : nd.getUnion())
383+
visit(element, lbl, i++);
384+
}
375385
}
376386

377387
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1010
import com.semmle.js.ast.regexp.CharacterClassRange;
1111
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
12+
import com.semmle.js.ast.regexp.CharacterClassUnion;
1213
import com.semmle.js.ast.regexp.Constant;
1314
import com.semmle.js.ast.regexp.ControlEscape;
1415
import com.semmle.js.ast.regexp.ControlLetter;
@@ -568,6 +569,7 @@ private enum CharacterClassType {
568569
STANDARD,
569570
INTERSECTION,
570571
SUBTRACTION,
572+
UNION
571573
}
572574

573575
// ECMA 2024 `v` flag allows nested character classes.
@@ -599,12 +601,26 @@ else if (lookahead("--")) {
599601
}
600602
}
601603

604+
boolean containsComplex = elements.stream().anyMatch(term -> term instanceof UnicodePropertyEscape ||
605+
term instanceof CharacterClassQuotedString ||
606+
term instanceof CharacterClass);
607+
608+
// Set type to UNION only if:
609+
// 1. We haven't already determined a specific type (intersection/subtraction)
610+
// 2. We have more than one element
611+
// 3. We have at least one complex element (i.e. a nested CharacterClass, a CharacterClassQuotedString, or a UnicodePropertyEscape)
612+
if (containsComplex && classType == CharacterClassType.STANDARD && elements.size() > 1) {
613+
classType = CharacterClassType.UNION;
614+
}
615+
602616
// Create appropriate RegExpTerm based on the detected class type
603617
switch (classType) {
604618
case INTERSECTION:
605619
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassIntersection(loc, elements)), inverted));
606620
case SUBTRACTION:
607621
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassSubtraction(loc, elements)), inverted));
622+
case UNION:
623+
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassUnion(loc, elements)), inverted));
608624
case STANDARD:
609625
default:
610626
return this.finishTerm(new CharacterClass(loc, elements, inverted));

javascript/extractor/tests/es2024/output/trap/regex_nested_character_class.js.trap

Lines changed: 57 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -137,75 +137,81 @@ regexpterm(#20042,23,#20041,0,"[ [] [ [] [] ] ]")
137137
locations_default(#20043,#10000,3,2,3,17)
138138
hasLocation(#20042,#20043)
139139
#20044=*
140-
regexpterm(#20044,14,#20042,0," ")
141-
#20045=@"loc,{#10000},3,3,3,3"
142-
locations_default(#20045,#10000,3,3,3,3)
143-
hasLocation(#20044,#20045)
144-
regexp_const_value(#20044," ")
145-
#20046=*
146-
regexpterm(#20046,23,#20042,1,"[]")
147-
#20047=@"loc,{#10000},3,4,3,5"
148-
locations_default(#20047,#10000,3,4,3,5)
149-
hasLocation(#20046,#20047)
150-
#20048=*
151-
regexpterm(#20048,14,#20042,2," ")
152-
#20049=@"loc,{#10000},3,6,3,6"
153-
locations_default(#20049,#10000,3,6,3,6)
154-
hasLocation(#20048,#20049)
155-
regexp_const_value(#20048," ")
156-
#20050=*
157-
regexpterm(#20050,23,#20042,3,"[ [] [] ]")
158-
#20051=@"loc,{#10000},3,7,3,15"
159-
locations_default(#20051,#10000,3,7,3,15)
160-
hasLocation(#20050,#20051)
161-
#20052=*
162-
regexpterm(#20052,14,#20050,0," ")
163-
#20053=@"loc,{#10000},3,8,3,8"
164-
locations_default(#20053,#10000,3,8,3,8)
165-
hasLocation(#20052,#20053)
166-
regexp_const_value(#20052," ")
140+
regexpterm(#20044,31,#20042,0,"[ [] [ [] [] ] ]")
141+
hasLocation(#20044,#20043)
142+
#20045=*
143+
regexpterm(#20045,14,#20044,0," ")
144+
#20046=@"loc,{#10000},3,3,3,3"
145+
locations_default(#20046,#10000,3,3,3,3)
146+
hasLocation(#20045,#20046)
147+
regexp_const_value(#20045," ")
148+
#20047=*
149+
regexpterm(#20047,23,#20044,1,"[]")
150+
#20048=@"loc,{#10000},3,4,3,5"
151+
locations_default(#20048,#10000,3,4,3,5)
152+
hasLocation(#20047,#20048)
153+
#20049=*
154+
regexpterm(#20049,14,#20044,2," ")
155+
#20050=@"loc,{#10000},3,6,3,6"
156+
locations_default(#20050,#10000,3,6,3,6)
157+
hasLocation(#20049,#20050)
158+
regexp_const_value(#20049," ")
159+
#20051=*
160+
regexpterm(#20051,23,#20044,3,"[ [] [] ]")
161+
#20052=@"loc,{#10000},3,7,3,15"
162+
locations_default(#20052,#10000,3,7,3,15)
163+
hasLocation(#20051,#20052)
164+
#20053=*
165+
regexpterm(#20053,31,#20051,0,"[ [] [] ]")
166+
hasLocation(#20053,#20052)
167167
#20054=*
168-
regexpterm(#20054,23,#20050,1,"[]")
169-
#20055=@"loc,{#10000},3,9,3,10"
170-
locations_default(#20055,#10000,3,9,3,10)
168+
regexpterm(#20054,14,#20053,0," ")
169+
#20055=@"loc,{#10000},3,8,3,8"
170+
locations_default(#20055,#10000,3,8,3,8)
171171
hasLocation(#20054,#20055)
172+
regexp_const_value(#20054," ")
172173
#20056=*
173-
regexpterm(#20056,14,#20050,2," ")
174-
#20057=@"loc,{#10000},3,11,3,11"
175-
locations_default(#20057,#10000,3,11,3,11)
174+
regexpterm(#20056,23,#20053,1,"[]")
175+
#20057=@"loc,{#10000},3,9,3,10"
176+
locations_default(#20057,#10000,3,9,3,10)
176177
hasLocation(#20056,#20057)
177-
regexp_const_value(#20056," ")
178178
#20058=*
179-
regexpterm(#20058,23,#20050,3,"[]")
180-
#20059=@"loc,{#10000},3,12,3,13"
181-
locations_default(#20059,#10000,3,12,3,13)
179+
regexpterm(#20058,14,#20053,2," ")
180+
#20059=@"loc,{#10000},3,11,3,11"
181+
locations_default(#20059,#10000,3,11,3,11)
182182
hasLocation(#20058,#20059)
183+
regexp_const_value(#20058," ")
183184
#20060=*
184-
regexpterm(#20060,14,#20050,4," ")
185-
#20061=@"loc,{#10000},3,14,3,14"
186-
locations_default(#20061,#10000,3,14,3,14)
185+
regexpterm(#20060,23,#20053,3,"[]")
186+
#20061=@"loc,{#10000},3,12,3,13"
187+
locations_default(#20061,#10000,3,12,3,13)
187188
hasLocation(#20060,#20061)
188-
regexp_const_value(#20060," ")
189189
#20062=*
190-
regexpterm(#20062,14,#20042,4," ")
191-
#20063=@"loc,{#10000},3,16,3,16"
192-
locations_default(#20063,#10000,3,16,3,16)
190+
regexpterm(#20062,14,#20053,4," ")
191+
#20063=@"loc,{#10000},3,14,3,14"
192+
locations_default(#20063,#10000,3,14,3,14)
193193
hasLocation(#20062,#20063)
194194
regexp_const_value(#20062," ")
195195
#20064=*
196-
entry_cfg_node(#20064,#20001)
197-
#20065=@"loc,{#10000},1,1,1,0"
198-
locations_default(#20065,#10000,1,1,1,0)
196+
regexpterm(#20064,14,#20044,4," ")
197+
#20065=@"loc,{#10000},3,16,3,16"
198+
locations_default(#20065,#10000,3,16,3,16)
199199
hasLocation(#20064,#20065)
200+
regexp_const_value(#20064," ")
200201
#20066=*
201-
exit_cfg_node(#20066,#20001)
202-
hasLocation(#20066,#20023)
202+
entry_cfg_node(#20066,#20001)
203+
#20067=@"loc,{#10000},1,1,1,0"
204+
locations_default(#20067,#10000,1,1,1,0)
205+
hasLocation(#20066,#20067)
206+
#20068=*
207+
exit_cfg_node(#20068,#20001)
208+
hasLocation(#20068,#20023)
203209
successor(#20040,#20041)
204-
successor(#20041,#20066)
210+
successor(#20041,#20068)
205211
successor(#20032,#20033)
206212
successor(#20033,#20040)
207213
successor(#20025,#20027)
208214
successor(#20027,#20032)
209-
successor(#20064,#20025)
215+
successor(#20066,#20025)
210216
numlines(#10000,3,3,1)
211217
filetype(#10000,"javascript")

0 commit comments

Comments
 (0)