Skip to content

Commit e66691a

Browse files
authored
Merge pull request #551 from asger-semmle/js-extractor-shebang
Approved by xiemaisi
2 parents 31ac33e + 623a80f commit e66691a

File tree

7 files changed

+310
-0
lines changed

7 files changed

+310
-0
lines changed

javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import java.io.FileInputStream;
66
import java.io.FileReader;
77
import java.io.IOException;
8+
import java.nio.charset.Charset;
89
import java.util.LinkedHashSet;
910
import java.util.Set;
1011
import java.util.regex.Pattern;
@@ -38,6 +39,11 @@ public class FileExtractor {
3839
*/
3940
public static final Pattern JSON_OBJECT_START = Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");
4041

42+
/**
43+
* The charset for decoding UTF-8 strings.
44+
*/
45+
private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
46+
4147
/**
4248
* Information about supported file types.
4349
*/
@@ -169,6 +175,11 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
169175
if (isXml(bytes, length))
170176
return true;
171177

178+
// Avoid files with an unrecognized shebang header.
179+
if (hasUnrecognizedShebang(bytes, length)) {
180+
return true;
181+
}
182+
172183
return false;
173184
} catch (IOException e) {
174185
Exceptions.ignore(e, "Let extractor handle this one.");
@@ -249,6 +260,38 @@ private boolean hasUnprintableUtf8(byte[] bytes, int length) {
249260
return false;
250261
}
251262

263+
/**
264+
* Returns true if the byte sequence starts with a shebang line that is not
265+
* recognized as a JavaScript interpreter.
266+
*/
267+
private boolean hasUnrecognizedShebang(byte[] bytes, int length) {
268+
// Shebangs preceded by a BOM aren't recognized in UNIX, but the BOM might only
269+
// be present in the source file, to be stripped out in the build process.
270+
int startIndex = skipBOM(bytes, length);
271+
if (startIndex + 2 >= length) return false;
272+
if (bytes[startIndex] != '#' || bytes[startIndex + 1] != '!') {
273+
return false;
274+
}
275+
int endOfLine = -1;
276+
for (int i = startIndex; i < length; ++i) {
277+
if (bytes[i] == '\r' || bytes[i] == '\n') {
278+
endOfLine = i;
279+
break;
280+
}
281+
}
282+
if (endOfLine == -1) {
283+
// The shebang is either very long or there are no other lines in the file.
284+
// Treat this as unrecognized.
285+
return true;
286+
}
287+
// Extract the shebang text
288+
int startOfText = startIndex + "#!".length();
289+
int lengthOfText = endOfLine - startOfText;
290+
String text = new String(bytes, startOfText, lengthOfText, UTF8_CHARSET);
291+
// Check if the shebang is a recognized JavaScript intepreter.
292+
return !NODE_INVOCATION.matcher(text).find();
293+
}
294+
252295
@Override
253296
public IExtractor mkExtractor(ExtractorConfig config, ExtractorState state) {
254297
return new TypeScriptExtractor(config, state.getTypeScriptParser());
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env perl
2+
3+
use strict;
4+
5+
exit 0;
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/usr/bin/env node
2+
interface Foo {
3+
x: number;
4+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
interface Foo {
2+
x: number;
3+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"typescript": true
3+
}
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#10000=@"/typescript-with-shebang.ts;sourcefile"
2+
files(#10000,"/typescript-with-shebang.ts","typescript-with-shebang","ts",0)
3+
#10001=@"/;folder"
4+
folders(#10001,"/","")
5+
containerparent(#10001,#10000)
6+
#10002=@"loc,{#10000},0,0,0,0"
7+
locations_default(#10002,#10000,0,0,0,0)
8+
hasLocation(#10000,#10002)
9+
#20000=@"global_scope"
10+
scopes(#20000,0)
11+
#20001=@"script;{#10000},1,1"
12+
toplevels(#20001,0)
13+
#20002=@"loc,{#10000},1,1,5,0"
14+
locations_default(#20002,#10000,1,1,5,0)
15+
hasLocation(#20001,#20002)
16+
#20003=@"local_type_name;{Foo};{#20000}"
17+
local_type_names(#20003,"Foo",#20000)
18+
#20004=*
19+
stmts(#20004,34,#20001,0,"#!/usr/ ... mber;\n}")
20+
#20005=@"loc,{#10000},1,1,4,1"
21+
locations_default(#20005,#10000,1,1,4,1)
22+
hasLocation(#20004,#20005)
23+
stmtContainers(#20004,#20001)
24+
#20006=*
25+
typeexprs(#20006,1,#20004,0,"Foo")
26+
#20007=@"loc,{#10000},2,11,2,13"
27+
locations_default(#20007,#10000,2,11,2,13)
28+
hasLocation(#20006,#20007)
29+
enclosingStmt(#20006,#20004)
30+
exprContainers(#20006,#20001)
31+
literals("Foo","Foo",#20006)
32+
typedecl(#20006,#20003)
33+
#20008=*
34+
properties(#20008,#20004,2,8,"x: number;")
35+
#20009=@"loc,{#10000},3,3,3,12"
36+
locations_default(#20009,#10000,3,3,3,12)
37+
hasLocation(#20008,#20009)
38+
#20010=*
39+
exprs(#20010,0,#20008,0,"x")
40+
#20011=@"loc,{#10000},3,3,3,3"
41+
locations_default(#20011,#10000,3,3,3,3)
42+
hasLocation(#20010,#20011)
43+
enclosingStmt(#20010,#20004)
44+
exprContainers(#20010,#20001)
45+
literals("x","x",#20010)
46+
isAbstractMember(#20008)
47+
#20012=*
48+
typeexprs(#20012,2,#20008,2,"number")
49+
#20013=@"loc,{#10000},3,6,3,11"
50+
locations_default(#20013,#10000,3,6,3,11)
51+
hasLocation(#20012,#20013)
52+
enclosingStmt(#20012,#20004)
53+
exprContainers(#20012,#20001)
54+
literals("number","number",#20012)
55+
#20014=*
56+
lines(#20014,#20001,"#!/usr/bin/env node","
57+
")
58+
#20015=@"loc,{#10000},1,1,1,19"
59+
locations_default(#20015,#10000,1,1,1,19)
60+
hasLocation(#20014,#20015)
61+
#20016=*
62+
lines(#20016,#20001,"interface Foo {","
63+
")
64+
#20017=@"loc,{#10000},2,1,2,15"
65+
locations_default(#20017,#10000,2,1,2,15)
66+
hasLocation(#20016,#20017)
67+
#20018=*
68+
lines(#20018,#20001," x: number;","
69+
")
70+
#20019=@"loc,{#10000},3,1,3,12"
71+
locations_default(#20019,#10000,3,1,3,12)
72+
hasLocation(#20018,#20019)
73+
indentation(#10000,3," ",2)
74+
#20020=*
75+
lines(#20020,#20001,"}","
76+
")
77+
#20021=@"loc,{#10000},4,1,4,1"
78+
locations_default(#20021,#10000,4,1,4,1)
79+
hasLocation(#20020,#20021)
80+
numlines(#20001,4,3,0)
81+
#20022=*
82+
tokeninfo(#20022,7,#20001,0,"interface")
83+
#20023=@"loc,{#10000},2,1,2,9"
84+
locations_default(#20023,#10000,2,1,2,9)
85+
hasLocation(#20022,#20023)
86+
#20024=*
87+
tokeninfo(#20024,6,#20001,1,"Foo")
88+
hasLocation(#20024,#20007)
89+
#20025=*
90+
tokeninfo(#20025,8,#20001,2,"{")
91+
#20026=@"loc,{#10000},2,15,2,15"
92+
locations_default(#20026,#10000,2,15,2,15)
93+
hasLocation(#20025,#20026)
94+
#20027=*
95+
tokeninfo(#20027,6,#20001,3,"x")
96+
hasLocation(#20027,#20011)
97+
#20028=*
98+
tokeninfo(#20028,8,#20001,4,":")
99+
#20029=@"loc,{#10000},3,4,3,4"
100+
locations_default(#20029,#10000,3,4,3,4)
101+
hasLocation(#20028,#20029)
102+
#20030=*
103+
tokeninfo(#20030,7,#20001,5,"number")
104+
hasLocation(#20030,#20013)
105+
#20031=*
106+
tokeninfo(#20031,8,#20001,6,";")
107+
#20032=@"loc,{#10000},3,12,3,12"
108+
locations_default(#20032,#10000,3,12,3,12)
109+
hasLocation(#20031,#20032)
110+
#20033=*
111+
tokeninfo(#20033,8,#20001,7,"}")
112+
hasLocation(#20033,#20021)
113+
#20034=*
114+
tokeninfo(#20034,0,#20001,8,"")
115+
#20035=@"loc,{#10000},5,1,5,0"
116+
locations_default(#20035,#10000,5,1,5,0)
117+
hasLocation(#20034,#20035)
118+
#20036=*
119+
entry_cfg_node(#20036,#20001)
120+
#20037=@"loc,{#10000},1,1,1,0"
121+
locations_default(#20037,#10000,1,1,1,0)
122+
hasLocation(#20036,#20037)
123+
#20038=*
124+
exit_cfg_node(#20038,#20001)
125+
hasLocation(#20038,#20035)
126+
successor(#20004,#20038)
127+
successor(#20036,#20004)
128+
numlines(#10000,4,3,0)
129+
filetype(#10000,"typescript")
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#10000=@"/typescript.ts;sourcefile"
2+
files(#10000,"/typescript.ts","typescript","ts",0)
3+
#10001=@"/;folder"
4+
folders(#10001,"/","")
5+
containerparent(#10001,#10000)
6+
#10002=@"loc,{#10000},0,0,0,0"
7+
locations_default(#10002,#10000,0,0,0,0)
8+
hasLocation(#10000,#10002)
9+
#20000=@"global_scope"
10+
scopes(#20000,0)
11+
#20001=@"script;{#10000},1,1"
12+
toplevels(#20001,0)
13+
#20002=@"loc,{#10000},1,1,4,0"
14+
locations_default(#20002,#10000,1,1,4,0)
15+
hasLocation(#20001,#20002)
16+
#20003=@"local_type_name;{Foo};{#20000}"
17+
local_type_names(#20003,"Foo",#20000)
18+
#20004=*
19+
stmts(#20004,34,#20001,0,"interfa ... mber;\n}")
20+
#20005=@"loc,{#10000},1,1,3,1"
21+
locations_default(#20005,#10000,1,1,3,1)
22+
hasLocation(#20004,#20005)
23+
stmtContainers(#20004,#20001)
24+
#20006=*
25+
typeexprs(#20006,1,#20004,0,"Foo")
26+
#20007=@"loc,{#10000},1,11,1,13"
27+
locations_default(#20007,#10000,1,11,1,13)
28+
hasLocation(#20006,#20007)
29+
enclosingStmt(#20006,#20004)
30+
exprContainers(#20006,#20001)
31+
literals("Foo","Foo",#20006)
32+
typedecl(#20006,#20003)
33+
#20008=*
34+
properties(#20008,#20004,2,8,"x: number;")
35+
#20009=@"loc,{#10000},2,3,2,12"
36+
locations_default(#20009,#10000,2,3,2,12)
37+
hasLocation(#20008,#20009)
38+
#20010=*
39+
exprs(#20010,0,#20008,0,"x")
40+
#20011=@"loc,{#10000},2,3,2,3"
41+
locations_default(#20011,#10000,2,3,2,3)
42+
hasLocation(#20010,#20011)
43+
enclosingStmt(#20010,#20004)
44+
exprContainers(#20010,#20001)
45+
literals("x","x",#20010)
46+
isAbstractMember(#20008)
47+
#20012=*
48+
typeexprs(#20012,2,#20008,2,"number")
49+
#20013=@"loc,{#10000},2,6,2,11"
50+
locations_default(#20013,#10000,2,6,2,11)
51+
hasLocation(#20012,#20013)
52+
enclosingStmt(#20012,#20004)
53+
exprContainers(#20012,#20001)
54+
literals("number","number",#20012)
55+
#20014=*
56+
lines(#20014,#20001,"interface Foo {","
57+
")
58+
#20015=@"loc,{#10000},1,1,1,15"
59+
locations_default(#20015,#10000,1,1,1,15)
60+
hasLocation(#20014,#20015)
61+
#20016=*
62+
lines(#20016,#20001," x: number;","
63+
")
64+
#20017=@"loc,{#10000},2,1,2,12"
65+
locations_default(#20017,#10000,2,1,2,12)
66+
hasLocation(#20016,#20017)
67+
indentation(#10000,2," ",2)
68+
#20018=*
69+
lines(#20018,#20001,"}","
70+
")
71+
#20019=@"loc,{#10000},3,1,3,1"
72+
locations_default(#20019,#10000,3,1,3,1)
73+
hasLocation(#20018,#20019)
74+
numlines(#20001,3,3,0)
75+
#20020=*
76+
tokeninfo(#20020,7,#20001,0,"interface")
77+
#20021=@"loc,{#10000},1,1,1,9"
78+
locations_default(#20021,#10000,1,1,1,9)
79+
hasLocation(#20020,#20021)
80+
#20022=*
81+
tokeninfo(#20022,6,#20001,1,"Foo")
82+
hasLocation(#20022,#20007)
83+
#20023=*
84+
tokeninfo(#20023,8,#20001,2,"{")
85+
#20024=@"loc,{#10000},1,15,1,15"
86+
locations_default(#20024,#10000,1,15,1,15)
87+
hasLocation(#20023,#20024)
88+
#20025=*
89+
tokeninfo(#20025,6,#20001,3,"x")
90+
hasLocation(#20025,#20011)
91+
#20026=*
92+
tokeninfo(#20026,8,#20001,4,":")
93+
#20027=@"loc,{#10000},2,4,2,4"
94+
locations_default(#20027,#10000,2,4,2,4)
95+
hasLocation(#20026,#20027)
96+
#20028=*
97+
tokeninfo(#20028,7,#20001,5,"number")
98+
hasLocation(#20028,#20013)
99+
#20029=*
100+
tokeninfo(#20029,8,#20001,6,";")
101+
#20030=@"loc,{#10000},2,12,2,12"
102+
locations_default(#20030,#10000,2,12,2,12)
103+
hasLocation(#20029,#20030)
104+
#20031=*
105+
tokeninfo(#20031,8,#20001,7,"}")
106+
hasLocation(#20031,#20019)
107+
#20032=*
108+
tokeninfo(#20032,0,#20001,8,"")
109+
#20033=@"loc,{#10000},4,1,4,0"
110+
locations_default(#20033,#10000,4,1,4,0)
111+
hasLocation(#20032,#20033)
112+
#20034=*
113+
entry_cfg_node(#20034,#20001)
114+
#20035=@"loc,{#10000},1,1,1,0"
115+
locations_default(#20035,#10000,1,1,1,0)
116+
hasLocation(#20034,#20035)
117+
#20036=*
118+
exit_cfg_node(#20036,#20001)
119+
hasLocation(#20036,#20033)
120+
successor(#20004,#20036)
121+
successor(#20034,#20004)
122+
numlines(#10000,3,3,0)
123+
filetype(#10000,"typescript")

0 commit comments

Comments
 (0)