|
1 | 1 | package com.semmle.js.extractor; |
2 | 2 |
|
3 | | -import com.semmle.util.data.StringUtil; |
4 | | -import com.semmle.util.exception.CatastrophicError; |
5 | | -import com.semmle.util.exception.UserError; |
6 | | -import com.semmle.util.locations.LineTable; |
7 | | -import com.semmle.util.trap.TrapWriter; |
8 | | -import com.semmle.util.trap.TrapWriter.Label; |
9 | | -import com.semmle.util.trap.TrapWriter.Table; |
10 | | - |
11 | 3 | import java.util.Collections; |
12 | 4 |
|
13 | | -import org.yaml.snakeyaml.composer.Composer; |
14 | | -import org.yaml.snakeyaml.error.Mark; |
15 | | -import org.yaml.snakeyaml.error.MarkedYAMLException; |
16 | | -import org.yaml.snakeyaml.events.AliasEvent; |
17 | | -import org.yaml.snakeyaml.events.Event; |
18 | | -import org.yaml.snakeyaml.events.MappingStartEvent; |
19 | | -import org.yaml.snakeyaml.events.NodeEvent; |
20 | | -import org.yaml.snakeyaml.events.ScalarEvent; |
21 | | -import org.yaml.snakeyaml.events.SequenceStartEvent; |
22 | | -import org.yaml.snakeyaml.nodes.NodeId; |
23 | | -import org.yaml.snakeyaml.parser.Parser; |
24 | | -import org.yaml.snakeyaml.parser.ParserImpl; |
25 | | -import org.yaml.snakeyaml.reader.ReaderException; |
26 | | -import org.yaml.snakeyaml.reader.StreamReader; |
27 | | -import org.yaml.snakeyaml.resolver.Resolver; |
| 5 | +import com.semmle.extractor.yaml.YamlPopulator; |
28 | 6 |
|
29 | 7 | /** |
30 | 8 | * Extractor for populating YAML files. |
31 | 9 | * |
32 | | - * <p>The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse YAML. |
| 10 | + * <p> |
| 11 | + * The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse |
| 12 | + * YAML. |
33 | 13 | */ |
34 | 14 | public class YAMLExtractor implements IExtractor { |
35 | | - /** The tables constituting the YAML dbscheme. */ |
36 | | - private static enum YAMLTables implements Table { |
37 | | - YAML(6), // yaml (id: @yaml_node, kind: int ref, parent: @yaml_node_parent ref, |
38 | | - // idx: int ref, tag: string ref, tostring: string ref) |
39 | | - YAML_ANCHORS(2), // yaml_anchors (node: @yaml_node ref, anchor: string ref) |
40 | | - YAML_ALIASES(2), // yaml_aliases (alias: @yaml_alias_node ref, target: string ref) |
41 | | - YAML_SCALARS( |
42 | | - 3), // yaml_scalars (scalar: @yaml_scalar_node ref, style: int ref, value: string ref) |
43 | | - YAML_ERRORS(2); // yaml_errors (id: @yaml_error, message: string ref) |
44 | | - |
45 | | - private final int arity; |
46 | | - |
47 | | - private YAMLTables(int arity) { |
48 | | - this.arity = arity; |
49 | | - } |
50 | | - |
51 | | - @Override |
52 | | - public String getName() { |
53 | | - return StringUtil.lc(name()); |
54 | | - } |
55 | | - |
56 | | - @Override |
57 | | - public int getArity() { |
58 | | - return arity; |
59 | | - } |
60 | | - |
61 | | - @Override |
62 | | - public boolean validate(Object... values) { |
63 | | - return true; |
64 | | - } |
65 | | - } |
66 | | - |
67 | | - /* |
68 | | - * case @yaml_node.kind of |
69 | | - * 0 = @yaml_scalar_node |
70 | | - * | 1 = @yaml_mapping_node |
71 | | - * | 2 = @yaml_sequence_node |
72 | | - * | 3 = @yaml_alias_node |
73 | | - */ |
74 | | - private static enum NodeKind { |
75 | | - SCALAR, |
76 | | - MAPPING, |
77 | | - SEQUENCE, |
78 | | - ALIAS |
79 | | - }; |
80 | | - |
81 | 15 | private final boolean tolerateParseErrors; |
82 | 16 |
|
83 | | - private TextualExtractor textualExtractor; |
84 | | - private LocationManager locationManager; |
85 | | - private TrapWriter trapWriter; |
86 | | - private LineTable lineTable; |
87 | | - |
88 | | - /** |
89 | | - * The underlying SnakeYAML parser; we use the relatively low-level {@linkplain Parser} instead of |
90 | | - * the more high-level {@linkplain Composer}, since our dbscheme represents YAML documents in AST |
91 | | - * form, with aliases left unresolved. |
92 | | - */ |
93 | | - private Parser parser; |
94 | | - |
95 | | - /** The resolver used for resolving type tags. */ |
96 | | - private Resolver resolver; |
97 | | - |
98 | 17 | public YAMLExtractor(ExtractorConfig config) { |
99 | 18 | this.tolerateParseErrors = config.isTolerateParseErrors(); |
100 | 19 | } |
101 | 20 |
|
102 | | - private LineTable getLineTable() { |
103 | | - if (lineTable == null) { |
104 | | - lineTable = new LineTable(this.textualExtractor.getSource()); |
105 | | - } |
106 | | - return lineTable; |
107 | | - } |
108 | | - |
109 | 21 | @Override |
110 | 22 | public ParseResultInfo extract(TextualExtractor textualExtractor) { |
111 | | - this.textualExtractor = textualExtractor; |
112 | | - locationManager = textualExtractor.getLocationManager(); |
113 | | - trapWriter = textualExtractor.getTrapwriter(); |
114 | | - |
115 | | - Label fileLabel = locationManager.getFileLabel(); |
116 | | - locationManager.setHasLocationTable("yaml_locations"); |
117 | | - try { |
118 | | - parser = new ParserImpl(new StreamReader(textualExtractor.getSource())); |
119 | | - resolver = new Resolver(); |
120 | | - int idx = 0; |
121 | | - while (!atStreamEnd()) |
122 | | - extractDocument(fileLabel, idx++, textualExtractor.getSource().codePoints().toArray()); |
123 | | - } catch (MarkedYAMLException e) { |
124 | | - int line = e.getProblemMark().getLine() + 1; |
125 | | - int column = e.getProblemMark().getColumn() + 1; |
126 | | - if (!this.tolerateParseErrors) |
127 | | - throw new UserError(e.getProblem() + ": " + line + ":" + column); |
128 | | - Label lbl = trapWriter.freshLabel(); |
129 | | - trapWriter.addTuple(YAMLTables.YAML_ERRORS, lbl, e.getProblem()); |
130 | | - locationManager.emitSnippetLocation(lbl, line, column, line, column); |
131 | | - } catch (ReaderException e) { |
132 | | - if (!this.tolerateParseErrors) throw new UserError(e.toString()); |
133 | | - int c = e.getCodePoint(); |
134 | | - String s = String.valueOf(Character.toChars(c)); |
135 | | - trapWriter.addTuple( |
136 | | - YAMLTables.YAML_ERRORS, |
137 | | - trapWriter.freshLabel(), |
138 | | - "Unexpected character " + s + "(" + c + ")"); |
139 | | - // unfortunately, SnakeYAML does not provide structured location information for |
140 | | - // ReaderExceptions |
141 | | - } |
142 | | - |
| 23 | + new YamlPopulator(textualExtractor.getExtractedFile(), textualExtractor.getSource(), |
| 24 | + textualExtractor.getTrapwriter(), |
| 25 | + this.tolerateParseErrors).extract(); |
143 | 26 | return new ParseResultInfo(0, 0, Collections.emptyList()); |
144 | 27 | } |
145 | | - |
146 | | - /** Check whether the parser has encountered the end of the YAML input stream. */ |
147 | | - private boolean atStreamEnd() { |
148 | | - if (parser.checkEvent(Event.ID.StreamStart)) parser.getEvent(); |
149 | | - return parser.checkEvent(Event.ID.StreamEnd); |
150 | | - } |
151 | | - |
152 | | - /** Extract a complete YAML document; cf. {@link Composer#getNode}. */ |
153 | | - private void extractDocument(Label parent, int idx, int[] codepoints) { |
154 | | - // Drop the DOCUMENT-START event |
155 | | - parser.getEvent(); |
156 | | - extractNode(parent, idx, codepoints); |
157 | | - // Drop the DOCUMENT-END event |
158 | | - parser.getEvent(); |
159 | | - } |
160 | | - |
161 | | - /** Extract a single YAML node; cf. {@link Composer#composeNode}. */ |
162 | | - private void extractNode(Label parent, int idx, int[] codepoints) { |
163 | | - Label label = trapWriter.freshLabel(); |
164 | | - NodeKind kind; |
165 | | - String tag = ""; |
166 | | - Event start = parser.getEvent(), end = start; |
167 | | - |
168 | | - if (start.is(Event.ID.Alias)) { |
169 | | - kind = NodeKind.ALIAS; |
170 | | - trapWriter.addTuple(YAMLTables.YAML_ALIASES, label, ((AliasEvent) start).getAnchor()); |
171 | | - } else { |
172 | | - String anchor = start instanceof NodeEvent ? ((NodeEvent) start).getAnchor() : null; |
173 | | - if (anchor != null) trapWriter.addTuple(YAMLTables.YAML_ANCHORS, label, anchor); |
174 | | - |
175 | | - if (start.is(Event.ID.Scalar)) { |
176 | | - kind = NodeKind.SCALAR; |
177 | | - ScalarEvent scalar = (ScalarEvent) start; |
178 | | - tag = |
179 | | - getTag( |
180 | | - scalar.getTag(), |
181 | | - NodeId.scalar, |
182 | | - scalar.getValue(), |
183 | | - scalar.getImplicit().canOmitTagInPlainScalar()); |
184 | | - Character style = scalar.getStyle(); |
185 | | - int styleCode = style == null ? 0 : (int) style; |
186 | | - trapWriter.addTuple(YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue()); |
187 | | - } else if (start.is(Event.ID.SequenceStart)) { |
188 | | - kind = NodeKind.SEQUENCE; |
189 | | - SequenceStartEvent sequenceStart = (SequenceStartEvent) start; |
190 | | - tag = getTag(sequenceStart.getTag(), NodeId.sequence, null, sequenceStart.getImplicit()); |
191 | | - |
192 | | - int childIdx = 0; |
193 | | - while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++, codepoints); |
194 | | - |
195 | | - end = parser.getEvent(); |
196 | | - } else if (start.is(Event.ID.MappingStart)) { |
197 | | - kind = NodeKind.MAPPING; |
198 | | - MappingStartEvent mappingStart = (MappingStartEvent) start; |
199 | | - tag = getTag(mappingStart.getTag(), NodeId.mapping, null, mappingStart.getImplicit()); |
200 | | - |
201 | | - int childIdx = 1; |
202 | | - while (!parser.checkEvent(Event.ID.MappingEnd)) { |
203 | | - extractNode(label, childIdx, codepoints); |
204 | | - extractNode(label, -childIdx, codepoints); |
205 | | - ++childIdx; |
206 | | - } |
207 | | - |
208 | | - end = parser.getEvent(); |
209 | | - } else { |
210 | | - throw new CatastrophicError("Unexpected YAML parser event: " + start); |
211 | | - } |
212 | | - } |
213 | | - |
214 | | - trapWriter.addTuple( |
215 | | - YAMLTables.YAML, |
216 | | - label, |
217 | | - kind.ordinal(), |
218 | | - parent, |
219 | | - idx, |
220 | | - tag, |
221 | | - mkToString(start.getStartMark(), end.getEndMark(), codepoints)); |
222 | | - extractLocation(label, start.getStartMark(), end.getEndMark()); |
223 | | - } |
224 | | - |
225 | | - /** Determine the type tag of a node. */ |
226 | | - private String getTag(String explicitTag, NodeId kind, String value, boolean implicit) { |
227 | | - if (explicitTag == null || "!".equals(explicitTag)) |
228 | | - return resolver.resolve(kind, value, implicit).getValue(); |
229 | | - return explicitTag; |
230 | | - } |
231 | | - |
232 | | - private static boolean isNewLine(int codePoint) { |
233 | | - switch (codePoint) { |
234 | | - case '\n': |
235 | | - case '\r': |
236 | | - case '\u0085': |
237 | | - case '\u2028': |
238 | | - case '\u2029': |
239 | | - return true; |
240 | | - default: |
241 | | - return false; |
242 | | - } |
243 | | - } |
244 | | - |
245 | | - /** |
246 | | - * SnakeYAML doesn't directly expose the source text of nodes, but we also take the file contents |
247 | | - * as an array of Unicode code points. The start and end marks each contain an index into the code |
248 | | - * point stream (the end is exclusive), so we can reconstruct the snippet. For readability, we |
249 | | - * stop at the first encountered newline. |
250 | | - */ |
251 | | - private static String mkToString(Mark startMark, Mark endMark, int[] codepoints) { |
252 | | - StringBuilder b = new StringBuilder(); |
253 | | - for (int i = startMark.getIndex(); i < endMark.getIndex() && !isNewLine(codepoints[i]); i++) |
254 | | - b.appendCodePoint(codepoints[i]); |
255 | | - return TextualExtractor.sanitiseToString(b.toString()); |
256 | | - } |
257 | | - |
258 | | - /** Emit a source location for a YAML node. */ |
259 | | - private void extractLocation(Label label, Mark startMark, Mark endMark) { |
260 | | - int startLine, startColumn, endLine, endColumn; |
261 | | - |
262 | | - // SnakeYAML uses 0-based indexing for both lines and columns, so need to +1 |
263 | | - startLine = startMark.getLine() + 1; |
264 | | - startColumn = startMark.getColumn() + 1; |
265 | | - |
266 | | - // SnakeYAML's end positions are exclusive, so only need to +1 for the line |
267 | | - endLine = endMark.getLine() + 1; |
268 | | - endColumn = endMark.getColumn(); |
269 | | - |
270 | | - // Avoid emitting column zero for non-empty locations |
271 | | - if (endColumn == 0 && !(startLine == endLine && startColumn == endColumn)) { |
272 | | - String source = textualExtractor.getSource(); |
273 | | - int offset = getLineTable().getOffsetFromPoint(endMark.getLine(), endMark.getColumn()) - 1; |
274 | | - while (offset > 0 && isNewLine((int)source.charAt(offset))) { |
275 | | - --offset; |
276 | | - } |
277 | | - com.semmle.util.locations.Position adjustedEndPos = getLineTable().getEndPositionFromOffset(offset); |
278 | | - endLine = adjustedEndPos.getLine(); |
279 | | - endColumn = adjustedEndPos.getColumn(); |
280 | | - } |
281 | | - |
282 | | - locationManager.emitSnippetLocation(label, startLine, startColumn, endLine, endColumn); |
283 | | - } |
284 | 28 | } |
0 commit comments