Skip to content

Commit 09b87d8

Browse files
authored
Merge pull request github#1733 from pavgust/imp/yaml-snippets
Approved by esben-semmle
2 parents 24f407c + eb77b86 commit 09b87d8

File tree

1 file changed

+32
-36
lines changed

1 file changed

+32
-36
lines changed

javascript/extractor/src/com/semmle/js/extractor/YAMLExtractor.java

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ public LoCInfo extract(TextualExtractor textualExtractor) {
103103
try {
104104
parser = new ParserImpl(new StreamReader(textualExtractor.getSource()));
105105
resolver = new Resolver();
106-
107106
int idx = 0;
108-
while (!atStreamEnd()) extractDocument(fileLabel, idx++);
107+
while (!atStreamEnd())
108+
extractDocument(fileLabel, idx++, textualExtractor.getSource().codePoints().toArray());
109109
} catch (MarkedYAMLException e) {
110110
int line = e.getProblemMark().getLine() + 1;
111111
int column = e.getProblemMark().getColumn() + 1;
@@ -136,16 +136,16 @@ private boolean atStreamEnd() {
136136
}
137137

138138
/** Extract a complete YAML document; cf. {@link Composer#composeDocument}. */
139-
private void extractDocument(Label parent, int idx) {
139+
private void extractDocument(Label parent, int idx, int[] codepoints) {
140140
// Drop the DOCUMENT-START event
141141
parser.getEvent();
142-
extractNode(parent, idx);
142+
extractNode(parent, idx, codepoints);
143143
// Drop the DOCUMENT-END event
144144
parser.getEvent();
145145
}
146146

147147
/** Extract a single YAML node; cf. {@link Composer#composeNode}. */
148-
private void extractNode(Label parent, int idx) {
148+
private void extractNode(Label parent, int idx, int[] codepoints) {
149149
Label label = trapWriter.freshLabel();
150150
NodeKind kind;
151151
String tag = "";
@@ -169,15 +169,14 @@ private void extractNode(Label parent, int idx) {
169169
scalar.getImplicit().canOmitTagInPlainScalar());
170170
Character style = scalar.getStyle();
171171
int styleCode = style == null ? 0 : (int) style;
172-
trapWriter.addTuple(
173-
YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue());
172+
trapWriter.addTuple(YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue());
174173
} else if (start.is(Event.ID.SequenceStart)) {
175174
kind = NodeKind.SEQUENCE;
176175
SequenceStartEvent sequenceStart = (SequenceStartEvent) start;
177176
tag = getTag(sequenceStart.getTag(), NodeId.sequence, null, sequenceStart.getImplicit());
178177

179178
int childIdx = 0;
180-
while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++);
179+
while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++, codepoints);
181180

182181
end = parser.getEvent();
183182
} else if (start.is(Event.ID.MappingStart)) {
@@ -187,8 +186,8 @@ private void extractNode(Label parent, int idx) {
187186

188187
int childIdx = 1;
189188
while (!parser.checkEvent(Event.ID.MappingEnd)) {
190-
extractNode(label, childIdx);
191-
extractNode(label, -childIdx);
189+
extractNode(label, childIdx, codepoints);
190+
extractNode(label, -childIdx, codepoints);
192191
++childIdx;
193192
}
194193

@@ -205,7 +204,7 @@ private void extractNode(Label parent, int idx) {
205204
parent,
206205
idx,
207206
tag,
208-
mkToString(start.getStartMark(), end.getEndMark()));
207+
mkToString(start.getStartMark(), end.getEndMark(), codepoints));
209208
extractLocation(label, start.getStartMark(), end.getEndMark());
210209
}
211210

@@ -216,33 +215,30 @@ private String getTag(String explicitTag, NodeId kind, String value, boolean imp
216215
return explicitTag;
217216
}
218217

218+
private static boolean isNewLine(int codePoint) {
219+
switch (codePoint) {
220+
case '\n':
221+
case '\r':
222+
case '\u0085':
223+
case '\u2028':
224+
case '\u2029':
225+
return true;
226+
default:
227+
return false;
228+
}
229+
}
230+
219231
/**
220-
* SnakeYAML doesn't directly expose the source text of nodes, but we can get a decent
221-
* approximation from the snippet associated with the node's start {@linkplain Mark}.
222-
*
223-
* <p>The snippet of a {@linkplain Mark} is meant to be used for diagnostic messages and consists
224-
* of two lines: the first line contains some context around the source position represented by
225-
* the mark, the second line contains a caret character positioned underneath the source position
226-
* itself.
227-
*
228-
* <p>To approximate the source text, we take the text on the first line and strip off the first
229-
* <i>n</i> characters, where <i>n</i> is the number of spaces preceding the caret character on
230-
* the second line.
231-
*
232-
* <p>This is only an approximation, since the context is limited to relatively short strings that
233-
* never extend across newlines, but it suffices for the purposes of <code>toString</code>.
232+
* SnakeYAML doesn't directly expose the source text of nodes, so we additionally take the file contents
233+
* as an array of Unicode code points. The start and end marks each contain an index into the code
234+
* point stream (the end is exclusive), so we can reconstruct the snippet. For readability, we
235+
* stop at the first encountered newline.
234236
*/
235-
private String mkToString(Mark startMark, Mark endMark) {
236-
String snippet = startMark.get_snippet(0, Integer.MAX_VALUE);
237-
int nl = snippet.indexOf('\n');
238-
String context = snippet.substring(0, nl);
239-
String src = context.substring(snippet.substring(nl + 1).indexOf('^'));
240-
int desiredStringLength = endMark.getColumn() - startMark.getColumn();
241-
boolean hasAccessToDesiredString = src.length() >= desiredStringLength;
242-
boolean isSingleLine = endMark.getLine() == startMark.getLine();
243-
if (isSingleLine && hasAccessToDesiredString)
244-
src = src.substring(0, desiredStringLength);
245-
return TextualExtractor.sanitiseToString(src);
237+
private static String mkToString(Mark startMark, Mark endMark, int[] codepoints) {
238+
StringBuilder b = new StringBuilder();
239+
for (int i = startMark.getIndex(); i < endMark.getIndex() && !isNewLine(codepoints[i]); i++)
240+
b.appendCodePoint(codepoints[i]);
241+
return TextualExtractor.sanitiseToString(b.toString());
246242
}
247243

248244
/** Emit a source location for a YAML node. */

0 commit comments

Comments
 (0)