Skip to content

Commit c79ea6d

Browse files
committed
Support recovery from lexer errors
Lexer::startLexing() no longer throws; instead, errors can be fetched using Lexer::getErrors(). Lexer errors now also contain full line and position information.
1 parent e926efd commit c79ea6d

File tree

5 files changed

+229
-57
lines changed

5 files changed

+229
-57
lines changed

lib/PhpParser/Lexer.php

Lines changed: 94 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class Lexer
99
{
1010
protected $code;
1111
protected $tokens;
12+
protected $errors;
1213
protected $pos;
1314
protected $line;
1415
protected $filePos;
@@ -49,11 +50,22 @@ public function __construct(array $options = array()) {
4950
/**
5051
* Initializes the lexer for lexing the provided source code.
5152
*
52-
* @param string $code The source code to lex
53+
* This function does not throw if lexing errors occur. Instead, errors may be retrieved using
54+
* the getErrors() method.
5355
*
54-
* @throws Error on lexing errors (unterminated comment or unexpected character)
56+
* @param string $code The source code to lex
5557
*/
5658
public function startLexing($code) {
59+
$this->code = $code; // keep the code around for __halt_compiler() handling
60+
$this->pos = -1;
61+
$this->line = 1;
62+
$this->filePos = 0;
63+
$this->errors = [];
64+
65+
// If inline HTML occurs without preceding code, treat it as if it had a leading newline.
66+
// This ensures proper composability, because having a newline is the "safe" assumption.
67+
$this->prevCloseTagHasNewline = true;
68+
5769
$scream = ini_set('xdebug.scream', '0');
5870

5971
$this->resetErrors();
@@ -63,15 +75,6 @@ public function startLexing($code) {
6375
if (false !== $scream) {
6476
ini_set('xdebug.scream', $scream);
6577
}
66-
67-
$this->code = $code; // keep the code around for __halt_compiler() handling
68-
$this->pos = -1;
69-
$this->line = 1;
70-
$this->filePos = 0;
71-
72-
// If inline HTML occurs without preceding code, treat it as if it had a leading newline.
73-
// This ensures proper composability, because having a newline is the "safe" assumption.
74-
$this->prevCloseTagHasNewline = true;
7578
}
7679

7780
protected function resetErrors() {
@@ -85,32 +88,85 @@ protected function resetErrors() {
8588
}
8689
}
8790

88-
protected function handleErrors() {
91+
private function handleInvalidCharacterRange($start, $end, $line) {
92+
for ($i = $start; $i < $end; $i++) {
93+
$chr = $this->code[$i];
94+
if ($chr === "\0") {
95+
// PHP cuts error message after null byte, so need special case
96+
$errorMsg = 'Unexpected null byte';
97+
} else {
98+
$errorMsg = sprintf(
99+
'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
100+
);
101+
}
102+
$this->errors[] = new Error($errorMsg, [
103+
'startLine' => $line,
104+
'endLine' => $line,
105+
'startFilePos' => $i,
106+
'endFilePos' => $i,
107+
]);
108+
}
109+
}
110+
111+
private function isUnterminatedComment($token) {
112+
return ($token[0] === T_COMMENT || $token[0] === T_DOC_COMMENT)
113+
&& substr($token[1], 0, 2) === '/*'
114+
&& substr($token[1], -2) !== '*/';
115+
}
116+
117+
private function errorMayHaveOccurred() {
118+
if (defined('HHVM_VERSION')) {
119+
// In HHVM token_get_all() does not throw warnings, so we need to conservatively
120+
// assume that an error occurred
121+
return true;
122+
}
123+
89124
$error = error_get_last();
90-
if (null === $error) {
125+
return null !== $error
126+
&& false === strpos($error['message'], 'Undefined variable');
127+
}
128+
129+
protected function handleErrors() {
130+
if (!$this->errorMayHaveOccurred()) {
91131
return;
92132
}
93133

94-
if (preg_match(
95-
'~^Unterminated comment starting line ([0-9]+)$~',
96-
$error['message'], $matches
97-
)) {
98-
throw new Error('Unterminated comment', (int) $matches[1]);
134+
// PHP's error handling for token_get_all() is rather bad, so if we want detailed
135+
// error information we need to compute it ourselves. Invalid character errors are
136+
// detected by finding "gaps" in the token array. Unterminated comments are detected
137+
// by checking if a trailing comment has a "*/" at the end.
138+
139+
$filePos = 0;
140+
$line = 1;
141+
foreach ($this->tokens as $i => $token) {
142+
$tokenValue = \is_string($token) ? $token : $token[1];
143+
$tokenLen = \strlen($tokenValue);
144+
145+
if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
146+
// Something is missing, must be an invalid character
147+
$nextFilePos = strpos($this->code, $tokenValue, $filePos);
148+
$this->handleInvalidCharacterRange($filePos, $nextFilePos, $line);
149+
$filePos = $nextFilePos;
150+
}
151+
152+
$filePos += $tokenLen;
153+
$line += substr_count($tokenValue, "\n");
99154
}
100155

101-
if (preg_match(
102-
'~^Unexpected character in input: \'(.)\' \(ASCII=([0-9]+)\)~s',
103-
$error['message'], $matches
104-
)) {
105-
throw new Error(sprintf(
106-
'Unexpected character "%s" (ASCII %d)',
107-
$matches[1], $matches[2]
108-
));
156+
// Invalid characters at the end of the input
157+
if ($filePos !== \strlen($this->code)) {
158+
$this->handleInvalidCharacterRange($filePos, \strlen($this->code), $line);
109159
}
110160

111-
// PHP cuts error message after null byte, so need special case
112-
if (preg_match('~^Unexpected character in input: \'$~', $error['message'])) {
113-
throw new Error('Unexpected null byte');
161+
// Check for unterminated comment
162+
$lastToken = $this->tokens[count($this->tokens) - 1];
163+
if ($this->isUnterminatedComment($lastToken)) {
164+
$this->errors[] = new Error('Unterminated comment', [
165+
'startLine' => $line - substr_count($lastToken[1], "\n"),
166+
'endLine' => $line,
167+
'startFilePos' => $filePos - \strlen($lastToken[1]),
168+
'endFilePos' => $filePos,
169+
]);
114170
}
115171
}
116172

@@ -224,6 +280,15 @@ public function getTokens() {
224280
return $this->tokens;
225281
}
226282

283+
/**
284+
* Returns errors that occurred during lexing.
285+
*
286+
* @return Error[] Array of lexer errors
287+
*/
288+
public function getErrors() {
289+
return $this->errors;
290+
}
291+
227292
/**
228293
* Handles __halt_compiler() by returning the text after it.
229294
*

lib/PhpParser/ParserAbstract.php

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -132,19 +132,11 @@ public function getErrors() {
132132
* unable to recover from an error).
133133
*/
134134
public function parse($code) {
135-
$this->errors = array();
136-
137-
// Initialize the lexer
138-
try {
139-
$this->lexer->startLexing($code);
140-
} catch (Error $e) {
141-
$this->errors[] = $e;
142-
if ($this->throwOnError) {
143-
throw $e;
144-
} else {
145-
// Currently can't recover from lexer errors
146-
return null;
147-
}
135+
// Initialize the lexer and inherit lexing errors
136+
$this->lexer->startLexing($code);
137+
$this->errors = $this->lexer->getErrors();
138+
if ($this->throwOnError && !empty($this->errors)) {
139+
throw $this->errors[0];
148140
}
149141

150142
// We start off with no lookahead-token

test/PhpParser/LexerTest.php

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,28 +14,35 @@ protected function getLexer(array $options = array()) {
1414
/**
1515
* @dataProvider provideTestError
1616
*/
17-
public function testError($code, $message) {
17+
public function testError($code, $messages) {
1818
if (defined('HHVM_VERSION')) {
1919
$this->markTestSkipped('HHVM does not throw warnings from token_get_all()');
2020
}
2121

22-
$lexer = $this->getLexer();
23-
try {
24-
$lexer->startLexing($code);
25-
} catch (Error $e) {
26-
$this->assertSame($message, $e->getMessage());
22+
$lexer = $this->getLexer(['usedAttributes' => [
23+
'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'
24+
]]);
25+
$lexer->startLexing($code);
26+
$errors = $lexer->getErrors();
2727

28-
return;
28+
$this->assertSame(count($messages), count($errors));
29+
for ($i = 0; $i < count($messages); $i++) {
30+
$this->assertSame($messages[$i], $errors[$i]->getMessageWithColumnInfo($code));
2931
}
30-
31-
$this->fail('Expected PhpParser\Error');
3232
}
3333

3434
public function provideTestError() {
3535
return array(
36-
array('<?php /*', 'Unterminated comment on line 1'),
37-
array('<?php ' . "\1", 'Unexpected character "' . "\1" . '" (ASCII 1) on unknown line'),
38-
array('<?php ' . "\0", 'Unexpected null byte on unknown line'),
36+
array("<?php /*", array("Unterminated comment from 1:7 to 1:9")),
37+
array("<?php \1", array("Unexpected character \"\1\" (ASCII 1) from 1:7 to 1:7")),
38+
array("<?php \0", array("Unexpected null byte from 1:7 to 1:7")),
39+
// Error with potentially emulated token
40+
array("<?php ?? \0", array("Unexpected null byte from 1:10 to 1:10")),
41+
array("<?php\n\0\1 foo /* bar", array(
42+
"Unexpected null byte from 2:1 to 2:1",
43+
"Unexpected character \"\1\" (ASCII 1) from 2:2 to 2:2",
44+
"Unterminated comment from 2:8 to 2:14"
45+
)),
3946
);
4047
}
4148

test/PhpParser/ParserTest.php

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,15 @@ public function testParserThrowsSpecialError() {
3030
$parser->parse('<?php use foo as self;');
3131
}
3232

33+
/**
34+
* @expectedException \PhpParser\Error
35+
* @expectedExceptionMessage Unterminated comment on line 1
36+
*/
37+
public function testParserThrowsLexerError() {
38+
$parser = $this->getParser(new Lexer());
39+
$parser->parse('<?php /*');
40+
}
41+
3342
public function testAttributeAssignment() {
3443
$lexer = new Lexer(array(
3544
'usedAttributes' => array(

test/code/parser/errorHandling/lexerErrors.test

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,119 @@ $a = 42;
66
/*
77
$b = 24;
88
-----
9-
Unterminated comment on line 4
9+
Unterminated comment from 4:1 to 5:9
10+
array(
11+
0: Expr_Assign(
12+
var: Expr_Variable(
13+
name: a
14+
)
15+
expr: Scalar_LNumber(
16+
value: 42
17+
)
18+
)
19+
1: Stmt_Nop(
20+
comments: array(
21+
0: /*
22+
$b = 24;
23+
)
24+
)
25+
)
1026
-----
1127
<?php
1228

1329
$a = 42;
1430
@@{ "\1" }@@
1531
$b = 24;
1632
-----
17-
Unexpected character "@@{ "\1" }@@" (ASCII 1) on unknown line
33+
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
34+
array(
35+
0: Expr_Assign(
36+
var: Expr_Variable(
37+
name: a
38+
)
39+
expr: Scalar_LNumber(
40+
value: 42
41+
)
42+
)
43+
1: Expr_Assign(
44+
var: Expr_Variable(
45+
name: b
46+
)
47+
expr: Scalar_LNumber(
48+
value: 24
49+
)
50+
)
51+
)
1852
-----
1953
<?php
2054

2155
$a = 42;
2256
@@{ "\0" }@@
2357
$b = 24;
2458
-----
25-
Unexpected null byte on unknown line
59+
Unexpected null byte from 4:1 to 4:1
60+
array(
61+
0: Expr_Assign(
62+
var: Expr_Variable(
63+
name: a
64+
)
65+
expr: Scalar_LNumber(
66+
value: 42
67+
)
68+
)
69+
1: Expr_Assign(
70+
var: Expr_Variable(
71+
name: b
72+
)
73+
expr: Scalar_LNumber(
74+
value: 24
75+
)
76+
)
77+
)
78+
-----
79+
<?php
80+
81+
$a = 1;
82+
@@{ "\1" }@@
83+
$b = 2;
84+
@@{ "\2" }@@
85+
$c = 3;
86+
-----
87+
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
88+
Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
89+
array(
90+
0: Expr_Assign(
91+
var: Expr_Variable(
92+
name: a
93+
)
94+
expr: Scalar_LNumber(
95+
value: 1
96+
)
97+
)
98+
1: Expr_Assign(
99+
var: Expr_Variable(
100+
name: b
101+
)
102+
expr: Scalar_LNumber(
103+
value: 2
104+
)
105+
)
106+
2: Expr_Assign(
107+
var: Expr_Variable(
108+
name: c
109+
)
110+
expr: Scalar_LNumber(
111+
value: 3
112+
)
113+
)
114+
)
115+
-----
116+
<?php
117+
118+
if ($b) {
119+
$a = 1;
120+
/* unterminated
121+
}
122+
-----
123+
Unterminated comment from 5:5 to 6:2
124+
Syntax error, unexpected EOF from 6:2 to 6:2

0 commit comments

Comments
 (0)