Skip to content

Commit 4abc531

Browse files
committed
Canonicalize to PHP 8 comment token format
The trailing newline is no longer part of the comment token.
1 parent b58b19e commit 4abc531

File tree

5 files changed

+47
-52
lines changed

5 files changed

+47
-52
lines changed

lib/PhpParser/Lexer.php

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ public function startLexing(string $code, ErrorHandler $errorHandler = null) {
8989

9090
error_clear_last();
9191
$this->tokens = @token_get_all($code);
92-
$this->handleErrors($errorHandler);
92+
$this->postprocessTokens($errorHandler);
9393

9494
if (false !== $scream) {
9595
ini_set('xdebug.scream', $scream);
@@ -131,40 +131,14 @@ private function isUnterminatedComment($token) : bool {
131131
&& substr($token[1], -2) !== '*/';
132132
}
133133

134-
/**
135-
* Check whether an error *may* have occurred during tokenization.
136-
*
137-
* @return bool
138-
*/
139-
private function errorMayHaveOccurred() : bool {
140-
if (defined('HHVM_VERSION')) {
141-
// In HHVM token_get_all() does not throw warnings, so we need to conservatively
142-
// assume that an error occurred
143-
return true;
144-
}
145-
146-
if (PHP_VERSION_ID >= 80000) {
147-
// PHP 8 converts the "bad character" case into a parse error, rather than treating
148-
// it as a lexing warning. To preserve previous behavior, we need to assume that an
149-
// error occurred.
150-
// TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
151-
// token here (for older PHP versions) and leave generation of the actual parse error
152-
// to the parser. This will also save the full token scan on PHP 8 here.
153-
return true;
154-
}
155-
156-
return null !== error_get_last();
157-
}
158-
159-
protected function handleErrors(ErrorHandler $errorHandler) {
160-
if (!$this->errorMayHaveOccurred()) {
161-
return;
162-
}
163-
134+
protected function postprocessTokens(ErrorHandler $errorHandler) {
164135
// PHP's error handling for token_get_all() is rather bad, so if we want detailed
165136
// error information we need to compute it ourselves. Invalid character errors are
166137
// detected by finding "gaps" in the token array. Unterminated comments are detected
167138
// by checking if a trailing comment has a "*/" at the end.
139+
//
140+
// Additionally, we canonicalize to the PHP 8 comment format here, which does not include
141+
// the trailing newline anymore.
168142

169143
$filePos = 0;
170144
$line = 1;
@@ -178,6 +152,23 @@ protected function handleErrors(ErrorHandler $errorHandler) {
178152
$this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
179153
}
180154

155+
if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
156+
$trailingNewline = $matches[0];
157+
$token[1] = substr($token[1], 0, -strlen($trailingNewline));
158+
$this->tokens[$i] = $token;
159+
if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
160+
// Move trailing newline into following T_WHITESPACE token, if it already exists.
161+
$this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
162+
$this->tokens[$i + 1][2]--;
163+
} else {
164+
// Otherwise, we need to create a new T_WHITESPACE token.
165+
array_splice($this->tokens, $i + 1, 0, [
166+
[\T_WHITESPACE, $trailingNewline, $line],
167+
]);
168+
$numTokens++;
169+
}
170+
}
171+
181172
$tokenValue = \is_string($token) ? $token : $token[1];
182173
$tokenLen = \strlen($tokenValue);
183174

test/PhpParser/LexerTest.php

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,12 @@ public function provideTestLex() {
124124
'comments' => [
125125
new Comment('/* comment */',
126126
1, 6, 1, 1, 18, 1),
127-
new Comment('// comment' . "\n",
128-
1, 20, 3, 2, 30, 3),
127+
new Comment('// comment',
128+
1, 20, 3, 1, 29, 3),
129129
new Comment\Doc('/** docComment 1 */',
130-
2, 31, 4, 2, 49, 4),
130+
2, 31, 5, 2, 49, 5),
131131
new Comment\Doc('/** docComment 2 */',
132-
2, 50, 5, 2, 68, 5),
132+
2, 50, 6, 2, 68, 6),
133133
],
134134
],
135135
['endLine' => 2]
@@ -185,11 +185,11 @@ public function provideTestLex() {
185185
],
186186
[
187187
Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"',
188-
['startTokenPos' => 5], ['endTokenPos' => 5]
188+
['startTokenPos' => 6], ['endTokenPos' => 6]
189189
],
190190
[
191191
ord(';'), ';',
192-
['startTokenPos' => 6], ['endTokenPos' => 6]
192+
['startTokenPos' => 7], ['endTokenPos' => 7]
193193
],
194194
]
195195
],
@@ -251,14 +251,17 @@ public function testHandleHaltCompilerError() {
251251
}
252252

253253
public function testGetTokens() {
254-
$code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';
254+
$code = '<?php "a";' . "\n" . '// foo' . "\n" . '// bar' . "\n\n" . '"b";';
255255
$expectedTokens = [
256256
[T_OPEN_TAG, '<?php ', 1],
257257
[T_CONSTANT_ENCAPSED_STRING, '"a"', 1],
258258
';',
259259
[T_WHITESPACE, "\n", 1],
260-
[T_COMMENT, '// foo' . "\n", 2],
261-
[T_CONSTANT_ENCAPSED_STRING, '"b"', 3],
260+
[T_COMMENT, '// foo', 2],
261+
[T_WHITESPACE, "\n", 2],
262+
[T_COMMENT, '// bar', 3],
263+
[T_WHITESPACE, "\n\n", 3],
264+
[T_CONSTANT_ENCAPSED_STRING, '"b"', 5],
262265
';',
263266
];
264267

test/PhpParser/NodeAbstractTest.php

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -307,23 +307,23 @@ function functionName(&$a = 0, $b = 1.0) {
307307
"comments": [
308308
{
309309
"nodeType": "Comment",
310-
"text": "\/\/ comment\n",
310+
"text": "\/\/ comment",
311311
"line": 2,
312312
"filePos": 6,
313313
"tokenPos": 1,
314-
"endLine": 3,
315-
"endFilePos": 16,
314+
"endLine": 2,
315+
"endFilePos": 15,
316316
"endTokenPos": 1
317317
},
318318
{
319319
"nodeType": "Comment_Doc",
320320
"text": "\/** doc comment *\/",
321321
"line": 3,
322322
"filePos": 17,
323-
"tokenPos": 2,
323+
"tokenPos": 3,
324324
"endLine": 3,
325325
"endFilePos": 34,
326-
"endTokenPos": 2
326+
"endTokenPos": 3
327327
}
328328
],
329329
"endLine": 6

test/PhpParser/ParserTest.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,10 @@ function test($a) {
8383
$this->assertInstanceOf(Stmt\Echo_::class, $echo);
8484
$this->assertEquals([
8585
'comments' => [
86-
new Comment("// Line\n",
87-
4, 49, 12, 5, 56, 12),
88-
new Comment("// Comments\n",
89-
5, 61, 14, 6, 72, 14),
86+
new Comment("// Line",
87+
4, 49, 12, 4, 55, 12),
88+
new Comment("// Comments",
89+
5, 61, 14, 5, 71, 14),
9090
],
9191
'startLine' => 6,
9292
'endLine' => 6,

test/code/formatPreservation/classMethodNop.test

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ class Foo {
1515
public function __construct()
1616
{
1717
// I'm just a comment
18-
19-
$foo; }
18+
$foo;
19+
}
2020
}
2121
-----
2222
<?php
@@ -72,5 +72,6 @@ class Foo {
7272
public function __construct()
7373
{
7474
// I'm a new comment
75-
}
75+
76+
}
7677
}

0 commit comments

Comments
 (0)