Support recovery from lexer errors

nikic · nikic · commit c79ea6d1d3f1 · 2016-09-30T20:23:36.000+02:00
Lexer::startLexing() no longer throws; instead, errors can be fetched
using Lexer::getErrors().

Lexer errors now also contain full line and position information.
diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php
@@ -9,6 +9,7 @@ class Lexer
 {
     protected $code;
     protected $tokens;
+    protected $errors;
     protected $pos;
     protected $line;
     protected $filePos;
@@ -49,11 +50,22 @@ public function __construct(array $options = array()) {
     /**
      * Initializes the lexer for lexing the provided source code.
      *
-     * @param string $code The source code to lex
+     * This function does not throw if lexing errors occur. Instead, errors may be retrieved using
+     * the getErrors() method.
      *
-     * @throws Error on lexing errors (unterminated comment or unexpected character)
+     * @param string $code The source code to lex
      */
     public function startLexing($code) {
+        $this->code = $code; // keep the code around for __halt_compiler() handling
+        $this->pos  = -1;
+        $this->line =  1;
+        $this->filePos = 0;
+        $this->errors = [];
+
+        // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
+        // This ensures proper composability, because having a newline is the "safe" assumption.
+        $this->prevCloseTagHasNewline = true;
+
         $scream = ini_set('xdebug.scream', '0');
 
         $this->resetErrors();
@@ -63,15 +75,6 @@ public function startLexing($code) {
         if (false !== $scream) {
             ini_set('xdebug.scream', $scream);
         }
-
-        $this->code = $code; // keep the code around for __halt_compiler() handling
-        $this->pos  = -1;
-        $this->line =  1;
-        $this->filePos = 0;
-
-        // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
-        // This ensures proper composability, because having a newline is the "safe" assumption.
-        $this->prevCloseTagHasNewline = true;
     }
 
     protected function resetErrors() {
@@ -85,32 +88,85 @@ protected function resetErrors() {
         }
     }
 
-    protected function handleErrors() {
+    private function handleInvalidCharacterRange($start, $end, $line) {
+        for ($i = $start; $i < $end; $i++) {
+            $chr = $this->code[$i];
+            if ($chr === "\0") {
+                // PHP cuts error message after null byte, so need special case
+                $errorMsg = 'Unexpected null byte';
+            } else {
+                $errorMsg = sprintf(
+                    'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
+                );
+            }
+            $this->errors[] = new Error($errorMsg, [
+                'startLine' => $line,
+                'endLine' => $line,
+                'startFilePos' => $i,
+                'endFilePos' => $i,
+            ]);
+        }
+    }
+
+    private function isUnterminatedComment($token) {
+        return ($token[0] === T_COMMENT || $token[0] === T_DOC_COMMENT)
+            && substr($token[1], 0, 2) === '/*'
+            && substr($token[1], -2) !== '*/';
+    }
+
+    private function errorMayHaveOccurred() {
+        if (defined('HHVM_VERSION')) {
+            // In HHVM token_get_all() does not throw warnings, so we need to conservatively
+            // assume that an error occurred
+            return true;
+        }
+
         $error = error_get_last();
-        if (null === $error) {
+        return null !== $error
+            && false === strpos($error['message'], 'Undefined variable');
+    }
+
+    protected function handleErrors() {
+        if (!$this->errorMayHaveOccurred()) {
             return;
         }
 
-        if (preg_match(
-            '~^Unterminated comment starting line ([0-9]+)$~',
-            $error['message'], $matches
-        )) {
-            throw new Error('Unterminated comment', (int) $matches[1]);
+        // PHP's error handling for token_get_all() is rather bad, so if we want detailed
+        // error information we need to compute it ourselves. Invalid character errors are
+        // detected by finding "gaps" in the token array. Unterminated comments are detected
+        // by checking if a trailing comment has a "*/" at the end.
+
+        $filePos = 0;
+        $line = 1;
+        foreach ($this->tokens as $i => $token) {
+            $tokenValue = \is_string($token) ? $token : $token[1];
+            $tokenLen = \strlen($tokenValue);
+
+            if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
+                // Something is missing, must be an invalid character
+                $nextFilePos = strpos($this->code, $tokenValue, $filePos);
+                $this->handleInvalidCharacterRange($filePos, $nextFilePos, $line);
+                $filePos = $nextFilePos;
+            }
+
+            $filePos += $tokenLen;
+            $line += substr_count($tokenValue, "\n");
         }
 
-        if (preg_match(
-            '~^Unexpected character in input:  \'(.)\' \(ASCII=([0-9]+)\)~s',
-            $error['message'], $matches
-        )) {
-            throw new Error(sprintf(
-                'Unexpected character "%s" (ASCII %d)',
-                $matches[1], $matches[2]
-            ));
+        // Invalid characters at the end of the input
+        if ($filePos !== \strlen($this->code)) {
+            $this->handleInvalidCharacterRange($filePos, \strlen($this->code), $line);
         }
 
-        // PHP cuts error message after null byte, so need special case
-        if (preg_match('~^Unexpected character in input:  \'$~', $error['message'])) {
-            throw new Error('Unexpected null byte');
+        // Check for unterminated comment
+        $lastToken = $this->tokens[count($this->tokens) - 1];
+        if ($this->isUnterminatedComment($lastToken)) {
+            $this->errors[] = new Error('Unterminated comment', [
+                'startLine' => $line - substr_count($lastToken[1], "\n"),
+                'endLine' => $line,
+                'startFilePos' => $filePos - \strlen($lastToken[1]),
+                'endFilePos' => $filePos,
+            ]);
         }
     }
 
@@ -224,6 +280,15 @@ public function getTokens() {
         return $this->tokens;
     }
 
+    /**
+     * Returns errors that occurred during lexing.
+     *
+     * @return Error[] Array of lexer errors
+     */
+    public function getErrors() {
+        return $this->errors;
+    }
+
     /**
      * Handles __halt_compiler() by returning the text after it.
      *
diff --git a/lib/PhpParser/ParserAbstract.php b/lib/PhpParser/ParserAbstract.php
@@ -132,19 +132,11 @@ public function getErrors() {
      *                     unable to recover from an error).
      */
     public function parse($code) {
-        $this->errors = array();
-
-        // Initialize the lexer
-        try {
-            $this->lexer->startLexing($code);
-        } catch (Error $e) {
-            $this->errors[] = $e;
-            if ($this->throwOnError) {
-                throw $e;
-            } else {
-                // Currently can't recover from lexer errors
-                return null;
-            }
+        // Initialize the lexer and inherit lexing errors
+        $this->lexer->startLexing($code);
+        $this->errors = $this->lexer->getErrors();
+        if ($this->throwOnError && !empty($this->errors)) {
+            throw $this->errors[0];
         }
 
         // We start off with no lookahead-token
diff --git a/test/PhpParser/LexerTest.php b/test/PhpParser/LexerTest.php
@@ -14,28 +14,35 @@ protected function getLexer(array $options = array()) {
     /**
      * @dataProvider provideTestError
      */
-    public function testError($code, $message) {
+    public function testError($code, $messages) {
         if (defined('HHVM_VERSION')) {
             $this->markTestSkipped('HHVM does not throw warnings from token_get_all()');
         }
 
-        $lexer = $this->getLexer();
-        try {
-            $lexer->startLexing($code);
-        } catch (Error $e) {
-            $this->assertSame($message, $e->getMessage());
+        $lexer = $this->getLexer(['usedAttributes' => [
+            'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'
+        ]]);
+        $lexer->startLexing($code);
+        $errors = $lexer->getErrors();
 
-            return;
+        $this->assertSame(count($messages), count($errors));
+        for ($i = 0; $i < count($messages); $i++) {
+            $this->assertSame($messages[$i], $errors[$i]->getMessageWithColumnInfo($code));
         }
-
-        $this->fail('Expected PhpParser\Error');
     }
 
     public function provideTestError() {
         return array(
-            array('<?php /*', 'Unterminated comment on line 1'),
-            array('<?php ' . "\1", 'Unexpected character "' . "\1" . '" (ASCII 1) on unknown line'),
-            array('<?php ' . "\0", 'Unexpected null byte on unknown line'),
+            array("<?php /*", array("Unterminated comment from 1:7 to 1:9")),
+            array("<?php \1", array("Unexpected character \"\1\" (ASCII 1) from 1:7 to 1:7")),
+            array("<?php \0", array("Unexpected null byte from 1:7 to 1:7")),
+            // Error with potentially emulated token
+            array("<?php ?? \0", array("Unexpected null byte from 1:10 to 1:10")),
+            array("<?php\n\0\1 foo /* bar", array(
+                "Unexpected null byte from 2:1 to 2:1",
+                "Unexpected character \"\1\" (ASCII 1) from 2:2 to 2:2",
+                "Unterminated comment from 2:8 to 2:14"
+            )),
         );
     }
 
diff --git a/test/PhpParser/ParserTest.php b/test/PhpParser/ParserTest.php
@@ -30,6 +30,15 @@ public function testParserThrowsSpecialError() {
         $parser->parse('<?php use foo as self;');
     }
 
+    /**
+     * @expectedException \PhpParser\Error
+     * @expectedExceptionMessage Unterminated comment on line 1
+     */
+    public function testParserThrowsLexerError() {
+        $parser = $this->getParser(new Lexer());
+        $parser->parse('<?php /*');
+    }
+
     public function testAttributeAssignment() {
         $lexer = new Lexer(array(
             'usedAttributes' => array(
diff --git a/test/code/parser/errorHandling/lexerErrors.test b/test/code/parser/errorHandling/lexerErrors.test
@@ -6,20 +6,119 @@ $a = 42;
 /*
 $b = 24;
 -----
-Unterminated comment on line 4
+Unterminated comment from 4:1 to 5:9
+array(
+    0: Expr_Assign(
+        var: Expr_Variable(
+            name: a
+        )
+        expr: Scalar_LNumber(
+            value: 42
+        )
+    )
+    1: Stmt_Nop(
+        comments: array(
+            0: /*
+            $b = 24;
+        )
+    )
+)
 -----
 <?php
 
 $a = 42;
 @@{ "\1" }@@
 $b = 24;
 -----
-Unexpected character "@@{ "\1" }@@" (ASCII 1) on unknown line
+Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
+array(
+    0: Expr_Assign(
+        var: Expr_Variable(
+            name: a
+        )
+        expr: Scalar_LNumber(
+            value: 42
+        )
+    )
+    1: Expr_Assign(
+        var: Expr_Variable(
+            name: b
+        )
+        expr: Scalar_LNumber(
+            value: 24
+        )
+    )
+)
 -----
 <?php
 
 $a = 42;
 @@{ "\0" }@@
 $b = 24;
 -----
-Unexpected null byte on unknown line
+Unexpected null byte from 4:1 to 4:1
+array(
+    0: Expr_Assign(
+        var: Expr_Variable(
+            name: a
+        )
+        expr: Scalar_LNumber(
+            value: 42
+        )
+    )
+    1: Expr_Assign(
+        var: Expr_Variable(
+            name: b
+        )
+        expr: Scalar_LNumber(
+            value: 24
+        )
+    )
+)
+-----
+<?php
+
+$a = 1;
+@@{ "\1" }@@
+$b = 2;
+@@{ "\2" }@@
+$c = 3;
+-----
+Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
+Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
+array(
+    0: Expr_Assign(
+        var: Expr_Variable(
+            name: a
+        )
+        expr: Scalar_LNumber(
+            value: 1
+        )
+    )
+    1: Expr_Assign(
+        var: Expr_Variable(
+            name: b
+        )
+        expr: Scalar_LNumber(
+            value: 2
+        )
+    )
+    2: Expr_Assign(
+        var: Expr_Variable(
+            name: c
+        )
+        expr: Scalar_LNumber(
+            value: 3
+        )
+    )
+)
+-----
+<?php
+
+if ($b) {
+    $a = 1;
+    /* unterminated
+}
+-----
+Unterminated comment from 5:5 to 6:2
+Syntax error, unexpected EOF from 6:2 to 6:2