@@ -9,6 +9,7 @@ class Lexer
9
9
{
10
10
protected $ code ;
11
11
protected $ tokens ;
12
+ protected $ errors ;
12
13
protected $ pos ;
13
14
protected $ line ;
14
15
protected $ filePos ;
@@ -49,11 +50,22 @@ public function __construct(array $options = array()) {
49
50
/**
50
51
* Initializes the lexer for lexing the provided source code.
51
52
*
52
- * @param string $code The source code to lex
53
+ * This function does not throw if lexing errors occur. Instead, errors may be retrieved using
54
+ * the getErrors() method.
53
55
*
54
- * @throws Error on lexing errors (unterminated comment or unexpected character)
56
+ * @param string $code The source code to lex
55
57
*/
56
58
public function startLexing ($ code ) {
59
+ $ this ->code = $ code ; // keep the code around for __halt_compiler() handling
60
+ $ this ->pos = -1 ;
61
+ $ this ->line = 1 ;
62
+ $ this ->filePos = 0 ;
63
+ $ this ->errors = [];
64
+
65
+ // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
66
+ // This ensures proper composability, because having a newline is the "safe" assumption.
67
+ $ this ->prevCloseTagHasNewline = true ;
68
+
57
69
$ scream = ini_set ('xdebug.scream ' , '0 ' );
58
70
59
71
$ this ->resetErrors ();
@@ -63,15 +75,6 @@ public function startLexing($code) {
63
75
if (false !== $ scream ) {
64
76
ini_set ('xdebug.scream ' , $ scream );
65
77
}
66
-
67
- $ this ->code = $ code ; // keep the code around for __halt_compiler() handling
68
- $ this ->pos = -1 ;
69
- $ this ->line = 1 ;
70
- $ this ->filePos = 0 ;
71
-
72
- // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
73
- // This ensures proper composability, because having a newline is the "safe" assumption.
74
- $ this ->prevCloseTagHasNewline = true ;
75
78
}
76
79
77
80
protected function resetErrors () {
@@ -85,32 +88,85 @@ protected function resetErrors() {
85
88
}
86
89
}
87
90
88
- protected function handleErrors () {
91
/**
 * Records one "unexpected character" error per byte of the source code in [$start, $end).
 *
 * @param int $start Offset into $this->code of the first byte to report
 * @param int $end   Offset one past the last byte to report
 * @param int $line  Line number attached to every error produced for this range
 */
private function handleInvalidCharacterRange($start, $end, $line) {
    for ($offset = $start; $offset < $end; ++$offset) {
        $char = $this->code[$offset];

        // PHP cuts error messages after a null byte, so that byte gets its own message
        $message = "\0" === $char
            ? 'Unexpected null byte'
            : sprintf('Unexpected character "%s" (ASCII %d)', $char, ord($char));

        // Every error covers exactly one byte: start and end positions coincide
        $attributes = [
            'startLine' => $line,
            'endLine' => $line,
            'startFilePos' => $offset,
            'endFilePos' => $offset,
        ];

        $this->errors[] = new Error($message, $attributes);
    }
}
110
+
111
/**
 * Checks whether a token is a block comment that lacks its closing delimiter.
 *
 * @param array|string $token Token as found in the token array (array form or single-character string)
 *
 * @return bool True if the token is a T_COMMENT or T_DOC_COMMENT starting with the block
 *              comment opener but not properly closed
 */
private function isUnterminatedComment($token) {
    if ($token[0] !== T_COMMENT && $token[0] !== T_DOC_COMMENT) {
        return false;
    }

    $text = $token[1];
    if (substr($text, 0, 2) !== '/*') {
        // Only block comments need a closing delimiter
        return false;
    }

    // The closing delimiter must not share its "*" with the opening one: the text "/*/"
    // ends in "*/" but is still open. A terminated block comment is at least 4 bytes long.
    return \strlen($text) < 4 || substr($text, -2) !== '*/';
}
116
+
117
/**
 * Tells whether the last tokenization run may have produced an error.
 *
 * @return bool Always true when HHVM_VERSION is defined; otherwise true only if
 *              error_get_last() holds an error whose message does not contain
 *              "Undefined variable"
 */
private function errorMayHaveOccurred() {
    // In HHVM token_get_all() does not throw warnings, so conservatively assume
    // that an error occurred
    if (defined('HHVM_VERSION')) {
        return true;
    }

    $lastError = error_get_last();
    if (null === $lastError) {
        return false;
    }

    // "Undefined variable" messages are not treated as lexing errors
    return false === strpos($lastError['message'], 'Undefined variable');
}
128
+
129
/**
 * Computes detailed lexing errors for the current token array and appends them to $this->errors.
 *
 * Returns immediately when errorMayHaveOccurred() reports that no error happened.
 * This method does not throw; errors are only collected.
 */
protected function handleErrors() {
    if (!$this->errorMayHaveOccurred()) {
        return;
    }

    // PHP's error handling for token_get_all() is rather bad, so if we want detailed
    // error information we need to compute it ourselves. Invalid character errors are
    // detected by finding "gaps" in the token array. Unterminated comments are detected
    // by checking if a trailing comment has a "*/" at the end.

    $filePos = 0;
    $line = 1;
    foreach ($this->tokens as $token) {
        $tokenValue = \is_string($token) ? $token : $token[1];
        $tokenLen = \strlen($tokenValue);

        if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
            // Something is missing, must be an invalid character
            $nextFilePos = strpos($this->code, $tokenValue, $filePos);
            $this->handleInvalidCharacterRange($filePos, $nextFilePos, $line);
            $filePos = $nextFilePos;
        }

        $filePos += $tokenLen;
        $line += substr_count($tokenValue, "\n");
    }

    // Invalid characters at the end of the input
    $codeLen = \strlen($this->code);
    if ($filePos !== $codeLen) {
        $this->handleInvalidCharacterRange($filePos, $codeLen, $line);
    }

    // With an empty token array there is no last token to inspect. This is reachable
    // when HHVM_VERSION is defined, because errorMayHaveOccurred() then always returns true.
    if (empty($this->tokens)) {
        return;
    }

    // Check for unterminated comment
    $lastToken = $this->tokens[count($this->tokens) - 1];
    if ($this->isUnterminatedComment($lastToken)) {
        $commentText = $lastToken[1];
        // NOTE(review): endFilePos here is one past the comment's last byte, whereas
        // handleInvalidCharacterRange() reports an inclusive end offset - confirm which
        // convention consumers of these attributes expect.
        $this->errors[] = new Error('Unterminated comment', [
            'startLine' => $line - substr_count($commentText, "\n"),
            'endLine' => $line,
            'startFilePos' => $filePos - \strlen($commentText),
            'endFilePos' => $filePos,
        ]);
    }
}
116
172
@@ -224,6 +280,15 @@ public function getTokens() {
224
280
return $ this ->tokens ;
225
281
}
226
282
283
/**
 * Gets the errors collected while lexing.
 *
 * The list is reset to an empty array each time startLexing() is called.
 *
 * @return Error[] Lexer errors, in the order they were recorded
 */
public function getErrors() {
    $collected = $this->errors;

    return $collected;
}
291
+
227
292
/**
228
293
* Handles __halt_compiler() by returning the text after it.
229
294
*
0 commit comments