Skip to content

Commit 17d1e73

Browse files
committed
Don't use ~__EMU sequences in emulative lexer
These were necessary back in the day when we had to emulate some complex functionality such as nowdoc strings. Now we can simply translate certain token sequences directly. The motivation for this change is to avoid preprocessing the source code, which would complicate offset-aware error handling inside the lexer, as offsets would no longer be correct.
1 parent 9e5d3bb commit 17d1e73

File tree

1 file changed

+75
-103
lines changed

1 file changed

+75
-103
lines changed

lib/PhpParser/Lexer/Emulative.php

Lines changed: 75 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44

55
use PhpParser\Parser\Tokens;
66

7-
/**
8-
* ATTENTION: This code is WRITE-ONLY. Do not try to read it.
9-
*/
107
class Emulative extends \PhpParser\Lexer
118
{
129
protected $newKeywords;
@@ -56,128 +53,103 @@ public function __construct(array $options = array()) {
5653
public function startLexing($code) {
5754
$this->inObjectAccess = false;
5855

59-
$preprocessedCode = $this->preprocessCode($code);
60-
parent::startLexing($preprocessedCode);
61-
if ($preprocessedCode !== $code) {
62-
$this->postprocessTokens();
56+
parent::startLexing($code);
57+
if ($this->requiresEmulation($code)) {
58+
$this->emulateTokens();
6359
}
64-
65-
// Set code property back to the original code, so __halt_compiler()
66-
// handling and (start|end)FilePos attributes use the correct offsets
67-
$this->code = $code;
6860
}
6961

7062
/*
71-
* Replaces new features in the code by ~__EMU__{NAME}__{DATA}__~ sequences.
72-
* ~LABEL~ is never valid PHP code, that's why we can (to some degree) safely
73-
* use it here.
74-
* Later when preprocessing the tokens these sequences will either be replaced
75-
* by real tokens or replaced with their original content (e.g. if they occurred
76-
* inside a string, i.e. a place where they don't have a special meaning).
63+
* Checks if the code is potentially using features that require emulation.
7764
*/
78-
protected function preprocessCode($code) {
65+
protected function requiresEmulation($code) {
7966
if (version_compare(PHP_VERSION, self::PHP_7_0, '>=')) {
80-
return $code;
67+
return false;
8168
}
8269

83-
$code = str_replace('??', '~__EMU__COALESCE__~', $code);
84-
$code = str_replace('<=>', '~__EMU__SPACESHIP__~', $code);
85-
$code = preg_replace_callback('(yield[ \n\r\t]+from)', function($matches) {
86-
// Encoding $0 in order to preserve exact whitespace
87-
return '~__EMU__YIELDFROM__' . bin2hex($matches[0]) . '__~';
88-
}, $code);
70+
if (preg_match('/\?\?|<=>|yield[ \n\r\t]+from/', $code)) {
71+
return true;
72+
}
8973

9074
if (version_compare(PHP_VERSION, self::PHP_5_6, '>=')) {
91-
return $code;
75+
return false;
9276
}
9377

94-
$code = str_replace('...', '~__EMU__ELLIPSIS__~', $code);
95-
$code = preg_replace('((?<!/)\*\*=)', '~__EMU__POWEQUAL__~', $code);
96-
$code = preg_replace('((?<!/)\*\*(?!/))', '~__EMU__POW__~', $code);
97-
98-
return $code;
78+
return preg_match('(\.\.\.|(?<!/)\*\*(?!/))', $code);
9979
}
10080

10181
/*
102-
* Replaces the ~__EMU__...~ sequences with real tokens or their original
103-
* value.
82+
* Emulates tokens for newer PHP versions.
10483
*/
105-
protected function postprocessTokens() {
106-
// we need to manually iterate and manage a count because we'll change
84+
protected function emulateTokens() {
85+
// We need to manually iterate and manage a count because we'll change
10786
// the tokens array on the way
87+
$line = 1;
10888
for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) {
109-
// first check that the following tokens are of form ~LABEL~,
110-
// then match the __EMU__... sequence.
111-
if ('~' === $this->tokens[$i]
112-
&& isset($this->tokens[$i + 2])
113-
&& '~' === $this->tokens[$i + 2]
114-
&& T_STRING === $this->tokens[$i + 1][0]
115-
&& preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches)
116-
) {
117-
if ('ELLIPSIS' === $matches[1]) {
118-
$replace = array(
119-
array(self::T_ELLIPSIS, '...', $this->tokens[$i + 1][2])
120-
);
121-
} else if ('POW' === $matches[1]) {
122-
$replace = array(
123-
array(self::T_POW, '**', $this->tokens[$i + 1][2])
124-
);
125-
} else if ('POWEQUAL' === $matches[1]) {
126-
$replace = array(
127-
array(self::T_POW_EQUAL, '**=', $this->tokens[$i + 1][2])
128-
);
129-
} else if ('COALESCE' === $matches[1]) {
130-
$replace = array(
131-
array(self::T_COALESCE, '??', $this->tokens[$i + 1][2])
132-
);
133-
} else if ('SPACESHIP' === $matches[1]) {
134-
$replace = array(
135-
array(self::T_SPACESHIP, '<=>', $this->tokens[$i + 1][2]),
136-
);
137-
} else if ('YIELDFROM' === $matches[1]) {
138-
$content = hex2bin($matches[2]);
139-
$replace = array(
140-
array(self::T_YIELD_FROM, $content, $this->tokens[$i + 1][2] - substr_count($content, "\n"))
141-
);
142-
} else {
143-
throw new \RuntimeException('Invalid __EMU__ sequence');
89+
$replace = null;
90+
if (isset($this->tokens[$i + 1])) {
91+
if ($this->tokens[$i] === '?' && $this->tokens[$i + 1] === '?') {
92+
array_splice($this->tokens, $i, 2, array(
93+
array(self::T_COALESCE, '??', $line)
94+
));
95+
$c--;
96+
continue;
97+
}
98+
if ($this->tokens[$i][0] === T_IS_SMALLER_OR_EQUAL
99+
&& $this->tokens[$i + 1] === '>'
100+
) {
101+
array_splice($this->tokens, $i, 2, array(
102+
array(self::T_SPACESHIP, '<=>', $line)
103+
));
104+
$c--;
105+
continue;
144106
}
107+
if ($this->tokens[$i] === '*' && $this->tokens[$i + 1] === '*') {
108+
array_splice($this->tokens, $i, 2, array(
109+
array(self::T_POW, '**', $line)
110+
));
111+
$c--;
112+
continue;
113+
}
114+
if ($this->tokens[$i] === '*' && $this->tokens[$i + 1][0] === T_MUL_EQUAL) {
115+
array_splice($this->tokens, $i, 2, array(
116+
array(self::T_POW_EQUAL, '**=', $line)
117+
));
118+
$c--;
119+
continue;
120+
}
121+
}
145122

146-
array_splice($this->tokens, $i, 3, $replace);
147-
$c -= 3 - count($replace);
148-
// for multichar tokens (e.g. strings) replace any ~__EMU__...~ sequences
149-
// in their content with the original character sequence
150-
} elseif (is_array($this->tokens[$i])
151-
&& 0 !== strpos($this->tokens[$i][1], '__EMU__')
152-
) {
153-
$this->tokens[$i][1] = preg_replace_callback(
154-
'(~__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?~)',
155-
array($this, 'restoreContentCallback'),
156-
$this->tokens[$i][1]
157-
);
123+
if (isset($this->tokens[$i + 2])) {
124+
if ($this->tokens[$i][0] === T_YIELD && $this->tokens[$i + 1][0] === T_WHITESPACE
125+
&& $this->tokens[$i + 2][0] === T_STRING
126+
&& !strcasecmp($this->tokens[$i + 2][1], 'from')
127+
) {
128+
array_splice($this->tokens, $i, 3, array(
129+
array(
130+
self::T_YIELD_FROM,
131+
$this->tokens[$i][1] . $this->tokens[$i + 1][1] . $this->tokens[$i + 2][1],
132+
$line
133+
)
134+
));
135+
$c -= 2;
136+
$line += substr_count($this->tokens[$i][1], "\n");
137+
continue;
138+
}
139+
if ($this->tokens[$i] === '.' && $this->tokens[$i + 1] === '.'
140+
&& $this->tokens[$i + 2] === '.'
141+
) {
142+
array_splice($this->tokens, $i, 3, array(
143+
array(self::T_ELLIPSIS, '...', $line)
144+
));
145+
$c -= 2;
146+
continue;
147+
}
158148
}
159-
}
160-
}
161149

162-
/*
163-
* This method is a callback for restoring EMU sequences in
164-
* multichar tokens (like strings) to their original value.
165-
*/
166-
public function restoreContentCallback(array $matches) {
167-
if ('ELLIPSIS' === $matches[1]) {
168-
return '...';
169-
} else if ('POW' === $matches[1]) {
170-
return '**';
171-
} else if ('POWEQUAL' === $matches[1]) {
172-
return '**=';
173-
} else if ('COALESCE' === $matches[1]) {
174-
return '??';
175-
} else if ('SPACESHIP' === $matches[1]) {
176-
return '<=>';
177-
} else if ('YIELDFROM' === $matches[1]) {
178-
return hex2bin($matches[2]);
179-
} else {
180-
return $matches[0];
150+
if (\is_array($this->tokens[$i])) {
151+
$line += substr_count($this->tokens[$i][1], "\n");
152+
}
181153
}
182154
}
183155

0 commit comments

Comments
 (0)