1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Antlr\Antlr4\Runtime; |
||
6 | |||
7 | use Antlr\Antlr4\Runtime\Atn\LexerATNSimulator; |
||
8 | use Antlr\Antlr4\Runtime\Error\Exceptions\LexerNoViableAltException; |
||
9 | use Antlr\Antlr4\Runtime\Error\Exceptions\RecognitionException; |
||
10 | use Antlr\Antlr4\Runtime\Utils\Pair; |
||
11 | |||
/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer extends Recognizer implements TokenSource
{
    /** The mode a lexer starts in and returns to on {@see Lexer::reset()}. */
    public const DEFAULT_MODE = 0;

    /** Pseudo token type set by {@see Lexer::more()}: keep matching into the same token. */
    public const MORE = -2;

    /** Pseudo token type set by {@see Lexer::skip()}: discard the match and look for another token. */
    public const SKIP = -3;

    public const DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL;
    public const HIDDEN = Token::HIDDEN_CHANNEL;
    public const MIN_CHAR_VALUE = 0x0000;
    public const MAX_CHAR_VALUE = 0x10FFFF;

    /** @var CharStream|null */
    public $input;

    /** @var Pair Pair<TokenSource, CharStream> */
    protected $tokenFactorySourcePair;

    /** @var TokenFactory */
    protected $factory;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. `nextToken` will return this object after
     * matching lexer rule(s).
     *
     * If you subclass to allow multiple token emissions, then set this
     * to the last token to be matched or something non-null so that
     * the auto token emit mechanism will not emit another token.
     *
     * @var Token|null
     */
    public $token;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     *
     * @var int
     */
    public $tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides.
     *
     * @var int
     */
    public $tokenStartLine = -1;

    /**
     * The character position of the first character within the line.
     *
     * @var int
     */
    public $tokenStartCharPositionInLine = -1;

    /**
     * Once we see EOF on char stream, next token will be EOF.
     * If you have DONE : EOF ; then you see DONE EOF.
     *
     * @var bool
     */
    public $hitEOF = false;

    /**
     * The channel number for the current token.
     *
     * @var int
     */
    public $channel = Token::DEFAULT_CHANNEL;

    /**
     * The token type for the current token.
     *
     * @var int
     */
    public $type = Token::INVALID_TYPE;

    /**
     * Modes saved by {@see Lexer::pushMode()}, most recent last.
     *
     * @var array<int>
     */
    public $modeStack = [];

    /**
     * The mode handed to the interpreter on the next match.
     *
     * @var int
     */
    public $mode = self::DEFAULT_MODE;

    /**
     * You can set the text for the current token to override what is in the
     * input char buffer. Use {@see Lexer::setText()} or set this instance var.
     *
     * @var string|null
     */
    public $text;

    /** @var LexerATNSimulator|null */
    protected $interp;

    /**
     * @param CharStream|null $input The character stream to tokenize; may be
     *                               supplied later via {@see Lexer::setInputStream()}.
     */
    public function __construct(?CharStream $input = null)
    {
        parent::__construct();

        $this->input = $input;
        $this->factory = CommonTokenFactory::default();
        $this->tokenFactorySourcePair = new Pair($this, $input);

        // @todo remove this property
        $this->interp = null;// child classes must populate this
    }

    /**
     * Rewind the attached input (if any) and restore every piece of per-token
     * and mode state to its initial value, then reset the interpreter (if any).
     */
    public function reset() : void
    {
        if ($this->input !== null) {
            $this->input->seek(0);// rewind the input
        }

        $this->token = null;
        $this->type = Token::INVALID_TYPE;
        $this->channel = Token::DEFAULT_CHANNEL;
        $this->tokenStartCharIndex = -1;
        $this->tokenStartCharPositionInLine = -1;
        $this->tokenStartLine = -1;
        $this->text = null;

        $this->hitEOF = false;
        $this->mode = self::DEFAULT_MODE;
        $this->modeStack = [];

        if ($this->interp !== null) {
            $this->interp->reset();
        }
    }

    /**
     * Return a token from this source; i.e., match a token on the char stream.
     *
     * @throws \RuntimeException If no input stream is attached, or the
     *                           interpreter is not a LexerATNSimulator.
     */
    public function nextToken() : ?Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('NextToken requires a non-null input stream.');
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        $tokenStartMarker = $this->input->mark();

        try {
            while (true) {
                if ($this->hitEOF) {
                    $this->emitEOF();

                    return $this->token;
                }

                $interp = $this->lexerInterpreter();

                $this->token = null;
                $this->channel = Token::DEFAULT_CHANNEL;
                $this->tokenStartCharIndex = $this->input->getIndex();
                $this->tokenStartCharPositionInLine = $interp->getCharPositionInLine();
                $this->tokenStartLine = $interp->getLine();
                $this->text = null;

                do {
                    $this->type = Token::INVALID_TYPE;

                    // Not dead code: when match() throws, this value survives
                    // the catch block and makes the failed attempt a SKIP.
                    $ttype = self::SKIP;

                    try {
                        $ttype = $interp->match($this->input, $this->mode);
                    } catch (LexerNoViableAltException $e) {
                        $this->notifyListeners($e); // report error
                        $this->recover($e);
                    }

                    if ($this->input->LA(1) === Token::EOF) {
                        $this->hitEOF = true;
                    }

                    // Only adopt the matched type if nothing set $this->type
                    // (e.g. via skip(), more() or setType()) during the match.
                    if ($this->type === Token::INVALID_TYPE) {
                        $this->type = $ttype;
                    }

                    if ($this->type === self::SKIP) {
                        // Start over with a fresh token start position.
                        continue 2;
                    }
                } while ($this->type === self::MORE);

                if ($this->token === null) {
                    $this->emit();
                }

                return $this->token;
            }
        } finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            $this->input->release($tokenStartMarker);
        }
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. `nextToken` knows to keep looking when
     * a lexer rule finishes with the type set to {@see Lexer::SKIP}. Recall
     * that if `token === null` at end of any token rule, it creates one for
     * you and emits it.
     */
    public function skip() : void
    {
        $this->type = self::SKIP;
    }

    /**
     * Mark the current match as {@see Lexer::MORE}; `nextToken` then keeps
     * matching without resetting the token start position.
     */
    public function more() : void
    {
        $this->type = self::MORE;
    }

    /**
     * Switch to mode `$m` without touching the mode stack.
     */
    public function mode(int $m) : void
    {
        $this->mode = $m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode `$m`.
     */
    public function pushMode(int $m) : void
    {
        $this->modeStack[] = $this->mode;

        $this->mode($m);
    }

    /**
     * Restore the most recently pushed mode.
     *
     * @return int The mode that is active after popping.
     *
     * @throws \RuntimeException If the mode stack is empty.
     */
    public function popMode() : int
    {
        if (\count($this->modeStack) === 0) {
            throw new \RuntimeException('Empty Stack');
        }

        $this->mode(\array_pop($this->modeStack));

        return $this->mode;
    }

    /**
     * The source name reported by the input stream, or an empty string when
     * no stream is attached.
     */
    public function getSourceName() : string
    {
        return $this->input === null ? '' : $this->input->getSourceName();
    }

    public function getInputStream() : ?IntStream
    {
        return $this->input;
    }

    public function getTokenFactory() : TokenFactory
    {
        return $this->factory;
    }

    public function setTokenFactory(TokenFactory $factory) : void
    {
        $this->factory = $factory;
    }

    /**
     * Attach a new input stream and reset all lexer state.
     *
     * @throws \RuntimeException If `$input` is not a CharStream. The lexer is
     *                           left untouched in that case.
     */
    public function setInputStream(IntStream $input) : void
    {
        // Validate before tearing anything down so that a rejected stream
        // does not leave the lexer reset and detached from its previous input.
        if (!$input instanceof CharStream) {
            throw new \RuntimeException('Input must be CharStream.');
        }

        // Detach the outgoing stream first: reset() rewinds whatever stream
        // is attached at the time it runs.
        $this->input = null;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);

        $this->reset();

        $this->input = $input;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public function emitToken(Token $token) : void
    {
        $this->token = $token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     *
     * @return Token The token that was created and emitted.
     */
    public function emit() : Token
    {
        $token = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            $this->type,
            $this->text,
            $this->channel,
            $this->tokenStartCharIndex,
            $this->getCharIndex() - 1, // stop index: last consumed character
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine
        );

        $this->emitToken($token);

        return $token;
    }

    /**
     * Create and emit an EOF token on the default channel, positioned at the
     * current input index with the interpreter's current line and column.
     *
     * @throws \RuntimeException If no input stream is attached, or the
     *                           interpreter is not a LexerATNSimulator.
     */
    public function emitEOF() : Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot emit EOF for null stream.');
        }

        $cpos = $this->getCharPositionInLine();
        $lpos = $this->getLine();
        $eof = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            Token::EOF,
            null,
            Token::DEFAULT_CHANNEL,
            $this->input->getIndex(),
            $this->input->getIndex() - 1,
            $lpos,
            $cpos
        );

        $this->emitToken($eof);

        return $eof;
    }

    /**
     * @throws \RuntimeException If the interpreter is not a LexerATNSimulator.
     */
    public function getLine() : int
    {
        return $this->lexerInterpreter()->getLine();
    }

    /**
     * @throws \RuntimeException If the interpreter is not a LexerATNSimulator.
     */
    public function setLine(int $line) : void
    {
        $this->lexerInterpreter()->setLine($line);
    }

    /**
     * @throws \RuntimeException If the interpreter is not a LexerATNSimulator.
     */
    public function getCharPositionInLine() : int
    {
        return $this->lexerInterpreter()->getCharPositionInLine();
    }

    /**
     * @throws \RuntimeException If the interpreter is not a LexerATNSimulator.
     */
    public function setCharPositionInLine(int $charPositionInLine) : void
    {
        $this->lexerInterpreter()->setCharPositionInLine($charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     *
     * @throws \RuntimeException If no input stream is attached.
     */
    public function getCharIndex() : int
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot know char index for null stream.');
        }

        return $this->input->getIndex();
    }

    /**
     * Return the text matched so far for the current token or any text override.
     *
     * @throws \RuntimeException If there is no override and the interpreter
     *                           is not a LexerATNSimulator.
     */
    public function getText() : string
    {
        if ($this->text !== null) {
            return $this->text;
        }

        $interp = $this->lexerInterpreter();

        return $this->input === null ? '' : $interp->getText($this->input);
    }

    /**
     * Set the complete text of this token; it wipes any previous changes to the text.
     */
    public function setText(string $text) : void
    {
        $this->text = $text;
    }

    public function getToken() : ?Token
    {
        return $this->token;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public function setToken(Token $token) : void
    {
        $this->token = $token;
    }

    public function getType() : int
    {
        return $this->type;
    }

    public function setType(int $type) : void
    {
        $this->type = $type;
    }

    public function getChannel() : int
    {
        return $this->channel;
    }

    public function setChannel(int $channel) : void
    {
        $this->channel = $channel;
    }

    /**
     * This base implementation returns null.
     *
     * @return array<string>|null
     */
    public function getChannelNames() : ?array
    {
        return null;
    }

    /**
     * This base implementation returns null.
     *
     * @return array<string>|null
     */
    public function getModeNames() : ?array
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     *
     * @return array<Token>
     */
    public function getAllTokens() : array
    {
        $tokens = [];
        $token = $this->nextToken();

        while ($token !== null && $token->getType() !== Token::EOF) {
            $tokens[] = $token;
            $token = $this->nextToken();
        }

        return $tokens;
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     *
     * Nothing is consumed when no input is attached or the input is at EOF.
     */
    public function recover(RecognitionException $re) : void
    {
        if ($this->input !== null && $this->input->LA(1) !== Token::EOF) {
            if ($re instanceof LexerNoViableAltException && $this->interp !== null) {
                // skip a char and try again
                $this->interp->consume($this->input);
            } else {
                // TODO: Do we lose character or line position information?
                $this->input->consume();
            }
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     * The message quotes the input text from the start of the current token
     * up to the current input index, or an empty string when no input is
     * attached.
     */
    public function notifyListeners(LexerNoViableAltException $e) : void
    {
        $start = $this->tokenStartCharIndex;

        if ($this->input === null) {
            $text = '';
        } else {
            $stop = $this->input->getIndex();
            $text = $this->input->getText($start, $stop);
        }

        $listener = $this->getErrorListenerDispatch();

        $listener->syntaxError(
            $this,
            null,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine,
            \sprintf('token recognition error at: \'%s\'', $text),
            $e
        );
    }

    /**
     * Return the interpreter, guaranteed to be a LexerATNSimulator.
     *
     * A null interpreter fails the `instanceof` test as well, so this single
     * check also covers the "child class never populated it" case.
     *
     * @throws \RuntimeException If the interpreter is missing or of another type.
     */
    private function lexerInterpreter() : LexerATNSimulator
    {
        if (!$this->interp instanceof LexerATNSimulator) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->interp;
    }
}
||
546 |