SetBased /
antlr-php-runtime
| 1 | <?php |
||
| 2 | |||
| 3 | declare(strict_types=1); |
||
| 4 | |||
| 5 | namespace Antlr\Antlr4\Runtime; |
||
| 6 | |||
| 7 | use Antlr\Antlr4\Runtime\Atn\LexerATNSimulator; |
||
| 8 | use Antlr\Antlr4\Runtime\Error\Exceptions\LexerNoViableAltException; |
||
| 9 | use Antlr\Antlr4\Runtime\Error\Exceptions\RecognitionException; |
||
| 10 | use Antlr\Antlr4\Runtime\Utils\Pair; |
||
| 11 | |||
| 12 | /** |
||
| 13 | * A lexer is a recognizer that draws input symbols from a character stream. |
||
| 14 | * Lexer grammars result in a subclass of this object. A Lexer object |
||
| 15 | * uses simplified match() and error recovery mechanisms in the interest |
||
| 16 | * of speed. |
||
| 17 | */ |
||
| 18 | abstract class Lexer extends Recognizer implements TokenSource |
||
| 19 | { |
||
| 20 | public const DEFAULT_MODE = 0; |
||
| 21 | public const MORE = -2; |
||
| 22 | public const SKIP = -3; |
||
| 23 | |||
| 24 | public const DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL; |
||
| 25 | public const HIDDEN = Token::HIDDEN_CHANNEL; |
||
| 26 | public const MIN_CHAR_VALUE = 0x0000; |
||
| 27 | public const MAX_CHAR_VALUE = 0x10FFFF; |
||
| 28 | |||
| 29 | /** @var CharStream|null */ |
||
| 30 | public $input; |
||
| 31 | |||
| 32 | /** @var Pair Pair<TokenSource, CharStream> */ |
||
| 33 | protected $tokenFactorySourcePair; |
||
| 34 | |||
| 35 | /** @var TokenFactory */ |
||
| 36 | protected $factory; |
||
| 37 | |||
| 38 | /** |
||
| 39 | * The goal of all lexer rules/methods is to create a token object. |
||
| 40 | * This is an instance variable as multiple rules may collaborate to |
||
| 41 | * create a single token. `nextToken` will return this object after |
||
| 42 | * matching lexer rule(s). |
||
| 43 | * |
||
| 44 | * If you subclass to allow multiple token emissions, then set this |
||
| 45 | * to the last token to be matched or something nonnull so that |
||
| 46 | * the auto token emit mechanism will not emit another token. |
||
| 47 | * |
||
| 48 | * @var Token|null |
||
| 49 | */ |
||
| 50 | public $token; |
||
| 51 | |||
| 52 | /** |
||
| 53 | * What character index in the stream did the current token start at? |
||
| 54 | * Needed, for example, to get the text for current token. Set at |
||
| 55 | * the start of nextToken. |
||
| 56 | * |
||
| 57 | * @var int |
||
| 58 | */ |
||
| 59 | public $tokenStartCharIndex = -1; |
||
| 60 | |||
| 61 | /** |
||
| 62 | * The line on which the first character of the token resides. |
||
| 63 | * |
||
| 64 | * @var int |
||
| 65 | */ |
||
| 66 | public $tokenStartLine = -1; |
||
| 67 | |||
| 68 | /** |
||
| 69 | * The character position of first character within the line |
||
| 70 | * |
||
| 71 | * @var int |
||
| 72 | */ |
||
| 73 | public $tokenStartCharPositionInLine = -1; |
||
| 74 | |||
| 75 | /** |
||
| 76 | * Once we see EOF on char stream, next token will be EOF. |
||
| 77 | * If you have DONE : EOF ; then you see DONE EOF. |
||
| 78 | * |
||
| 79 | * @var bool |
||
| 80 | */ |
||
| 81 | public $hitEOF = false; |
||
| 82 | |||
| 83 | /** |
||
| 84 | * The channel number for the current token. |
||
| 85 | * |
||
| 86 | * @var int |
||
| 87 | */ |
||
| 88 | public $channel = Token::DEFAULT_CHANNEL; |
||
| 89 | |||
| 90 | /** |
||
| 91 | * The token type for the current token. |
||
| 92 | * |
||
| 93 | * @var int |
||
| 94 | */ |
||
| 95 | public $type = Token::INVALID_TYPE; |
||
| 96 | |||
| 97 | /** @var array<int> */ |
||
| 98 | public $modeStack = []; |
||
| 99 | |||
| 100 | /** @var int */ |
||
| 101 | public $mode = self::DEFAULT_MODE; |
||
| 102 | |||
| 103 | /** |
||
| 104 | * You can set the text for the current token to override what is in the |
||
| 105 | * input char buffer. Use {@see Lexer::setText()} or set this instance variable directly. |
||
| 106 | * |
||
| 107 | * @var string|null |
||
| 108 | */ |
||
| 109 | public $text; |
||
| 110 | |||
| 111 | /** @var LexerATNSimulator|null */ |
||
| 112 | protected $interp; |
||
| 113 | |||
/**
 * Creates a lexer, optionally bound to a character stream.
 *
 * Stores the stream, the factory returned by CommonTokenFactory::default()
 * and a Pair of this lexer and the stream (later handed to the token
 * factory by {@see Lexer::emit()}).
 *
 * @param CharStream|null $input Stream to read from, or null when the
 *                               stream is supplied later.
 */
public function __construct(?CharStream $input = null)
{
    parent::__construct();

    // Child classes must populate the interpreter.
    // @todo remove this property
    $this->interp = null;

    $this->input = $input;
    $this->tokenFactorySourcePair = new Pair($this, $input);
    $this->factory = CommonTokenFactory::default();
}
| 125 | |||
/**
 * Puts the lexer back into its initial state.
 *
 * Seeks the input stream (when one is set) to index 0, clears the
 * per-token state, empties the mode stack and resets the interpreter
 * (when one is set).
 */
public function reset() : void
{
    // Rewind the input.
    if ($this->input !== null) {
        $this->input->seek(0);
    }

    // Forget the token that was being built.
    $this->token = null;
    $this->text = null;
    $this->type = Token::INVALID_TYPE;
    $this->channel = Token::DEFAULT_CHANNEL;
    $this->tokenStartCharIndex = -1;
    $this->tokenStartLine = -1;
    $this->tokenStartCharPositionInLine = -1;

    // Default mode, empty mode stack, EOF not seen yet.
    $this->modeStack = [];
    $this->mode = self::DEFAULT_MODE;
    $this->hitEOF = false;

    if ($this->interp !== null) {
        $this->interp->reset();
    }
}
| 149 | |||
/**
 * Return a token from this source; i.e., match a token on the char stream.
 *
 * When matching ends with the type {@see Lexer::SKIP}, no token is produced
 * and matching starts over with a fresh token. While the type is
 * {@see Lexer::MORE}, matching continues into the same token. Once
 * lookahead has reached EOF, the next pass through the loop emits an EOF
 * token.
 *
 * @return Token|null The value of {@see Lexer::$token} after matching.
 *
 * @throws \RuntimeException When no input stream is set or the interpreter
 *                           is not a LexerATNSimulator.
 */
public function nextToken() : ?Token
{
    if ($this->input === null) {
        throw new \RuntimeException('NextToken requires a non-null input stream.');
    }

    // Mark start location in char stream so unbuffered streams are
    // guaranteed to have at least the text of the current token.
    $tokenStartMarker = $this->input->mark();

    try {
        while (true) {
            if ($this->hitEOF) {
                $this->emitEOF();

                return $this->token;
            }

            // `instanceof` is false for null, so a missing interpreter is rejected too.
            if (!($this->interp instanceof LexerATNSimulator)) {
                throw new \RuntimeException('Unexpected interpreter type.');
            }

            // Begin a fresh token at the current position.
            $this->token = null;
            $this->text = null;
            $this->channel = Token::DEFAULT_CHANNEL;
            $this->tokenStartCharIndex = $this->input->getIndex();
            $this->tokenStartCharPositionInLine = $this->interp->getCharPositionInLine();
            $this->tokenStartLine = $this->interp->getLine();

            do {
                $this->type = Token::INVALID_TYPE;

                try {
                    $matchedType = $this->interp->match($this->input, $this->mode);
                } catch (LexerNoViableAltException $e) {
                    $this->notifyListeners($e); // report error
                    $this->recover($e);

                    // Nothing was matched: treat the offending input as skipped.
                    $matchedType = self::SKIP;
                }

                if ($this->input->LA(1) === Token::EOF) {
                    $this->hitEOF = true;
                }

                // Keep a type that was set during matching (e.g. through skip(),
                // more() or setType()); otherwise use the matched type.
                if ($this->type === Token::INVALID_TYPE) {
                    $this->type = $matchedType;
                }

                if ($this->type === self::SKIP) {
                    // No token to emit: restart the outer loop with a fresh token.
                    continue 2;
                }
            } while ($this->type === self::MORE);

            if ($this->token === null) {
                $this->emit();
            }

            return $this->token;
        }
    } finally {
        // Make sure we release the marker after the match, or an
        // unbuffered char stream will keep buffering.
        $this->input->release($tokenStartMarker);
    }
}
| 228 | |||
/**
 * Instructs the lexer to skip creating a token for the current lexer rule
 * and to look for another token. {@see Lexer::nextToken()} keeps looking
 * when matching finishes with the type set to {@see Lexer::SKIP}.
 */
public function skip() : void
{
    $this->type = self::SKIP;
}
| 240 | |||
/**
 * Sets the current token type to {@see Lexer::MORE}, which makes
 * {@see Lexer::nextToken()} continue matching into the same token.
 */
public function more() : void
{
    $this->type = self::MORE;
}
| 245 | |||
/**
 * Switches the lexer to the given mode without touching the mode stack.
 *
 * @param int $m The mode to make current.
 */
public function mode(int $m) : void
{
    $this->mode = $m;
}
| 250 | |||
/**
 * Saves the current mode on the mode stack and switches to the given mode.
 *
 * @param int $m The mode to make current.
 */
public function pushMode(int $m) : void
{
    \array_push($this->modeStack, $this->mode);
    $this->mode($m);
}
| 257 | |||
/**
 * Restores the most recently pushed mode and makes it current.
 *
 * @return int The mode that is current after popping.
 *
 * @throws \RuntimeException When the mode stack is empty.
 */
public function popMode() : int
{
    if ($this->modeStack === []) {
        throw new \RuntimeException('Empty Stack');
    }

    $previousMode = \array_pop($this->modeStack);
    $this->mode($previousMode);

    return $this->mode;
}
| 268 | |||
/**
 * Returns the source name reported by the input stream, or an empty
 * string when no input stream is set.
 */
public function getSourceName() : string
{
    if ($this->input === null) {
        return '';
    }

    return $this->input->getSourceName();
}
| 273 | |||
/**
 * Returns the stream this lexer currently reads from, or null when none is set.
 */
public function getInputStream() : ?IntStream
{
    return $this->input;
}
| 278 | |||
/**
 * Returns the factory used to create tokens.
 */
public function getTokenFactory() : TokenFactory
{
    return $this->factory;
}
| 283 | |||
/**
 * Replaces the factory used to create tokens.
 *
 * @param TokenFactory $factory The factory to use from now on.
 */
public function setTokenFactory(TokenFactory $factory) : void
{
    $this->factory = $factory;
}
| 288 | |||
/**
 * Replaces the stream this lexer reads from and resets the lexer.
 *
 * The argument is validated before any state is modified, so a rejected
 * stream leaves the lexer and its current input untouched.
 *
 * @param IntStream $input The new input; must be a CharStream.
 *
 * @throws \RuntimeException When $input is not a CharStream.
 */
public function setInputStream(IntStream $input) : void
{
    // Validate first: failing after the reset would leave the lexer without input.
    if (!$input instanceof CharStream) {
        throw new \RuntimeException('Input must be CharStream.');
    }

    // Detach the previous stream so that reset() does not seek on it.
    $this->input = null;
    $this->tokenFactorySourcePair = new Pair($this, $this->input);

    $this->reset();

    $this->input = $input;
    $this->tokenFactorySourcePair = new Pair($this, $this->input);
}
| 303 | |||
/**
 * Stores the given token as the current token.
 *
 * By default multiple emits per nextToken invocation are not supported,
 * for efficiency reasons. To support them, subclass and override this
 * method, nextToken and getToken so that tokens are pushed into a list
 * and pulled from that list rather than kept in a single variable.
 *
 * @param Token $token The token to emit.
 */
public function emitToken(Token $token) : void
{
    $this->token = $token;
}
| 314 | |||
/**
 * Creates a token from the current lexer state and emits it.
 *
 * The token is built by the token factory from the current type, text
 * override, channel and start position, and ends one character before the
 * current char index. Override this method to emit custom Token objects,
 * or provide a different factory.
 *
 * @return Token The token that was created and emitted.
 */
public function emit() : Token
{
    // The token stops on the character just before the current index.
    $stopIndex = $this->getCharIndex() - 1;

    $created = $this->factory->createEx(
        $this->tokenFactorySourcePair,
        $this->type,
        $this->text,
        $this->channel,
        $this->tokenStartCharIndex,
        $stopIndex,
        $this->tokenStartLine,
        $this->tokenStartCharPositionInLine
    );

    $this->emitToken($created);

    return $created;
}
| 339 | |||
/**
 * Creates an EOF token at the current input position and emits it.
 *
 * The token is created on the default channel without a text override;
 * its stop index is one less than its start index.
 *
 * @return Token The EOF token that was emitted.
 *
 * @throws \RuntimeException When no input stream is set.
 */
public function emitEOF() : Token
{
    if ($this->input === null) {
        throw new \RuntimeException('Cannot emit EOF for null stream.');
    }

    $column = $this->getCharPositionInLine();
    $line = $this->getLine();
    $startIndex = $this->input->getIndex();
    $stopIndex = $this->input->getIndex() - 1;

    $eofToken = $this->factory->createEx(
        $this->tokenFactorySourcePair,
        Token::EOF,
        null,
        Token::DEFAULT_CHANNEL,
        $startIndex,
        $stopIndex,
        $line,
        $column
    );

    $this->emitToken($eofToken);

    return $eofToken;
}
| 363 | |||
/**
 * Returns the line reported by the interpreter.
 *
 * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
 */
public function getLine() : int
{
    // `instanceof` is false for null, so a missing interpreter is rejected too.
    if (!($this->interp instanceof LexerATNSimulator)) {
        throw new \RuntimeException('Unexpected interpreter type.');
    }

    return $this->interp->getLine();
}
| 372 | |||
/**
 * Forwards the given line number to the interpreter.
 *
 * @param int $line The line number to set.
 *
 * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
 */
public function setLine(int $line) : void
{
    // `instanceof` is false for null, so a missing interpreter is rejected too.
    if (!($this->interp instanceof LexerATNSimulator)) {
        throw new \RuntimeException('Unexpected interpreter type.');
    }

    $this->interp->setLine($line);
}
| 381 | |||
/**
 * Returns the character position within the line reported by the interpreter.
 *
 * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
 */
public function getCharPositionInLine() : int
{
    // `instanceof` is false for null, so a missing interpreter is rejected too.
    if (!($this->interp instanceof LexerATNSimulator)) {
        throw new \RuntimeException('Unexpected interpreter type.');
    }

    return $this->interp->getCharPositionInLine();
}
| 390 | |||
/**
 * Forwards the given character position within the line to the interpreter.
 *
 * @param int $charPositionInLine The position to set.
 *
 * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
 */
public function setCharPositionInLine(int $charPositionInLine) : void
{
    // `instanceof` is false for null, so a missing interpreter is rejected too.
    if (!($this->interp instanceof LexerATNSimulator)) {
        throw new \RuntimeException('Unexpected interpreter type.');
    }

    $this->interp->setCharPositionInLine($charPositionInLine);
}
| 399 | |||
/**
 * Returns the index of the current character of lookahead, as reported
 * by the input stream.
 *
 * @throws \RuntimeException When no input stream is set.
 */
public function getCharIndex() : int
{
    if ($this->input !== null) {
        return $this->input->getIndex();
    }

    throw new \RuntimeException('Cannot know char index for null stream.');
}
| 411 | |||
/**
 * Returns the text override when one is set; otherwise the text the
 * interpreter reports for the current token, or an empty string when no
 * input stream is set.
 *
 * @throws \RuntimeException When there is no text override and the
 *                           interpreter is not a LexerATNSimulator.
 */
public function getText() : string
{
    // An explicit override always wins.
    if ($this->text !== null) {
        return $this->text;
    }

    // `instanceof` is false for null, so a missing interpreter is rejected too.
    if (!($this->interp instanceof LexerATNSimulator)) {
        throw new \RuntimeException('Unexpected interpreter type.');
    }

    if ($this->input === null) {
        return '';
    }

    return $this->interp->getText($this->input);
}
| 427 | |||
/**
 * Sets the complete text of the current token, replacing any text that
 * was set before.
 *
 * @param string $text The text to use for the token.
 */
public function setText(string $text) : void
{
    $this->text = $text;
}
| 435 | |||
/**
 * Returns the current token, or null when none has been set.
 */
public function getToken() : ?Token
{
    return $this->token;
}
| 440 | |||
/**
 * Stores the given token as the current token.
 * Override if emitting multiple tokens.
 *
 * @param Token $token The token to store.
 */
public function setToken(Token $token) : void
{
    $this->token = $token;
}
| 448 | |||
/**
 * Returns the token type of the current token.
 */
public function getType() : int
{
    return $this->type;
}
| 453 | |||
/**
 * Sets the token type of the current token.
 *
 * @param int $type The token type to use.
 */
public function setType(int $type) : void
{
    $this->type = $type;
}
| 458 | |||
/**
 * Returns the channel number of the current token.
 */
public function getChannel() : int
{
    return $this->channel;
}
| 463 | |||
/**
 * Sets the channel number of the current token.
 *
 * @param int $channel The channel number to use.
 */
public function setChannel(int $channel) : void
{
    $this->channel = $channel;
}
| 468 | |||
/**
 * Returns the channel names; this base implementation has none and
 * returns null (presumably overridden by concrete lexers).
 *
 * @return array<string>|null
 */
public function getChannelNames() : ?array
{
    return null;
}
| 476 | |||
/**
 * Returns the mode names; this base implementation has none and
 * returns null (presumably overridden by concrete lexers).
 *
 * @return array<string>|null
 */
public function getModeNames() : ?array
{
    return null;
}
| 484 | |||
/**
 * Returns all tokens that {@see Lexer::nextToken()} produces up to, but
 * not including, the EOF token. Forces load of all tokens.
 *
 * @return array<Token>
 */
public function getAllTokens() : array
{
    $collected = [];

    for (
        $current = $this->nextToken();
        $current !== null && $current->getType() !== Token::EOF;
        $current = $this->nextToken()
    ) {
        $collected[] = $current;
    }

    return $collected;
}
| 503 | |||
/**
 * Recovers from a recognition error by consuming one character.
 *
 * Lexers can normally match any char in their vocabulary after matching
 * a token, so do the easy thing: drop a character and hope it all works
 * out. You can instead use the rule invocation stack to do sophisticated
 * error recovery if you are in a fragment rule.
 *
 * Nothing is consumed when no input stream is set or lookahead is at EOF.
 *
 * @param RecognitionException $re The error to recover from.
 */
public function recover(RecognitionException $re) : void
{
    if ($this->input === null || $this->input->LA(1) === Token::EOF) {
        return;
    }

    if ($re instanceof LexerNoViableAltException && $this->interp !== null) {
        // Skip a char through the interpreter and try again.
        $this->interp->consume($this->input);

        return;
    }

    // TODO: Do we lose character or line position information?
    $this->input->consume();
}
| 522 | |||
/**
 * Reports a token recognition error to the error listener dispatch.
 *
 * The message quotes the input text from the start index of the current
 * token up to the current input index, or an empty string when no input
 * stream is set.
 *
 * @param LexerNoViableAltException $e The error to report.
 */
public function notifyListeners(LexerNoViableAltException $e) : void
{
    $offendingText = '';

    if ($this->input !== null) {
        $offendingText = $this->input->getText(
            $this->tokenStartCharIndex,
            $this->input->getIndex()
        );
    }

    $message = \sprintf('token recognition error at: \'%s\'', $offendingText);

    $this->getErrorListenerDispatch()->syntaxError(
        $this,
        null,
        $this->tokenStartLine,
        $this->tokenStartCharPositionInLine,
        $message,
        $e
    );
}
| 545 | } |
||
| 546 |