Passed
Pull Request — master (#196)
by Christoffer
03:09
created

Lexer::skipDigits()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 12
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 12
rs 9.4285
c 0
b 0
f 0
cc 3
eloc 7
nc 2
nop 1
1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
use Digia\GraphQL\Error\SyntaxErrorException;
6
7
class Lexer implements LexerInterface
8
{
9
    /**
     * Character encoding passed to the multibyte string functions used by this lexer.
     */
    protected const ENCODING = 'UTF-8';

    /**
     * Lookup table from the code point of a single-character punctuator
     * to the token kind that is emitted for it.
     *
     * @var array
     */
    protected static $codeTokenKindMap = [
        33  => TokenKindEnum::BANG,      // !
        36  => TokenKindEnum::DOLLAR,    // $
        38  => TokenKindEnum::AMP,       // &
        40  => TokenKindEnum::PAREN_L,   // (
        41  => TokenKindEnum::PAREN_R,   // )
        58  => TokenKindEnum::COLON,     // :
        61  => TokenKindEnum::EQUALS,    // =
        64  => TokenKindEnum::AT,        // @
        91  => TokenKindEnum::BRACKET_L, // [
        93  => TokenKindEnum::BRACKET_R, // ]
        123 => TokenKindEnum::BRACE_L,   // {
        124 => TokenKindEnum::PIPE,      // |
        125 => TokenKindEnum::BRACE_R,   // }
    ];

    /**
     * The source object being tokenized.
     *
     * @var Source
     */
    protected $source;

    /**
     * The raw text obtained from the source (see the constructor).
     *
     * @var string
     */
    protected $body;

    /**
     * The length of the body; positions at or beyond it are treated as end of input.
     *
     * @var int
     */
    protected $bodyLength;

    /**
     * Options supplied to the constructor, readable through getOption().
     *
     * @var array
     */
    protected $options = [];

    /**
     * The token that was current before the most recent call to advance().
     *
     * @var Token
     */
    protected $lastToken;

    /**
     * The token the lexer is currently positioned on.
     *
     * @var Token
     */
    protected $token;

    /**
     * The offset in the body at which the lexer is currently reading.
     *
     * @var int
     */
    protected $pos;

    /**
     * The (1-indexed) line number of the current position.
     *
     * @var int
     */
    protected $line;

    /**
     * The offset in the body at which the current line starts; used to derive columns.
     *
     * @var int
     */
    protected $lineStart;

    /**
     * Memoized character => code point lookups; static, so shared by all lexer instances.
     *
     * @var array
     */
    protected static $charCodeCache = [];
99
100
    /**
     * Lexer constructor.
     *
     * Positions the lexer on a synthetic start-of-file token and captures the
     * body of the given source for tokenization.
     *
     * @param Source $source  The source to tokenize.
     * @param array  $options Lexer options, readable through getOption().
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->lastToken = $startOfFileToken;
        $this->token     = $startOfFileToken;
        $this->pos       = 0;
        $this->line      = 1;
        $this->lineStart = 0;
        $this->source    = $source;
        $this->options   = $options;
        $this->body      = $source->getBody();
        // Positions are character offsets (characters are read with mb_substr), so the
        // length must be counted in characters too. A byte count would exceed the
        // character count for multibyte input and the end-of-file check would never fire.
        $this->bodyLength = \mb_strlen($this->body, self::ENCODING);
    }
118
119
    /**
     * Remembers the current token as the last token, then moves on to the
     * token returned by lookahead() and returns it.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
128
129
    /**
     * Returns the next non-comment token without changing the current token.
     *
     * Every token that is read (comments included) is linked to its
     * predecessor through setNext(). When the current token is already the
     * end-of-file token, that token is returned as is.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        if (TokenKindEnum::EOF === $current->getKind()) {
            return $current;
        }

        do {
            $following = $this->readToken($current);
            $current->setNext($following);
            $current = $following;
        } while (TokenKindEnum::COMMENT === $current->getKind());

        return $current;
    }
147
148
    /**
     * Returns the option stored under the given name, or the default when the
     * option is missing or null.
     *
     * @inheritdoc
     */
    public function getOption(string $name, $default = null)
    {
        if (isset($this->options[$name])) {
            return $this->options[$name];
        }

        return $default;
    }
155
156
    /**
     * Returns the kind of the current token.
     *
     * @inheritdoc
     */
    public function getTokenKind(): string
    {
        $current = $this->token;

        return $current->getKind();
    }
163
164
    /**
     * Returns the value of the current token (may be null).
     *
     * @inheritdoc
     */
    public function getTokenValue(): ?string
    {
        $current = $this->token;

        return $current->getValue();
    }
171
172
    /**
     * Returns the token the lexer is currently positioned on.
     *
     * @inheritdoc
     */
    public function getToken(): Token
    {
        $current = $this->token;

        return $current;
    }
179
180
    /**
     * Returns the source this lexer was constructed with.
     *
     * @inheritdoc
     */
    public function getSource(): Source
    {
        $source = $this->source;

        return $source;
    }
187
188
    /**
     * Returns the token that was current before the most recent advance().
     *
     * @inheritdoc
     */
    public function getLastToken(): Token
    {
        $previous = $this->lastToken;

        return $previous;
    }
195
196
    /**
     * Reads the token that follows the given token.
     *
     * Starts at the end offset of the previous token, skips ignored
     * characters, and then dispatches on the first character to the matching
     * lex* method. The order of the checks below is significant.
     *
     * @param Token $prev The token preceding the one to read.
     * @return Token
     * @throws SyntaxErrorException When no token can start with the character found.
     */
    protected function readToken(Token $prev): Token
    {
        $this->pos = $prev->getEnd();

        $this->skipWhitespace();

        // Capture the location only after whitespace was skipped, so that it
        // points at the first character of the token.
        $line = $this->line;
        $col  = $this->pos - $this->lineStart + 1;

        if ($this->pos >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $col, $prev);
        }

        $code = $this->readCharCode($this->pos);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        if (isset(self::$codeTokenKindMap[$code])) {
            return $this->lexPunctuation($code, $line, $col, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $col, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $col, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        // (a leading digit never gets here, it was handled as a number above)
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $col, $prev);
        }

        $hasThreeOrMore = $this->bodyLength >= 3;

        // Spread: ...
        if ($hasThreeOrMore && $this->isSpread($code)) {
            return $this->lexSpread($line, $col, $prev);
        }

        // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
        if ($this->isString($code)) {
            return $this->lexString($line, $col, $prev);
        }

        // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
        if ($hasThreeOrMore && $this->isTripleQuote($code)) {
            return $this->lexBlockString($line, $col, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
257
258
    /**
     * Creates the Start Of File (SOF) token the lexer is initially positioned on.
     *
     * @return Token
     */
    protected function createStartOfFileToken(): Token
    {
        $startOfFile = new Token(TokenKindEnum::SOF);

        return $startOfFile;
    }
265
266
    /**
     * Creates an End Of File (EOF) token.
     *
     * The token is empty: it both starts and ends at the length of the body.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding the end of file.
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $col, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $col, $prev);
    }
278
279
    /**
     * Reads a single-character punctuation token at the current position.
     *
     * @param int   $code The code point of the punctuation character.
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     * @throws SyntaxErrorException When the code point is not a known punctuator.
     */
    protected function lexPunctuation(int $code, int $line, int $col, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $start = $this->pos;

        return new Token($kind, $start, $start + 1, $line, $col, $prev);
    }
297
298
    /**
     * Reads a name token starting at the current position.
     *
     * The first character is accepted unconditionally (the caller has already
     * checked it); following characters are consumed for as long as
     * isAlphaNumeric() accepts them or until the end of the body is reached.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     */
    protected function lexName(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        // Step over the first character of the name.
        ++$this->pos;

        while (true) {
            if ($this->pos === $this->bodyLength) {
                break;
            }

            $code = $this->readCharCode($this->pos);

            if (null === $code || !isAlphaNumeric($code)) {
                break;
            }

            ++$this->pos;
        }

        $end  = $this->pos;
        $name = sliceString($this->body, $start, $end);

        return new Token(TokenKindEnum::NAME, $start, $end, $line, $col, $prev, $name);
    }
322
323
    /**
     * Reads a number (int or float) token starting at the current position.
     *
     * Accepted shape: an optional minus sign, then either a single 0 or a run
     * of digits, then an optional fraction (". digits") and an optional
     * exponent ("e" or "E", optional sign, digits). The token is a FLOAT when
     * a fraction or exponent is present and an INT otherwise.
     *
     * @param int   $code The code point at the current position.
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     * @throws SyntaxErrorException When a digit follows a leading 0 or a required digit is missing.
     */
    protected function lexNumber(int $code, int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;
        $kind  = TokenKindEnum::INT;

        // Optional sign: -
        if (45 === $code) {
            $code = $this->readCharCode(++$this->pos);
        }

        if (48 !== $code) {
            // Integer part that does not start with 0: one or more digits.
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        } else {
            // A leading 0 must stand alone.
            $code = $this->readCharCode(++$this->pos);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fraction: .
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $code = $this->readCharCode(++$this->pos);
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        }

        // Exponent: E or e
        if (\in_array($code, [69, 101], true)) {
            $kind = TokenKindEnum::FLOAT;
            $code = $this->readCharCode(++$this->pos);

            // Optional exponent sign: + or -
            if (\in_array($code, [43, 45], true)) {
                $code = $this->readCharCode(++$this->pos);
            }

            $this->skipDigits($code);
        }

        $end = $this->pos;

        return new Token($kind, $start, $end, $line, $col, $prev, sliceString($this->body, $start, $end));
    }
389
390
    /**
     * Advances the current position past a run of one or more digits.
     *
     * @param int $code The code point at the current position; it must be a digit.
     * @throws SyntaxErrorException When the given code point is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        // At least one digit is present; keep going until a non-digit is read.
        do {
            $code = $this->readCharCode(++$this->pos);
        } while (isNumber($code));
    }
410
411
    /**
     * Reads a comment token starting at the "#" at the current position.
     *
     * The comment runs until the first character whose code point is 0x1F or
     * lower, with the exception of a tab (0x09), which is part of the comment.
     * The token value is the comment text without the leading "#".
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     */
    protected function lexComment(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        while (true) {
            $code = $this->readCharCode(++$this->pos);

            // Stop on anything that is not: SourceCharacter but not LineTerminator
            if (null === $code || ($code <= 0x001f && 0x0009 !== $code)) {
                break;
            }
        }

        $end  = $this->pos;
        $text = sliceString($this->body, $start + 1, $end);

        return new Token(TokenKindEnum::COMMENT, $start, $end, $line, $col, $prev, $text);
    }
437
438
    /**
     * Reads a spread ("...") token, which covers three characters from the current position.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     */
    protected function lexSpread(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;
        $end   = $start + 3;

        return new Token(TokenKindEnum::SPREAD, $start, $end, $line, $col, $prev);
    }
450
451
    /**
     * Reads a (single-line) string token starting at the quote at the current position.
     *
     * Text between escape sequences is copied in chunks; each escape sequence
     * flushes the pending chunk, appends the replacement and starts a new chunk.
     * Reading stops with an error at a line terminator or at the end of the body.
     *
     * NOTE(review): the replacements for \b, \f, \n, \r and \t are single-quoted
     * PHP literals, so the token value receives a backslash followed by the letter
     * rather than the control character, and \uXXXX is kept verbatim as well.
     * Confirm this is what consumers of the token value expect.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     * @throws SyntaxErrorException On an invalid character, an invalid escape sequence
     *                              or a missing closing quote.
     */
    protected function lexString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $chunkStart = ++$this->pos; // skip the opening quote
        $value      = '';

        while ($this->pos < $this->bodyLength &&
            ($code = $this->readCharCode($this->pos)) !== null && !isLineTerminator($code)) {
            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->pos);

                return new Token(TokenKindEnum::STRING, $start, $this->pos + 1, $line, $col, $prev, $value);
            }

            // NOTE(review): a true result is treated as an *invalid* character here,
            // despite the helper's name — confirm the semantics of isSourceCharacter().
            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->pos;

            if (92 === $code) {
                // \ : flush the text before the backslash, then handle the escape.
                $value .= sliceString($this->body, $chunkStart, $this->pos - 1);

                $code = $this->readCharCode($this->pos);

                switch ($code) {
                    case 34: // "
                        $value .= '"';
                        break;
                    case 47: // /
                        $value .= '/';
                        break;
                    case 92: // \
                        $value .= '\\';
                        break;
                    case 98: // b
                        $value .= '\b';
                        break;
                    case 102: // f
                        $value .= '\f';
                        break;
                    case 110: // n
                        $value .= '\n';
                        break;
                    case 114: // r
                        $value .= '\r';
                        break;
                    case 116: // t
                        $value .= '\t';
                        break;
                    case 117: // u
                        $unicodeString = sliceString($this->body, $this->pos + 1, $this->pos + 5);

                        if (!\preg_match('/[0-9A-Fa-f]{4}/', $unicodeString)) {
                            throw $this->createSyntaxErrorException(
                                \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
                            );
                        }

                        $value     .= '\\u' . $unicodeString;
                        $this->pos += 4; // skip the four hex digits

                        break;
                    default:
                        // mb_chr() instead of chr(): chr() works on single bytes and would
                        // turn any code point above 255 into a wrong, invalid UTF-8 byte.
                        throw $this->createSyntaxErrorException(
                            \sprintf(
                                'Invalid character escape sequence: \\%s.',
                                \mb_chr($code, self::ENCODING)
                            )
                        );
                }

                // Step over the escape character itself and start a new chunk after it.
                ++$this->pos;

                $chunkStart = $this->pos;
            }
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
540
541
    /**
     * Reads a block string token starting at the triple-quote at the current position.
     *
     * The raw text up to the closing triple-quote is collected, with every
     * escaped triple-quote (\""") replaced by a plain triple-quote, and is then
     * passed through blockStringValue() to produce the token value.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding this one.
     * @return Token
     * @throws SyntaxErrorException On an invalid character or a missing closing triple-quote.
     */
    protected function lexBlockString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $this->pos  = $start + 3; // skip the opening triple-quote
        $chunkStart = $this->pos;
        $raw        = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            if (null === $code) {
                break;
            }

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $raw .= sliceString($this->body, $chunkStart, $this->pos);
                $end  = $this->pos + 3;

                return new Token(
                    TokenKindEnum::BLOCK_STRING,
                    $start,
                    $end,
                    $line,
                    $col,
                    $prev,
                    blockStringValue($raw)
                );
            }

            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (!$this->isEscapedTripleQuote($code)) {
                ++$this->pos;
                continue;
            }

            // Escaped Triple-Quote (\"""): flush the pending chunk and keep the quotes only.
            $raw        .= sliceString($this->body, $chunkStart, $this->pos) . '"""';
            $this->pos  += 4;
            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
589
590
    /**
     * Advances the current position past ignored characters.
     *
     * Tabs, spaces, commas and the byte order mark are skipped. Line feeds and
     * carriage returns (a CR LF pair counts as one line break) additionally
     * bump the line counter and move the line start to the new position.
     */
    protected function skipWhitespace(): void
    {
        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            switch ($code) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->pos;
                    break;

                case 10: // new line (\n)
                    ++$this->pos;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                case 13: // carriage return (\r), optionally followed by a new line (\r\n)
                    $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                default:
                    // First character that is not ignored: done.
                    return;
            }
        }
    }
621
622
    /**
     * Returns the code point of the character at the given position in the body.
     *
     * Results are memoized per character in the static cache. A position past
     * the end of the body yields 0, which matches no character class used by
     * the lexer.
     *
     * @param int $pos The character offset to read.
     * @return int The code point, or 0 when there is no (decodable) character at $pos.
     */
    protected function readCharCode(int $pos): int
    {
        $char = \mb_substr($this->body, $pos, 1, self::ENCODING);

        // Past the end of the body there is nothing to decode. Handing the empty
        // string to mb_ord() raises a warning on PHP 7 and a ValueError on PHP 8.
        if (false === $char || '' === $char) {
            return 0;
        }

        if (!isset(self::$charCodeCache[$char])) {
            $ord = \mb_ord($char, self::ENCODING);

            // mb_ord() reports failure with false; normalise it to the same 0 sentinel.
            self::$charCodeCache[$char] = false === $ord ? 0 : $ord;
        }

        return self::$charCodeCache[$char];
    }
636
637
    /**
     * Builds (but does not throw) a `SyntaxErrorException` for the current position.
     *
     * When no description is given, one is derived from the character at the
     * current position via unexpectedCharacterMessage().
     *
     * @param null|string $description The error description, or null to derive one.
     * @return SyntaxErrorException
     */
    protected function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $code        = $this->readCharCode($this->pos);
            $description = $this->unexpectedCharacterMessage($code);
        }

        return new SyntaxErrorException($this->source, $this->pos, $description);
    }
651
652
    /**
     * Builds the message reported when a character cannot start any token.
     *
     * The checks are ordered: the isSourceCharacter()/isLineTerminator() test
     * comes first, then the dedicated hint for a single quote, then the
     * generic message.
     *
     * @param int $code The code point of the offending character.
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        if (isSourceCharacter($code) && !isLineTerminator($code)) {
            $template = 'Cannot contain the invalid character %s.';
        } elseif (39 === $code) {
            // '
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        } else {
            $template = 'Cannot parse the unexpected character %s.';
        }

        return \sprintf($template, printCharCode($code));
    }
671
672
    /**
     * Tells whether a spread ("...") starts at the current position.
     *
     * @param int $code The code point at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        if (46 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 46 === $this->readCharCode($this->pos + 2); // ...
    }
682
683
    /**
     * Tells whether a (single-line) string starts at the current position.
     *
     * A double quote starts a string unless it is the first of three
     * consecutive quotes, which open a block string instead. Two quotes that
     * are not followed by a third are the empty string "", a valid string.
     *
     * @param int $code The code point at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        // Fewer than three characters remain, so this cannot open a block string.
        if ($this->pos + 2 >= $this->bodyLength) {
            return true;
        }

        return !$this->isTripleQuote($code);
    }
691
692
    /**
     * Tells whether a triple-quote (""") starts at the current position.
     *
     * @param int $code The code point at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        if (34 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 34 === $this->readCharCode($this->pos + 2); // """
    }
702
703
    /**
     * Tells whether an escaped triple-quote (\""") starts at the current position.
     *
     * @param int $code The code point at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        // Must start with a backslash ...
        if (92 !== $code) {
            return false;
        }

        // ... followed by three double quotes: \"""
        for ($offset = 1; $offset <= 3; ++$offset) {
            if (34 !== $this->readCharCode($this->pos + $offset)) {
                return false;
            }
        }

        return true;
    }
714
}
715