Passed
Pull Request — master (#196)
by Christoffer
02:58
created

Lexer::readToken()   C

Complexity

Conditions 24
Paths 9

Size

Total Lines 53
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 53
rs 6.1349
c 0
b 0
f 0
cc 24
eloc 23
nc 9
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
use Digia\GraphQL\Error\SyntaxErrorException;
6
7
class Lexer implements LexerInterface
8
{
9
    protected const ENCODING = 'UTF-8';
10
11
    /**
12
     * A map between punctuation character code and the corresponding token kind.
13
     *
14
     * @var array
15
     */
16
    protected static $codeTokenKindMap = [
17
        33  => TokenKindEnum::BANG,
18
        36  => TokenKindEnum::DOLLAR,
19
        38  => TokenKindEnum::AMP,
20
        40  => TokenKindEnum::PAREN_L,
21
        41  => TokenKindEnum::PAREN_R,
22
        58  => TokenKindEnum::COLON,
23
        61  => TokenKindEnum::EQUALS,
24
        64  => TokenKindEnum::AT,
25
        91  => TokenKindEnum::BRACKET_L,
26
        93  => TokenKindEnum::BRACKET_R,
27
        123 => TokenKindEnum::BRACE_L,
28
        124 => TokenKindEnum::PIPE,
29
        125 => TokenKindEnum::BRACE_R,
30
    ];
31
32
    /**
33
     * The source file for this lexer.
34
     *
35
     * @var Source
36
     */
37
    protected $source;
38
39
    /**
40
     * The contents of the source file.
41
     *
42
     * @var string
43
     */
44
    protected $body;
45
46
    /**
47
     * The total number of characters in the source file.
48
     *
49
     * @var int
50
     */
51
    protected $bodyLength;
52
53
    /**
54
     * The options for this lexer.
55
     *
56
     * @var array
57
     */
58
    protected $options = [];
59
60
    /**
61
     * The previously focused non-ignored token.
62
     *
63
     * @var Token
64
     */
65
    protected $lastToken;
66
67
    /**
68
     * The currently focused non-ignored token.
69
     *
70
     * @var Token
71
     */
72
    protected $token;
73
74
    /**
75
     * The current position.
76
     *
77
     * @var int
78
     */
79
    protected $pos;
80
81
    /**
82
     * The (1-indexed) line containing the current token.
83
     *
84
     * @var int
85
     */
86
    protected $line;
87
88
    /**
89
     * The character offset at which the current line begins.
90
     *
91
     * @var int
92
     */
93
    protected $lineStart;
94
95
    /**
96
     * @var array
97
     */
98
    protected static $charCodeCache = [];
99
100
    /**
     * Lexer constructor.
     *
     * Focuses the lexer on a Start Of File token at line 1 and caches the
     * source body together with its length.
     *
     * @param Source $source  The source to tokenize.
     * @param array  $options The options exposed through `getOption()`.
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->lastToken  = $startOfFileToken;
        $this->token      = $startOfFileToken;
        $this->line       = 1;
        $this->lineStart  = 0;
        $this->body       = $source->getBody();
        // Positions are character offsets (`readCharCode()` indexes the body with `mb_substr()`),
        // so the length must be counted in characters as well. A byte count from `strlen()`
        // overshoots for multi-byte input, which keeps the end-of-file check from firing
        // at the real end of the body.
        $this->bodyLength = \mb_strlen($this->body, self::ENCODING);
        $this->source     = $source;
        $this->options    = $options;
    }
118
119
    /**
     * Moves the lexer one token forward: the focused token becomes the last
     * token and the result of `lookahead()` becomes the focused token.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
128
129
    /**
     * Reads forward from the focused token until a token that is not a comment
     * is found, linking every token that was read to its predecessor. The
     * focused token itself is not replaced; an EOF token is returned as-is.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        // Nothing can follow the end of the file.
        if (TokenKindEnum::EOF === $current->getKind()) {
            return $current;
        }

        do {
            $following = $this->readToken($current);
            $current->setNext($following);
            $current = $following;
        } while (TokenKindEnum::COMMENT === $current->getKind());

        return $current;
    }
147
148
    /**
     * Returns the option stored under the given name, or the default when the
     * option is missing or null.
     *
     * @inheritdoc
     */
    public function getOption(string $name, $default = null)
    {
        if (isset($this->options[$name])) {
            return $this->options[$name];
        }

        return $default;
    }
155
156
    /**
     * Returns the kind of the currently focused token.
     *
     * @inheritdoc
     */
    public function getTokenKind(): string
    {
        $focused = $this->token;

        return $focused->getKind();
    }
163
164
    /**
     * Returns the value of the currently focused token.
     *
     * @inheritdoc
     */
    public function getTokenValue(): ?string
    {
        $focused = $this->token;

        return $focused->getValue();
    }
171
172
    /**
     * Returns the currently focused token.
     *
     * @inheritdoc
     */
    public function getToken(): Token
    {
        $focused = $this->token;

        return $focused;
    }
179
180
    /**
     * Returns the source this lexer was constructed with.
     *
     * @inheritdoc
     */
    public function getSource(): Source
    {
        $source = $this->source;

        return $source;
    }
187
188
    /**
     * Returns the token that was focused before the most recent `advance()`.
     *
     * @inheritdoc
     */
    public function getLastToken(): Token
    {
        $previous = $this->lastToken;

        return $previous;
    }
195
196
    /**
     * Reads the token after the given token.
     *
     * Moves the cursor to the end of `$prev`, skips whitespace and then
     * dispatches on the character code at the cursor to the matching
     * `lex*()` method. An EOF token is produced once the cursor reaches the
     * end of the body.
     *
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When no token can start with the character at the cursor.
     */
    protected function readToken(Token $prev): Token
    {
        $this->pos = $prev->getEnd();

        $this->skipWhitespace();

        $line = $this->line;
        $col  = (1 + $this->pos) - $this->lineStart;

        if ($this->pos >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $col, $prev);
        }

        $code = $this->readCharCode($this->pos);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        // The punctuation map is the single source of truth for which codes are punctuation,
        // so it is consulted directly instead of repeating its keys as literal comparisons.
        if (isset(self::$codeTokenKindMap[$code])) {
            return $this->lexPunctuation($code, $line, $col, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $col, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $col, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        // Digits never reach this branch because they are handled as numbers above.
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $col, $prev);
        }

        // Spread: ...
        if ($this->bodyLength >= 3 && $this->isSpread($code)) {
            return $this->lexSpread($line, $col, $prev);
        }

        // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
        if ($this->isString($code)) {
            return $this->lexString($line, $col, $prev);
        }

        // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
        if ($this->bodyLength >= 3 && $this->isTripleQuote($code)) {
            return $this->lexBlockString($line, $col, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
257
258
    /**
     * Creates a Start Of File (SOF) token.
     *
     * @return Token
     */
    protected function createStartOfFileToken(): Token
    {
        $startOfFile = new Token(TokenKindEnum::SOF);

        return $startOfFile;
    }
265
266
    /**
     * Creates an End Of File (EOF) token that starts and ends at the end of
     * the body.
     *
     * @param int   $line The line to record on the token.
     * @param int   $col  The column to record on the token.
     * @param Token $prev The token preceding the end of the file.
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $col, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $col, $prev);
    }
278
279
    /**
     * Reads a single-character punctuation token at the current position.
     *
     * The return type is declared nullable, but this implementation either
     * returns a token or throws.
     *
     * @param int   $code The character code at the current position.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the code has no entry in the punctuation map.
     */
    protected function lexPunctuation(int $code, int $line, int $col, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $start = $this->pos;

        return new Token($kind, $start, $start + 1, $line, $col, $prev);
    }
297
298
    /**
     * Reads a name token starting at the current position.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexName(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        // The first character is consumed unconditionally; the loop then takes
        // every following alphanumeric character.
        for (++$this->pos; $this->pos !== $this->bodyLength; ++$this->pos) {
            if (!isAlphaNumeric($this->readCharCode($this->pos))) {
                break;
            }
        }

        $end  = $this->pos;
        $name = sliceString($this->body, $start, $end);

        return new Token(TokenKindEnum::NAME, $start, $end, $line, $col, $prev, $name);
    }
322
323
    /**
     * Reads a number token from the source file. The token is a float when it
     * contains a fractional part or an exponent, and an int otherwise.
     *
     * @param int   $code The character code at the current position.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When a digit is missing or a digit follows a leading zero.
     */
    protected function lexNumber(int $code, int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;
        $kind  = TokenKindEnum::INT;

        // Optional sign: -
        if (45 === $code) {
            $code = $this->readCharCode(++$this->pos);
        }

        // Integer part: either a lone 0 or a run of digits.
        if (48 !== $code) {
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        } else {
            $code = $this->readCharCode(++$this->pos);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fractional part: .
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $this->skipDigits($this->readCharCode(++$this->pos));
            $code = $this->readCharCode($this->pos);
        }

        // Exponent: e or E, optionally followed by + or -
        if (69 === $code || 101 === $code) {
            $kind = TokenKindEnum::FLOAT;
            $code = $this->readCharCode(++$this->pos);

            if (43 === $code || 45 === $code) {
                $code = $this->readCharCode(++$this->pos);
            }

            $this->skipDigits($code);
        }

        $end = $this->pos;

        return new Token($kind, $start, $end, $line, $col, $prev, sliceString($this->body, $start, $end));
    }
390
391
    /**
     * Advances the current position past a run of one or more digits.
     *
     * @param int $code The character code at the current position.
     * @throws SyntaxErrorException When the given code is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        // At least one digit is required.
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        do {
            $code = $this->readCharCode(++$this->pos);
        } while (isNumber($code));
    }
411
412
    /**
     * Reads a comment token from the source file. The token value is the
     * comment text without the character at the start position.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexComment(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        while (true) {
            $code = $this->readCharCode(++$this->pos);

            // SourceCharacter but not LineTerminator
            $belongsToComment = $code > 0x001f || 0x0009 === $code;

            if (!$belongsToComment) {
                break;
            }
        }

        $end  = $this->pos;
        $text = sliceString($this->body, $start + 1, $end);

        return new Token(TokenKindEnum::COMMENT, $start, $end, $line, $col, $prev, $text);
    }
438
439
    /**
     * Creates a spread token covering the three characters at the current
     * position.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexSpread(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        return new Token(TokenKindEnum::SPREAD, $start, $start + 3, $line, $col, $prev);
    }
451
452
    /**
     * Reads a string token from the source.
     *
     * The token value is the string content with escape sequences decoded:
     * `\n` becomes a line feed, `\u0041` becomes `A`, and so on. A `\uXXXX`
     * escape that denotes a surrogate half cannot be encoded on its own and is
     * kept verbatim.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the string is unterminated, contains an invalid
     *                              character or an invalid escape sequence.
     */
    protected function lexString(int $line, int $col, Token $prev): Token
    {
        // Single-character escapes mapped to the character they denote. Double-quoted PHP
        // literals are required here: in single quotes '\n' is a backslash followed by "n".
        $simpleEscapes = [
            34  => '"',
            47  => '/',
            92  => '\\',
            98  => "\x08", // b (backspace)
            102 => "\f",   // f (form feed)
            110 => "\n",   // n (line feed)
            114 => "\r",   // r (carriage return)
            116 => "\t",   // t (tab)
        ];

        $start      = $this->pos;
        $chunkStart = ++$this->pos; // skip the quote
        $value      = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            if (isLineTerminator($code)) {
                break;
            }

            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->pos);

                return new Token(TokenKindEnum::STRING, $start, $this->pos + 1, $line, $col, $prev, $value);
            }

            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->pos;

            if (92 !== $code) {
                continue;
            }

            // Backslash: flush the chunk read so far, then decode the escape sequence.
            $value .= sliceString($this->body, $chunkStart, $this->pos - 1);

            $code = $this->readCharCode($this->pos);

            if (isset($simpleEscapes[$code])) {
                $value .= $simpleEscapes[$code];
            } elseif (117 === $code) {
                // u followed by exactly four hexadecimal digits
                $unicodeString = sliceString($this->body, $this->pos + 1, $this->pos + 5);

                if (!\preg_match('/\A[0-9A-Fa-f]{4}\z/', $unicodeString)) {
                    throw $this->createSyntaxErrorException(
                        \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
                    );
                }

                $codePoint = (int)\hexdec($unicodeString);

                // Surrogate halves (U+D800..U+DFFF) have no encoding of their own.
                $value .= ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)
                    ? '\\u' . $unicodeString
                    : \mb_chr($codePoint, self::ENCODING);

                $this->pos += 4;
            } else {
                // `mb_chr()` keeps code points above 255 intact in the message, unlike `chr()`.
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character escape sequence: \\%s.', \mb_chr($code, self::ENCODING))
                );
            }

            ++$this->pos;

            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
541
542
    /**
     * Reads a block string token from the source file. The raw content between
     * the triple-quotes is passed through `blockStringValue()` to obtain the
     * token value; an escaped triple-quote contributes a plain triple-quote.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the block string is unterminated or contains an invalid character.
     */
    protected function lexBlockString(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        $this->pos += 3; // skip the triple-quote

        $chunkStart = $this->pos;
        $rawValue   = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $rawValue .= sliceString($this->body, $chunkStart, $this->pos);
                $value    = blockStringValue($rawValue);

                return new Token(TokenKindEnum::BLOCK_STRING, $start, $this->pos + 3, $line, $col, $prev, $value);
            }

            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (!$this->isEscapedTripleQuote($code)) {
                ++$this->pos;
                continue;
            }

            // Escaped Triple-Quote (\"""): flush the chunk and append an unescaped triple-quote.
            $rawValue   .= sliceString($this->body, $chunkStart, $this->pos) . '"""';
            $this->pos  += 4;
            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
590
591
    /**
     * Advances the current position past tabs, spaces, commas, byte order
     * marks and line breaks, updating the line counter and the line start
     * offset for every line break.
     */
    protected function skipWhitespace(): void
    {
        while ($this->pos < $this->bodyLength) {
            switch ($this->readCharCode($this->pos)) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->pos;
                    break;

                case 10: // new line (\n)
                    ++$this->pos;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                case 13: // carriage return (\r), optionally followed by a new line (\r\n)
                    $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                default:
                    // First character that is not whitespace: stop here.
                    return;
            }
        }
    }
622
623
    /**
     * Returns the code point of the character at the given character offset of
     * the body, or 0 when there is no character at that offset. Code points
     * are memoized per character in a static cache.
     *
     * @param int $pos
     * @return int
     */
    protected function readCharCode(int $pos): int
    {
        $char = \mb_substr($this->body, $pos, 1, self::ENCODING);

        if ('' === $char) {
            return 0;
        }

        if (isset(self::$charCodeCache[$char])) {
            return self::$charCodeCache[$char];
        }

        $code = \mb_ord($char, self::ENCODING);

        self::$charCodeCache[$char] = $code;

        return $code;
    }
641
642
    /**
     * Creates a `SyntaxErrorException` for the current position in the source.
     * Without a description, the message is derived from the character at the
     * current position.
     *
     * @param null|string $description
     * @return SyntaxErrorException
     */
    protected function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $description = $this->unexpectedCharacterMessage($this->readCharCode($this->pos));
        }

        return new SyntaxErrorException($this->source, $this->pos, $description);
    }
656
657
    /**
     * Builds the message reported when an unexpected character is encountered.
     *
     * @param int $code The code of the unexpected character.
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        $isInvalidCharacter = isSourceCharacter($code) && !isLineTerminator($code);

        if (!$isInvalidCharacter && 39 === $code) {
            // '
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        }

        $template = $isInvalidCharacter
            ? 'Cannot contain the invalid character %s.'
            : 'Cannot parse the unexpected character %s.';

        return \sprintf($template, printCharCode($code));
    }
676
677
    /**
     * Tells whether the given code and the two characters after the current
     * position are all dots (...).
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        $next = $this->pos + 1;

        return 46 === $this->readCharCode($next) && 46 === $this->readCharCode($next + 1);
    }
687
688
    /**
     * Tells whether a regular (single-quoted) string starts at the current
     * position: the given code is a double quote that does not open a
     * triple-quote.
     *
     * Checking only that the next character is not a quote would reject the
     * empty string (""), whose second character is its closing quote.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        return 34 === $code && !$this->isTripleQuote($code);
    }
696
697
    /**
     * Tells whether the given code and the two characters after the current
     * position are all double quotes (""").
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        $next = $this->pos + 1;

        return 34 === $this->readCharCode($next) && 34 === $this->readCharCode($next + 1);
    }
707
708
    /**
     * Tells whether the given code is a backslash and the three characters
     * after the current position are all double quotes (\""").
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        if (92 !== $code) {
            return false;
        }

        for ($offset = 1; $offset <= 3; ++$offset) {
            if (34 !== $this->readCharCode($this->pos + $offset)) {
                return false;
            }
        }

        return true;
    }
719
}
720