Passed
Pull Request — master (#196)
by Christoffer
02:50
created

Lexer::readToken()   C

Complexity

Conditions 24
Paths 9

Size

Total Lines 53
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 53
rs 6.1349
c 0
b 0
f 0
cc 24
eloc 23
nc 9
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
use Digia\GraphQL\Error\SyntaxErrorException;
6
7
class Lexer implements LexerInterface
8
{
9
    protected const ENCODING = 'UTF-8';
10
11
    /**
12
     * A map between punctuation character code and the corresponding token kind.
13
     *
14
     * @var array
15
     */
16
    protected static $codeTokenKindMap = [
17
        33  => TokenKindEnum::BANG,
18
        36  => TokenKindEnum::DOLLAR,
19
        38  => TokenKindEnum::AMP,
20
        40  => TokenKindEnum::PAREN_L,
21
        41  => TokenKindEnum::PAREN_R,
22
        58  => TokenKindEnum::COLON,
23
        61  => TokenKindEnum::EQUALS,
24
        64  => TokenKindEnum::AT,
25
        91  => TokenKindEnum::BRACKET_L,
26
        93  => TokenKindEnum::BRACKET_R,
27
        123 => TokenKindEnum::BRACE_L,
28
        124 => TokenKindEnum::PIPE,
29
        125 => TokenKindEnum::BRACE_R,
30
    ];
31
32
    /**
33
     * The source file for this lexer.
34
     *
35
     * @var Source
36
     */
37
    protected $source;
38
39
    /**
40
     * The contents of the source file.
41
     *
42
     * @var string
43
     */
44
    protected $body;
45
46
    /**
47
     * The total number of characters in the source file.
48
     *
49
     * @var int
50
     */
51
    protected $bodyLength;
52
53
    /**
54
     * The options for this lexer.
55
     *
56
     * @var array
57
     */
58
    protected $options = [];
59
60
    /**
61
     * The previously focused non-ignored token.
62
     *
63
     * @var Token
64
     */
65
    protected $lastToken;
66
67
    /**
68
     * The currently focused non-ignored token.
69
     *
70
     * @var Token
71
     */
72
    protected $token;
73
74
    /**
75
     * The current position.
76
     *
77
     * @var int
78
     */
79
    protected $pos;
80
81
    /**
82
     * The (1-indexed) line containing the current token.
83
     *
84
     * @var int
85
     */
86
    protected $line;
87
88
    /**
89
     * The character offset at which the current line begins.
90
     *
91
     * @var int
92
     */
93
    protected $lineStart;
94
95
    /**
96
     * Cache of character codes keyed by character, filled lazily by readCharCode().
     *
     * The property is static, so its entries are shared by every Lexer instance.
     *
     * @var array
97
     */
98
    protected static $charCodeCache = [];
99
100
    /**
     * Lexer constructor.
     *
     * Positions used by this lexer are character offsets (readCharCode() indexes
     * the body with mb_substr()), so the body length is measured in characters,
     * not bytes. Measuring it in bytes would keep the end-of-file check in
     * readToken() from ever matching when the body contains multibyte characters.
     *
     * @param Source $source
     * @param array  $options
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->lastToken  = $startOfFileToken;
        $this->token      = $startOfFileToken;
        // Start with a defined position so that createSyntaxErrorException()
        // can be called before the first token has been read.
        $this->pos        = 0;
        $this->line       = 1;
        $this->lineStart  = 0;
        $this->body       = $source->getBody();
        $this->bodyLength = \mb_strlen($this->body, self::ENCODING);
        $this->source     = $source;
        $this->options    = $options;
    }
118
119
    /**
     * Moves the lexer forward by one non-comment token.
     *
     * The token that was current becomes the last token, and the result of
     * lookahead() becomes the current token.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
128
129
    /**
     * Returns the next non-comment token without changing the current token.
     *
     * Every token read on the way (comments included) is linked to its
     * predecessor through the predecessor's `next` property. When the current
     * token is already EOF it is returned as is.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        if (TokenKindEnum::EOF === $current->kind) {
            return $current;
        }

        do {
            $following     = $this->readToken($current);
            $current->next = $following;
            $current       = $following;
        } while (TokenKindEnum::COMMENT === $current->kind);

        return $current;
    }
146
147
    /**
     * Looks up a lexer option by name.
     *
     * The default is returned when the option is missing or set to null.
     *
     * @inheritdoc
     */
    public function getOption(string $name, $default = null)
    {
        return isset($this->options[$name]) ? $this->options[$name] : $default;
    }
154
155
    /**
156
     * Returns the source that was passed to the constructor.
     *
     * @inheritdoc
157
     */
158
    public function getSource(): Source
159
    {
160
        return $this->source;
161
    }
162
163
    /**
164
     * Returns the currently focused non-ignored token.
     *
     * @inheritdoc
165
     */
166
    public function getToken(): Token
167
    {
168
        return $this->token;
169
    }
170
171
    /**
172
     * Returns the previously focused non-ignored token.
     *
     * @inheritdoc
173
     */
174
    public function getLastToken(): Token
175
    {
176
        return $this->lastToken;
177
    }
178
179
    /**
     * Builds (but does not throw) a syntax error located at the current position.
     *
     * Without an explicit description, the message is derived from the character
     * found at the current position.
     *
     * @inheritdoc
     */
    public function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $description = $this->unexpectedCharacterMessage($this->readCharCode($this->pos));
        }

        return new SyntaxErrorException($this->source, $this->pos, $description);
    }
190
191
    /**
     * Reads the token after the given token.
     *
     * Skips ignored characters, then dispatches on the first significant
     * character to the matching lex* method. An EOF token is returned once the
     * position reaches the end of the body.
     *
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the character cannot start any token.
     */
    protected function readToken(Token $prev): Token
    {
        $this->pos = $prev->end;

        $this->skipWhitespace();

        $line = $this->line;
        $col  = (1 + $this->pos) - $this->lineStart;

        if ($this->pos >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $col, $prev);
        }

        $code = $this->readCharCode($this->pos);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        // The map holds exactly the punctuation character codes, so a key
        // lookup replaces the long chain of comparisons.
        if (isset(self::$codeTokenKindMap[$code])) {
            return $this->lexPunctuation($code, $line, $col, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $col, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $col, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        // Digits never reach this branch because they are consumed above.
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $col, $prev);
        }

        // Spread: ...
        // readCharCode() yields 0 past the end of the body, so no explicit
        // length guard is needed for the two-character lookahead.
        if ($this->isSpread($code)) {
            return $this->lexSpread($line, $col, $prev);
        }

        if (34 === $code) {
            // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
            if ($this->isTripleQuote($code)) {
                return $this->lexBlockString($line, $col, $prev);
            }

            // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
            // Any other quote starts a regular string, including the empty
            // string "" whose second character is also a quote.
            return $this->lexString($line, $col, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
252
253
    /**
254
     * Creates the SOF token that the constructor uses as both the initial
     * current token and the initial last token.
     *
     * @return Token
255
     */
256
    protected function createStartOfFileToken(): Token
257
    {
258
        return new Token(TokenKindEnum::SOF);
259
    }
260
261
    /**
     * Creates an End Of File (EOF) token.
     *
     * The token starts and ends at the body length, i.e. it is zero-width.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $col, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $col, $prev);
    }
273
274
    /**
     * Reads a single-character punctuation token at the current position.
     *
     * @param int   $code Character code to look up in the punctuation map.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token|null Declared nullable, although every path either returns a token or throws.
     * @throws SyntaxErrorException When the code is not a known punctuation character.
     */
    protected function lexPunctuation(int $code, int $line, int $col, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $start = $this->pos;

        return new Token($kind, $start, $start + 1, $line, $col, $prev);
    }
292
293
    /**
     * Reads a name token from the source file.
     *
     * The character at the current position is accepted unconditionally;
     * following characters are consumed for as long as they are alphanumeric.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexName(int $line, int $col, Token $prev): Token
    {
        // Remember where the name begins and step over its first character.
        $start = $this->pos++;

        while (true) {
            if ($this->pos === $this->bodyLength) {
                break;
            }

            $code = $this->readCharCode($this->pos);

            // readCharCode() is declared to return int, so the null comparison
            // is purely defensive.
            if (null === $code || !isAlphaNumeric($code)) {
                break;
            }

            ++$this->pos;
        }

        $end = $this->pos;

        return new Token(
            TokenKindEnum::NAME,
            $start,
            $end,
            $line,
            $col,
            $prev,
            sliceString($this->body, $start, $end)
        );
    }
317
318
    /**
     * Reads a number (int or float) token from the source file.
     *
     * Grammar handled here: an optional minus sign, an integer part (a lone 0,
     * or one or more digits), an optional fraction and an optional exponent.
     * A fraction or an exponent turns the token into a float.
     *
     * @param int   $code Character code at the current position.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When a digit is missing or a digit follows a leading 0.
     */
    protected function lexNumber(int $code, int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;
        $kind  = TokenKindEnum::INT;

        // Optional sign (-)
        if (45 === $code) {
            $code = $this->readCharCode(++$this->pos);
        }

        if (48 !== $code) {
            // Integer part made of one or more digits.
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        } else {
            // A leading 0 must not be followed by another digit.
            $code = $this->readCharCode(++$this->pos);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fraction (.)
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $this->skipDigits($this->readCharCode(++$this->pos));
            $code = $this->readCharCode($this->pos);
        }

        // Exponent (e or E), optionally signed (+ or -)
        if (69 === $code || 101 === $code) {
            $kind = TokenKindEnum::FLOAT;
            $code = $this->readCharCode(++$this->pos);

            if (43 === $code || 45 === $code) {
                $code = $this->readCharCode(++$this->pos);
            }

            $this->skipDigits($code);
        }

        $end = $this->pos;

        return new Token($kind, $start, $end, $line, $col, $prev, sliceString($this->body, $start, $end));
    }
385
386
    /**
     * Advances the position past a run of one or more digits.
     *
     * @param int $code Character code at the current position; it must be a digit.
     * @throws SyntaxErrorException When the given character is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        // The first character is known to be a digit, so step before testing.
        do {
            $code = $this->readCharCode(++$this->pos);
        } while (isNumber($code));
    }
406
407
    /**
     * Reads a comment token from the source file.
     *
     * The comment runs from the current position up to, but not including, the
     * first character code that is at most 0x001f and is not a tab (0x0009).
     * The token value omits the first character of the comment.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexComment(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        while (true) {
            $code = $this->readCharCode(++$this->pos);

            if (null === $code) {
                break;
            }

            // Stop on anything that is not a SourceCharacter or is a LineTerminator.
            if ($code <= 0x001f && 0x0009 !== $code) {
                break;
            }
        }

        $end = $this->pos;

        return new Token(
            TokenKindEnum::COMMENT,
            $start,
            $end,
            $line,
            $col,
            $prev,
            sliceString($this->body, $start + 1, $end)
        );
    }
433
434
    /**
     * Reads a spread token from the source.
     *
     * The token covers three characters starting at the current position.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexSpread(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        return new Token(TokenKindEnum::SPREAD, $start, $start + 3, $line, $col, $prev);
    }
446
447
    /**
     * Reads a string token from the source.
     *
     * Scanning starts on the opening quote at the current position and ends at
     * the closing quote. Reaching a line terminator or the end of the body first
     * is reported as an unterminated string.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character, an invalid escape
     *                              sequence or an unterminated string.
     */
    protected function lexString(int $line, int $col, Token $prev): Token
    {
        // Replacement text for single-character escapes, keyed by the character
        // code that follows the backslash.
        // NOTE(review): the single-quoted PHP literals for b, f, n, r and t are
        // the two-character text (backslash + letter), not the control
        // character; confirm that consumers of the token value expect this.
        $simpleEscapes = [
            34  => '"',
            47  => '/',
            92  => '\\',
            98  => '\b',
            102 => '\f',
            110 => '\n',
            114 => '\r',
            116 => '\t',
        ];

        $start      = $this->pos;
        $chunkStart = ++$this->pos; // skip the opening quote
        $value      = '';

        while ($this->pos < $this->bodyLength &&
            ($code = $this->readCharCode($this->pos)) !== null && !isLineTerminator($code)) {
            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->pos);

                return new Token(TokenKindEnum::STRING, $start, $this->pos + 1, $line, $col, $prev, $value);
            }

            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->pos;

            if (92 !== $code) {
                continue;
            }

            // Backslash: flush the literal chunk before it, then decode the escape.
            $value .= sliceString($this->body, $chunkStart, $this->pos - 1);

            $code = $this->readCharCode($this->pos);

            if (isset($simpleEscapes[$code])) {
                $value .= $simpleEscapes[$code];
            } elseif (117 === $code) {
                // u followed by four hexadecimal digits
                $unicodeString = sliceString($this->body, $this->pos + 1, $this->pos + 5);

                if (!\preg_match('/[0-9A-Fa-f]{4}/', $unicodeString)) {
                    throw $this->createSyntaxErrorException(
                        \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
                    );
                }

                $value     .= '\\u' . $unicodeString;
                $this->pos += 4;
            } else {
                // mb_chr() renders any code point correctly; chr() only works
                // modulo 256 and would emit a wrong or invalid character for
                // anything outside ASCII.
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character escape sequence: \\%s.', \mb_chr($code, self::ENCODING))
                );
            }

            ++$this->pos;

            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
536
537
    /**
     * Reads a block string token from the source file.
     *
     * Scanning starts just after the opening triple-quote. An escaped
     * triple-quote (\""") contributes a literal """ to the raw value. The token
     * value is the raw value passed through blockStringValue().
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character or an unterminated string.
     */
    protected function lexBlockString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $this->pos  = $start + 3; // skip the opening triple-quote
        $chunkStart = $this->pos;
        $rawValue   = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            if (null === $code) {
                break;
            }

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $rawValue .= sliceString($this->body, $chunkStart, $this->pos);

                return new Token(
                    TokenKindEnum::BLOCK_STRING,
                    $start,
                    $this->pos + 3,
                    $line,
                    $col,
                    $prev,
                    blockStringValue($rawValue)
                );
            }

            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (!$this->isEscapedTripleQuote($code)) {
                ++$this->pos;
                continue;
            }

            // Escaped Triple-Quote (\"""): flush the chunk and add the bare quotes.
            $rawValue   .= sliceString($this->body, $chunkStart, $this->pos) . '"""';
            $this->pos  += 4;
            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
585
586
    /**
     * Skips ignored characters at the current position.
     *
     * Tabs, spaces, commas and the BOM are stepped over. A line feed, a carriage
     * return, or a carriage return followed by a line feed also bumps the line
     * counter and records where the new line starts.
     */
    protected function skipWhitespace(): void
    {
        while ($this->pos < $this->bodyLength) {
            switch ($this->readCharCode($this->pos)) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->pos;
                    break;

                case 10: // new line (\n)
                    ++$this->pos;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                case 13: // carriage return (\r), possibly followed by a new line (\r\n)
                    $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                default:
                    // First significant character reached.
                    return;
            }
        }
    }
617
618
    /**
     * Returns the character code found at the given character offset of the body.
     *
     * Results are memoized per character in the static cache.
     *
     * @param int $pos Character offset into the body.
     * @return int The character code, or 0 when there is no character at that offset.
     */
    protected function readCharCode(int $pos): int
    {
        $char = \mb_substr($this->body, $pos, 1, self::ENCODING);

        if ('' === $char) {
            return 0;
        }

        if (isset(self::$charCodeCache[$char])) {
            return self::$charCodeCache[$char];
        }

        // ord() is enough for single-byte characters; anything with a first
        // byte of 128 or above is decoded with mb_ord().
        $code = \ord($char);

        if ($code >= 128) {
            $code = \mb_ord($char, self::ENCODING);
        }

        self::$charCodeCache[$char] = $code;

        return $code;
    }
642
643
    /**
     * Builds the message reported when an unexpected character is encountered.
     *
     * The checks run in a fixed order: the "invalid character" test first, then
     * the single-quote hint, then the generic message.
     *
     * @param int $code
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        $isInvalidCharacter = isSourceCharacter($code) && !isLineTerminator($code);

        if ($isInvalidCharacter) {
            return \sprintf('Cannot contain the invalid character %s.', printCharCode($code));
        }

        // Single quote (')
        if (39 === $code) {
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        }

        return \sprintf('Cannot parse the unexpected character %s.', printCharCode($code));
    }
662
663
    /**
     * Tells whether three consecutive dots (...) start at the current position.
     *
     * @param int $code Character code at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        if (46 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 46 === $this->readCharCode($this->pos + 2);
    }
673
674
    /**
     * Tells whether a regular (non-block) string starts at the current position.
     *
     * A double quote starts a regular string unless it opens a triple-quote.
     * Only excluding the triple-quote matters: rejecting every quote that is
     * followed by another quote would also reject the empty string "".
     *
     * @param int $code Character code at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        return 34 === $code && !$this->isTripleQuote($code);
    }
682
683
    /**
     * Tells whether three consecutive double quotes (""") start at the current position.
     *
     * @param int $code Character code at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        if (34 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 34 === $this->readCharCode($this->pos + 2);
    }
693
694
    /**
     * Tells whether a backslash followed by three double quotes (\""") starts
     * at the current position.
     *
     * @param int $code Character code at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        if (92 !== $code) {
            return false;
        }

        // The three characters after the backslash must all be double quotes.
        for ($offset = 1; $offset <= 3; ++$offset) {
            if (34 !== $this->readCharCode($this->pos + $offset)) {
                return false;
            }
        }

        return true;
    }
705
}
706