Lexer   F
last analyzed

Complexity

Total Complexity 111

Size/Duplication

Total Lines 708
Duplicated Lines 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
eloc 264
c 3
b 0
f 0
dl 0
loc 708
rs 2
wmc 111

26 Methods

Rating   Name   Duplication   Size   Complexity  
A createStartOfFileToken() 0 3 1
B lexNumber() 0 55 10
A skipDigits() 0 12 3
A readCharCode() 0 19 4
B skipWhitespace() 0 25 9
A lexPunctuation() 0 7 2
A lexName() 0 15 4
A isEscapedTripleQuote() 0 6 4
A isSpread() 0 5 3
A isTripleQuote() 0 5 3
A isString() 0 3 2
A lexSpread() 0 3 1
A createEndOfFileToken() 0 3 1
A createSyntaxErrorException() 0 6 1
A lexComment() 0 16 4
B lexBlockString() 0 40 6
A getSource() 0 3 1
A getToken() 0 3 1
A getLastToken() 0 3 1
A __construct() 0 12 1
A advance() 0 4 1
C lexString() 0 85 16
D readToken() 0 53 24
A unexpectedCharacterMessage() 0 12 4
A getOption() 0 3 1
A lookahead() 0 13 3

How to fix   Complexity   

Complex Class

Complex classes like Lexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Lexer, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
/**
6
 * A Lexer is a stateful stream generator in that every time
7
 * it is advanced, it returns the next token in the Source. Assuming the
8
 * source lexes, the final Token emitted by the lexer will be of kind
9
 * EOF, after which the lexer will repeatedly return the same EOF token
10
 * whenever called.
11
 */
12
class Lexer implements LexerInterface
13
{
14
    /**
     * Encoding handed to the multibyte string functions used by the lexer.
     */
    protected const ENCODING = 'UTF-8';

    /**
     * Lookup table from the character code of a single-character punctuator
     * to the token kind emitted for it.
     *
     * @var array
     */
    protected static $codeTokenKindMap = [
        33  => TokenKindEnum::BANG,      // !
        36  => TokenKindEnum::DOLLAR,    // $
        38  => TokenKindEnum::AMP,       // &
        40  => TokenKindEnum::PAREN_L,   // (
        41  => TokenKindEnum::PAREN_R,   // )
        58  => TokenKindEnum::COLON,     // :
        61  => TokenKindEnum::EQUALS,    // =
        64  => TokenKindEnum::AT,        // @
        91  => TokenKindEnum::BRACKET_L, // [
        93  => TokenKindEnum::BRACKET_R, // ]
        123 => TokenKindEnum::BRACE_L,   // {
        124 => TokenKindEnum::PIPE,      // |
        125 => TokenKindEnum::BRACE_R,   // }
    ];

    /**
     * The source being lexed.
     *
     * @var Source
     */
    protected $source;

    /**
     * The text of the source, as returned by Source::getBody().
     *
     * @var string
     */
    protected $body;

    /**
     * Length of the body, measured in characters (not bytes).
     *
     * @var int
     */
    protected $bodyLength;

    /**
     * Options supplied to the constructor; read through getOption().
     *
     * @var array
     */
    protected $options = [];

    /**
     * The token that was current before the most recent advance().
     *
     * @var Token
     */
    protected $lastToken;

    /**
     * The token the lexer is currently focused on.
     *
     * @var Token
     */
    protected $token;

    /**
     * Character offset of the read cursor within the body.
     *
     * @var int
     */
    protected $position;

    /**
     * The (1-indexed) line number the read cursor is on.
     *
     * @var int
     */
    protected $line;

    /**
     * Character offset at which the current line starts; used to derive columns.
     *
     * @var int
     */
    protected $lineStart;

    /**
     * Memoised character => character code pairs, shared by all lexer instances.
     *
     * @var array
     */
    protected static $charCodeCache = [];
106
107
    /**
     * Lexer constructor.
     *
     * Stores the source and options, caches the source body together with its
     * length in characters, and focuses the lexer on a start-of-file token.
     *
     * @param Source $source The source to lex.
     * @param array  $options Lexer options, later readable through getOption().
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->source  = $source;
        $this->options = $options;
        $this->body    = $source->getBody();

        // Measure with the same encoding readCharCode() uses to index the body;
        // otherwise the length depends on the process-wide mbstring default.
        $this->bodyLength = \mb_strlen($this->body, self::ENCODING);

        $this->lastToken = $startOfFileToken;
        $this->token     = $startOfFileToken;

        // Start the cursor at the beginning so createSyntaxErrorException()
        // is usable before the first token has been read.
        $this->position  = 0;
        $this->line      = 1;
        $this->lineStart = 0;
    }
125
126
    /**
     * {@inheritdoc}
     *
     * Remembers the current token as the last token, then moves the focus to
     * the token returned by lookahead().
     *
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
135
136
    /**
     * {@inheritdoc}
     *
     * Returns the next non-comment token after the current one without changing
     * the focused token. Once the current token is EOF it is returned as-is.
     *
     * NOTE(review): every call reads from the source again via readToken(), and
     * skipWhitespace() increments the line counter while doing so. Calling this
     * repeatedly for the same token may therefore count newlines more than once;
     * confirm whether Token exposes an already-linked next token to reuse.
     *
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        if (TokenKindEnum::EOF === $current->getKind()) {
            return $current;
        }

        // Comments are linked into the token chain but never handed to the caller.
        do {
            $following = $this->readToken($current);
            $current->setNext($following);
            $current = $following;
        } while (TokenKindEnum::COMMENT === $current->getKind());

        return $current;
    }
154
155
    /**
     * {@inheritdoc}
     *
     * Returns the option stored under the given name, or the default when the
     * option is missing or null.
     */
    public function getOption(string $name, $default = null)
    {
        if (isset($this->options[$name])) {
            return $this->options[$name];
        }

        return $default;
    }
162
163
    /**
     * {@inheritdoc}
     *
     * Returns the source this lexer was constructed with.
     */
    public function getSource(): Source
    {
        $source = $this->source;

        return $source;
    }
170
171
    /**
     * {@inheritdoc}
     *
     * Returns the token the lexer is currently focused on.
     */
    public function getToken(): Token
    {
        $current = $this->token;

        return $current;
    }
178
179
    /**
     * {@inheritdoc}
     *
     * Returns the token that was focused before the most recent advance().
     */
    public function getLastToken(): Token
    {
        $previous = $this->lastToken;

        return $previous;
    }
186
187
    /**
     * {@inheritdoc}
     *
     * Builds (but does not throw) a syntax error located at the current cursor
     * position. When no description is given, one is generated from the
     * character found at that position.
     *
     * @param string|null $description
     * @return SyntaxErrorException
     */
    public function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $offendingCode = $this->readCharCode($this->position);
            $description   = $this->unexpectedCharacterMessage($offendingCode);
        }

        return new SyntaxErrorException($this->source, $this->position, $description);
    }
198
199
    /**
     * Reads the token that follows the given token.
     *
     * Moves the cursor to the end of $prev, skips ignored characters, and then
     * dispatches to the matching lex*() method based on the first character
     * code found. An EOF token is produced when the cursor is at or past the
     * end of the body.
     *
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When no token can start with the character found.
     */
    protected function readToken(Token $prev): Token
    {
        $this->position = $prev->getEnd();
        $this->skipWhitespace();

        // Line and column of the token about to be read.
        $line   = $this->line;
        $column = (1 + $this->position) - $this->lineStart;

        if ($this->position >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $column, $prev);
        }

        $code = $this->readCharCode($this->position);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        $punctuationCodes = [33, 36, 38, 58, 61, 64, 124, 40, 41, 91, 93, 123, 125];

        if (\in_array($code, $punctuationCodes, true)) {
            return $this->lexPunctuation($code, $line, $column, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $column, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $column, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $column, $prev);
        }

        // Three-character tokens are only attempted when the body is long enough.
        $hasThreeCharacters = $this->bodyLength >= 3;

        // Spread: ...
        if ($hasThreeCharacters && $this->isSpread($code)) {
            return $this->lexSpread($line, $column, $prev);
        }

        // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
        if ($this->isString($code)) {
            return $this->lexString($line, $column, $prev);
        }

        // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
        if ($hasThreeCharacters && $this->isTripleQuote($code)) {
            return $this->lexBlockString($line, $column, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
260
261
    /**
     * Creates the Start Of File (SOF) token the lexer is initially focused on.
     *
     * @return Token
     */
    protected function createStartOfFileToken(): Token
    {
        $kind = TokenKindEnum::SOF;

        return new Token($kind);
    }
268
269
    /**
     * Creates an End Of File (EOF) token whose start and end both equal the
     * length of the body.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $column, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $column, $prev);
    }
281
282
    /**
     * Reads a single-character punctuation token at the current position.
     *
     * The token kind is looked up in the code-to-kind map.
     *
     * @param int   $code
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the character code has no entry in the map.
     */
    protected function lexPunctuation(int $code, int $line, int $column, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $start = $this->position;

        return new Token($kind, $start, $start + 1, $line, $column, $prev);
    }
300
301
    /**
     * Reads a name token starting at the current position.
     *
     * The first character is accepted unconditionally (the caller has already
     * checked it); following characters are consumed while isAlphaNumeric()
     * accepts them and the end of the body has not been reached.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     */
    protected function lexName(int $line, int $column, Token $prev): Token
    {
        $begin = $this->position;

        $this->position++;

        while ($this->position !== $this->bodyLength) {
            $code = $this->readCharCode($this->position);

            if (0 === $code || !isAlphaNumeric($code)) {
                break;
            }

            $this->position++;
        }

        $name = sliceString($this->body, $begin, $this->position);

        return new Token(TokenKindEnum::NAME, $begin, $this->position, $line, $column, $prev, $name);
    }
325
326
    /**
     * Reads a number (int or float) token starting at the current position.
     *
     * Int:   -?(0|[1-9][0-9]*)
     * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
     *
     * The token becomes a FLOAT as soon as a fractional part or an exponent is
     * seen; otherwise it is an INT.
     *
     * @param int   $code The character code at the current position.
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On a digit directly after a leading 0, or a missing digit.
     */
    protected function lexNumber(int $code, int $line, int $column, Token $prev): Token
    {
        $begin = $this->position;
        $kind  = TokenKindEnum::INT;

        // Optional sign (-).
        if (45 === $code) {
            $code = $this->readCharCode(++$this->position);
        }

        // Integer part: either a lone 0 or a run of digits.
        if (48 !== $code) {
            $this->skipDigits($code);
            $code = $this->readCharCode($this->position);
        } else {
            $code = $this->readCharCode(++$this->position);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fractional part (.) followed by at least one digit.
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $this->skipDigits($this->readCharCode(++$this->position));
            $code = $this->readCharCode($this->position);
        }

        // Exponent (e or E), optional sign (+ or -), at least one digit.
        if (69 === $code || 101 === $code) {
            $kind = TokenKindEnum::FLOAT;
            $code = $this->readCharCode(++$this->position);

            if (43 === $code || 45 === $code) {
                $code = $this->readCharCode(++$this->position);
            }

            $this->skipDigits($code);
        }

        $end    = $this->position;
        $lexeme = sliceString($this->body, $begin, $end);

        return new Token($kind, $begin, $end, $line, $column, $prev, $lexeme);
    }
393
394
    /**
     * Advances the cursor past a run of one or more digits.
     *
     * @param int $code The character code at the current position.
     * @throws SyntaxErrorException When the current character is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        do {
            $code = $this->readCharCode(++$this->position);
        } while (isNumber($code));
    }
414
415
    /**
     * Reads a comment token starting at the '#' under the cursor.
     *
     * The comment runs until the end of the body or the first character code
     * at or below 0x1F other than a tab. The token value excludes the '#'.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     */
    protected function lexComment(int $line, int $column, Token $prev): Token
    {
        $begin = $this->position;

        while (true) {
            $code = $this->readCharCode(++$this->position);

            // SourceCharacter but not LineTerminator
            $belongsToComment = 0 !== $code && ($code > 0x001f || 0x0009 === $code);

            if (!$belongsToComment) {
                break;
            }
        }

        $text = sliceString($this->body, $begin + 1, $this->position);

        return new Token(TokenKindEnum::COMMENT, $begin, $this->position, $line, $column, $prev, $text);
    }
441
442
    /**
     * Creates a spread (...) token covering the three characters that start
     * at the current position.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     */
    protected function lexSpread(int $line, int $column, Token $prev): Token
    {
        $begin = $this->position;

        return new Token(TokenKindEnum::SPREAD, $begin, $begin + 3, $line, $column, $prev);
    }
454
455
    /**
     * Reads a string token starting at the opening quote under the cursor.
     *
     * String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
     *
     * Escape sequences are decoded into the characters they denote, so the
     * token value is the semantic string value rather than the raw source
     * text: \n becomes a line feed, \u00E9 becomes "é", and so on.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character, an invalid escape sequence,
     *                              or when the string is not closed on the same line.
     */
    protected function lexString(int $line, int $column, Token $prev): Token
    {
        $start      = $this->position;
        $chunkStart = ++$this->position; // skip the opening quote
        $value      = '';

        while ($this->position < $this->bodyLength) {
            $code = $this->readCharCode($this->position);

            // A line terminator ends the scan; the string is then unterminated.
            if (isLineTerminator($code)) {
                break;
            }

            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->position);

                return new Token(TokenKindEnum::STRING, $start, $this->position + 1, $line, $column, $prev, $value);
            }

            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->position;

            if (92 !== $code) {
                continue;
            }

            // Backslash: flush the literal text before it, then decode the escape.
            $value .= sliceString($this->body, $chunkStart, $this->position - 1);

            $code = $this->readCharCode($this->position);

            // Double-quoted PHP literals are required for the control characters:
            // in single quotes '\n' is a backslash followed by the letter n.
            switch ($code) {
                case 34: // "
                    $value .= '"';
                    break;
                case 47: // /
                    $value .= '/';
                    break;
                case 92: // \
                    $value .= '\\';
                    break;
                case 98: // b (backspace; PHP has no "\b" escape)
                    $value .= "\x08";
                    break;
                case 102: // f
                    $value .= "\f";
                    break;
                case 110: // n
                    $value .= "\n";
                    break;
                case 114: // r
                    $value .= "\r";
                    break;
                case 116: // t
                    $value .= "\t";
                    break;
                case 117: // u
                    $value .= $this->readUnicodeEscape();
                    break;
                default:
                    // mb_chr() keeps characters above U+00FF intact in the message.
                    throw $this->createSyntaxErrorException(
                        \sprintf('Invalid character escape sequence: \\%s.', \mb_chr($code, self::ENCODING))
                    );
            }

            ++$this->position;

            $chunkStart = $this->position;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }

    /**
     * Decodes the \uXXXX escape whose 'u' is under the cursor and moves the
     * cursor onto the last character of the escape.
     *
     * A high surrogate directly followed by a \uXXXX low surrogate is combined
     * into the single code point the pair encodes.
     *
     * @return string The decoded character.
     * @throws SyntaxErrorException When the escape is malformed or is an unpaired surrogate.
     */
    protected function readUnicodeEscape(): string
    {
        $unicodeString = sliceString($this->body, $this->position + 1, $this->position + 5);

        if (!\preg_match('/\A[0-9A-Fa-f]{4}\z/', $unicodeString)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
            );
        }

        $codePoint = (int) \hexdec($unicodeString);
        $consumed  = 4;

        if ($codePoint >= 0xD800 && $codePoint <= 0xDBFF) {
            // High surrogate: look for a "\uDC00".."\uDFFF" escape right behind it.
            $trail = sliceString($this->body, $this->position + 5, $this->position + 11);

            if (\preg_match('/\A\\\\u(D[C-F][0-9A-F]{2})\z/i', $trail, $match)) {
                $low       = (int) \hexdec($match[1]);
                $codePoint = 0x10000 + (($codePoint - 0xD800) << 10) + ($low - 0xDC00);
                $consumed  = 10;
            }
        }

        // A surrogate half on its own cannot be represented in UTF-8.
        if ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
            );
        }

        $this->position += $consumed;

        return (string) \mb_chr($codePoint, self::ENCODING);
    }
550
551
    /**
     * Reads a block string token starting at the opening triple-quote under
     * the cursor.
     *
     * Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
     *
     * The raw text between the delimiters (with \""" replaced by """) is passed
     * through blockStringValue() to produce the token value. Line terminators
     * inside the block string update the lexer's line bookkeeping so that tokens
     * following a multi-line block string report the correct line and column.
     *
     * @param int   $line
     * @param int   $column
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character or a missing closing triple-quote.
     */
    protected function lexBlockString(int $line, int $column, Token $prev): Token
    {
        $start          = $this->position;
        $this->position = $start + 3; // skip the opening triple-quote
        $chunkStart     = $this->position;
        $rawValue       = '';

        while ($this->position < $this->bodyLength) {
            $code = $this->readCharCode($this->position);

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $rawValue .= sliceString($this->body, $chunkStart, $this->position);

                return new Token(
                    TokenKindEnum::BLOCK_STRING,
                    $start,
                    $this->position + 3,
                    $line,
                    $column,
                    $prev,
                    blockStringValue($rawValue)
                );
            }

            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (10 === $code) {
                // new line (\n)
                ++$this->position;
                ++$this->line;
                $this->lineStart = $this->position;
            } elseif (13 === $code) {
                // carriage return (\r), optionally followed by a new line (\r\n)
                $this->position += (10 === $this->readCharCode($this->position + 1)) ? 2 : 1;
                ++$this->line;
                $this->lineStart = $this->position;
            } elseif ($this->isEscapedTripleQuote($code)) {
                // \""" stands for a literal """ inside the block string.
                $rawValue       .= sliceString($this->body, $chunkStart, $this->position) . '"""';
                $this->position += 4;
                $chunkStart     = $this->position;
            } else {
                ++$this->position;
            }
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
601
602
    /**
     * Moves the cursor past ignored characters: tabs, spaces, commas, the BOM
     * and line terminators. Each line terminator (\n, \r or \r\n) increments
     * the line counter and records where the new line starts.
     */
    protected function skipWhitespace(): void
    {
        while ($this->position < $this->bodyLength) {
            switch ($this->readCharCode($this->position)) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->position;
                    break;

                case 10: // new line (\n)
                    ++$this->position;
                    ++$this->line;
                    $this->lineStart = $this->position;
                    break;

                case 13: // carriage return (\r), possibly followed by \n
                    $step = (10 === $this->readCharCode($this->position + 1)) ? 2 : 1;

                    $this->position += $step;
                    ++$this->line;
                    $this->lineStart = $this->position;
                    break;

                default:
                    // First character that is not ignored: stop here.
                    return;
            }
        }
    }
633
634
    /**
     * Returns the character code of the character at the given character
     * offset in the body, or 0 when there is no character at that offset.
     *
     * Results are memoised per character in a cache shared by all instances.
     *
     * @param int $position
     * @return int
     */
    protected function readCharCode(int $position): int
    {
        $char = \mb_substr($this->body, $position, 1, self::ENCODING);

        if ('' === $char) {
            return 0;
        }

        if (isset(self::$charCodeCache[$char])) {
            return self::$charCodeCache[$char];
        }

        $code = \ord($char);

        // A first byte of 128 or above is resolved with the multibyte lookup.
        if ($code >= 128) {
            $code = \mb_ord($char, self::ENCODING);
        }

        return self::$charCodeCache[$char] = $code;
    }
658
659
    /**
     * Builds the error message for an unexpected character.
     *
     * Three cases are distinguished, in this order: a character flagged by
     * isSourceCharacter() that is not a line terminator, a single quote, and
     * anything else.
     *
     * @param int $code
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        $isInvalid = isSourceCharacter($code) && !isLineTerminator($code);

        if ($isInvalid) {
            $template = 'Cannot contain the invalid character %s.';

            return \sprintf($template, printCharCode($code));
        }

        // '
        if (39 === $code) {
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        }

        $template = 'Cannot parse the unexpected character %s.';

        return \sprintf($template, printCharCode($code));
    }
678
679
    /**
     * Tells whether a spread (...) starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        // The two characters after the first dot must be dots as well.
        for ($offset = 1; $offset <= 2; $offset++) {
            if (46 !== $this->readCharCode($this->position + $offset)) {
                return false;
            }
        }

        return true;
    }
689
690
    /**
     * Tells whether a regular (single-line) string starts at the current
     * position.
     *
     * That is the case for any double quote that does not open a block string.
     * In particular the empty string literal "" counts as a string: only three
     * consecutive quotes are excluded.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        return 34 === $code && !$this->isTripleQuote($code);
    }
698
699
    /**
     * Tells whether a triple-quote (""") starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        if (34 !== $this->readCharCode($this->position + 1)) {
            return false;
        }

        return 34 === $this->readCharCode($this->position + 2);
    }
709
710
    /**
     * Tells whether an escaped triple-quote (\""") starts at the current
     * position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        // Must begin with a backslash...
        if (92 !== $code) {
            return false;
        }

        // ...followed by three double quotes.
        for ($offset = 1; $offset <= 3; $offset++) {
            if (34 !== $this->readCharCode($this->position + $offset)) {
                return false;
            }
        }

        return true;
    }
721
}
722