Passed
Pull Request — master (#196)
by Christoffer
02:23
created

Lexer::positionAfterWhitespace()   D

Complexity

Conditions 9
Paths 4

Size

Total Lines 30
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 30
rs 4.909
c 0
b 0
f 0
cc 9
eloc 18
nc 4
nop 2

1 Method

Rating   Name   Duplication   Size   Complexity  
A Lexer::createStartOfFileToken() 0 3 1
1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
use Digia\GraphQL\Error\SyntaxErrorException;
6
7
class Lexer implements LexerInterface
8
{
9
    /**
     * Character encoding passed to the mb_* functions used by this class.
     */
    protected const ENCODING = 'UTF-8';

    /**
     * A map between punctuation character code and the corresponding token kind.
     *
     * @var array
     */
    protected static $codeTokenKindMap = [
        33  => TokenKindEnum::BANG,      // !
        36  => TokenKindEnum::DOLLAR,    // $
        38  => TokenKindEnum::AMP,       // &
        40  => TokenKindEnum::PAREN_L,   // (
        41  => TokenKindEnum::PAREN_R,   // )
        58  => TokenKindEnum::COLON,     // :
        61  => TokenKindEnum::EQUALS,    // =
        64  => TokenKindEnum::AT,        // @
        91  => TokenKindEnum::BRACKET_L, // [
        93  => TokenKindEnum::BRACKET_R, // ]
        123 => TokenKindEnum::BRACE_L,   // {
        124 => TokenKindEnum::PIPE,      // |
        125 => TokenKindEnum::BRACE_R,   // }
    ];

    /**
     * The source file for this lexer.
     *
     * @var Source
     */
    protected $source;

    /**
     * The contents of the source file.
     *
     * @var string
     */
    protected $body;

    /**
     * The total number of characters in the source file.
     *
     * @var int
     */
    protected $bodyLength;

    /**
     * The options for this lexer.
     *
     * @var array
     */
    protected $options = [];

    /**
     * The previously focused non-ignored token.
     *
     * @var Token
     */
    protected $lastToken;

    /**
     * The currently focused non-ignored token.
     *
     * @var Token
     */
    protected $token;

    /**
     * The current position, as a character offset into the body.
     *
     * Defaults to 0 so that helpers taking an int position (e.g. when building a
     * syntax error) can be used before the first token has been read.
     *
     * @var int
     */
    protected $pos = 0;

    /**
     * The (1-indexed) line containing the current token.
     *
     * @var int
     */
    protected $line;

    /**
     * The character offset at which the current line begins.
     *
     * @var int
     */
    protected $lineStart;

    /**
     * Cache of character => character code lookups; static, so shared by all lexer instances.
     *
     * @var array
     */
    protected static $charCodeCache = [];
99
100
    /**
     * Lexer constructor.
     *
     * Puts the lexer at the start of the given source, focused on a start-of-file token.
     *
     * @param Source $source  The source to tokenize.
     * @param array  $options Lexer options, readable through getOption().
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->lastToken  = $startOfFileToken;
        $this->token      = $startOfFileToken;
        $this->line       = 1;
        $this->lineStart  = 0;
        $this->body       = $source->getBody();
        // Positions are character offsets (readCharCode() uses mb_substr), so the
        // length must be counted in characters; a byte count would push the
        // end-of-file check past the real end for multi-byte input.
        $this->bodyLength = \mb_strlen($this->body, self::ENCODING);
        $this->source     = $source;
        $this->options    = $options;
    }
118
119
    /**
     * Moves the lexer forward by one non-ignored token.
     *
     * The token that was current becomes the last token, and the result of
     * lookahead() becomes the current token.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
128
129
    /**
     * Returns the next non-comment token after the current one without changing
     * which token is current.
     *
     * Every token read along the way (comments included) is linked to its
     * predecessor through setNext(). Once the current token is EOF, that same
     * token is returned.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        if (TokenKindEnum::EOF === $current->getKind()) {
            return $current;
        }

        do {
            $following = $this->readToken($current);
            $current->setNext($following);
            $current = $following;
        } while (TokenKindEnum::COMMENT === $current->getKind());

        return $current;
    }
147
148
    /**
     * Returns the option stored under the given name, or the default when the
     * option is missing or null.
     *
     * @inheritdoc
     */
    public function getOption(string $name, $default = null)
    {
        return isset($this->options[$name]) ? $this->options[$name] : $default;
    }
155
156
    /**
     * Returns the source this lexer was constructed with.
     *
     * @inheritdoc
     */
    public function getSource(): Source
    {
        return $this->source;
    }
163
164
    /**
     * Returns the currently focused token.
     *
     * @inheritdoc
     */
    public function getToken(): Token
    {
        return $this->token;
    }
171
172
    /**
     * Returns the token that was focused before the most recent advance().
     *
     * @inheritdoc
     */
    public function getLastToken(): Token
    {
        return $this->lastToken;
    }
179
180
    /**
     * Builds (but does not throw) a syntax error located at the current position.
     *
     * When no description is given, one is derived from the character found at
     * the current position.
     *
     * @inheritdoc
     */
    public function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $description = $this->unexpectedCharacterMessage($this->readCharCode($this->pos));
        }

        return new SyntaxErrorException($this->source, $this->pos, $description);
    }
191
192
    /**
     * Reads the token after the given token.
     *
     * Moves the cursor to the end of $prev, skips ignored characters and then
     * dispatches on the first character found. Reaching the end of the body
     * yields an EOF token; a character that starts no known token raises a
     * syntax error.
     *
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException
     */
    protected function readToken(Token $prev): Token
    {
        $this->pos = $prev->getEnd();

        $this->skipWhitespace();

        // Taken after skipping whitespace so they describe the start of the new token.
        $line = $this->line;
        $col  = (1 + $this->pos) - $this->lineStart;

        if ($this->pos >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $col, $prev);
        }

        $code = $this->readCharCode($this->pos);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        // (the map holds exactly these thirteen character codes)
        if (isset(self::$codeTokenKindMap[$code])) {
            return $this->lexPunctuation($code, $line, $col, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $col, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $col, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        // (digits never get here; they are consumed by the number branch above)
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $col, $prev);
        }

        $hasRoomForThree = $this->bodyLength >= 3;

        // Spread: ...
        if ($hasRoomForThree && $this->isSpread($code)) {
            return $this->lexSpread($line, $col, $prev);
        }

        // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
        if ($this->isString($code)) {
            return $this->lexString($line, $col, $prev);
        }

        // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
        if ($hasRoomForThree && $this->isTripleQuote($code)) {
            return $this->lexBlockString($line, $col, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
253
254
    /**
     * Creates the Start Of File (SOF) token the lexer is focused on before anything is read.
     *
     * @return Token
     */
    protected function createStartOfFileToken(): Token
    {
        $kind = TokenKindEnum::SOF;

        return new Token($kind);
    }
261
262
    /**
     * Creates an End Of File (EOF) token.
     *
     * The token starts and ends at the length of the body.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $col, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $col, $prev);
    }
274
275
    /**
     * Reads a punctuation token from the source file.
     *
     * The token covers exactly one character, starting at the current position.
     * Although the declared return type is nullable, no code path returns null.
     *
     * @param int   $code
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the character code is not in the punctuation map.
     */
    protected function lexPunctuation(int $code, int $line, int $col, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $begin = $this->pos;

        return new Token($kind, $begin, $begin + 1, $line, $col, $prev);
    }
293
294
    /**
     * Reads a name token from the source file.
     *
     * The character at the current position is taken as the first character of
     * the name without being re-checked; the cursor then moves forward for as
     * long as isAlphaNumeric() accepts the following characters.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexName(int $line, int $col, Token $prev): Token
    {
        $begin = $this->pos;

        for (++$this->pos; $this->pos !== $this->bodyLength; ++$this->pos) {
            if (!isAlphaNumeric($this->readCharCode($this->pos))) {
                break;
            }
        }

        $end  = $this->pos;
        $name = sliceString($this->body, $begin, $end);

        return new Token(TokenKindEnum::NAME, $begin, $end, $line, $col, $prev, $name);
    }
318
319
    /**
     * Reads a number (int or float) token from the source file.
     *
     * Accepted shape: an optional minus sign, an integer part (a lone 0, or a
     * run of digits), an optional fraction and an optional exponent. The token
     * is a FLOAT when a fraction or an exponent is present, otherwise an INT.
     *
     * @param int   $code The character code at the current position.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When a digit follows a leading 0 or an expected digit is missing.
     */
    protected function lexNumber(int $code, int $line, int $col, Token $prev): Token
    {
        $begin = $this->pos;
        $kind  = TokenKindEnum::INT;

        // Optional leading minus sign.
        if (45 === $code) {
            $code = $this->readCharCode(++$this->pos);
        }

        if (48 !== $code) {
            // Integer part that does not start with 0: at least one digit is required.
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        } else {
            // A leading 0 must not be followed by another digit.
            $code = $this->readCharCode(++$this->pos);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fraction: "." followed by at least one digit.
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $code = $this->readCharCode(++$this->pos);
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        }

        // Exponent: "e" or "E", an optional sign, then at least one digit.
        if (69 === $code || 101 === $code) {
            $kind = TokenKindEnum::FLOAT;

            $code = $this->readCharCode(++$this->pos);

            if (43 === $code || 45 === $code) {
                $code = $this->readCharCode(++$this->pos);
            }

            $this->skipDigits($code);
        }

        $end = $this->pos;

        return new Token($kind, $begin, $end, $line, $col, $prev, sliceString($this->body, $begin, $end));
    }
386
387
    /**
     * Skips digits at the current position.
     *
     * At least one digit is required: the given code must itself be a digit.
     * Afterwards the cursor rests on the first non-digit character.
     *
     * @param int $code The character code at the current position.
     * @throws SyntaxErrorException When the given code is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        do {
            $code = $this->readCharCode(++$this->pos);
        } while (isNumber($code));
    }
407
408
    /**
     * Reads a comment token from the source file.
     *
     * The comment runs from the character at the current position up to, but
     * not including, the first character whose code is 0x1f or lower and is not
     * a tab. The token value omits the first character of the comment.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexComment(int $line, int $col, Token $prev): Token
    {
        $begin = $this->pos;

        while (true) {
            $code = $this->readCharCode(++$this->pos);

            // Stop at anything that is neither a tab nor above the control range;
            // readCharCode() yields 0 past the end of the body, which also stops here.
            if ($code <= 0x001f && 0x0009 !== $code) {
                break;
            }
        }

        $end  = $this->pos;
        $text = sliceString($this->body, $begin + 1, $end);

        return new Token(TokenKindEnum::COMMENT, $begin, $end, $line, $col, $prev, $text);
    }
434
435
    /**
     * Reads a spread token from the source.
     *
     * The token spans three characters starting at the current position; the
     * caller is expected to have checked that they are "...".
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexSpread(int $line, int $col, Token $prev): Token
    {
        $begin = $this->pos;

        return new Token(TokenKindEnum::SPREAD, $begin, $begin + 3, $line, $col, $prev);
    }
447
448
    /**
     * Reads a string token from the source.
     *
     * Scans from the opening quote to the closing quote on the same line. The
     * token value is built from the raw chunks between escape sequences plus
     * the text appended for each escape sequence.
     *
     * NOTE(review): the escapes \b, \f, \n, \r, \t and \uXXXX are appended as the
     * literal backslash sequence (single-quoted PHP strings), not as the
     * character they denote, while \" \/ and \\ are decoded. Confirm whether
     * consumers rely on this before changing it.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character, an invalid escape sequence or a missing closing quote.
     */
    protected function lexString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $chunkStart = ++$this->pos; // skip the opening quote
        $value      = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            // A string may not span lines; reaching a line terminator means it is unterminated.
            if (isLineTerminator($code)) {
                break;
            }

            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->pos);
                return new Token(TokenKindEnum::STRING, $start, $this->pos + 1, $line, $col, $prev, $value);
            }

            // NOTE(review): isSourceCharacter() is defined elsewhere; given the error
            // raised here it presumably flags characters that are NOT allowed - confirm.
            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->pos;

            if (92 === $code) {
                // \ : flush the chunk read so far, without the backslash itself
                $value .= sliceString($this->body, $chunkStart, $this->pos - 1);

                $code = $this->readCharCode($this->pos);

                switch ($code) {
                    case 34: // "
                        $value .= '"';
                        break;
                    case 47: // /
                        $value .= '/';
                        break;
                    case 92: // \
                        $value .= '\\';
                        break;
                    case 98: // b
                        $value .= '\b';
                        break;
                    case 102: // f
                        $value .= '\f';
                        break;
                    case 110: // n
                        $value .= '\n';
                        break;
                    case 114: // r
                        $value .= '\r';
                        break;
                    case 116: // t
                        $value .= '\t';
                        break;
                    case 117: // u
                        $unicodeString = sliceString($this->body, $this->pos + 1, $this->pos + 5);

                        if (!\preg_match('/[0-9A-Fa-f]{4}/', $unicodeString)) {
                            throw $this->createSyntaxErrorException(
                                \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
                            );
                        }

                        $value     .= '\\u' . $unicodeString;
                        $this->pos += 4; // the four hex digits; the "u" is skipped below

                        break;
                    default:
                        // mb_chr() rather than chr(): the code may be above 255, where
                        // chr() would wrap around and emit a byte that is not valid UTF-8.
                        throw $this->createSyntaxErrorException(
                            \sprintf('Invalid character escape sequence: \\%s.', \mb_chr($code, self::ENCODING))
                        );
                }

                ++$this->pos;

                $chunkStart = $this->pos;
            }
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
537
538
    /**
     * Reads a block string token from the source file.
     *
     * Scans from the opening triple-quote to the closing triple-quote. An
     * escaped triple-quote (\""") contributes a plain """ to the raw value. The
     * raw value is passed through blockStringValue() to produce the token value.
     *
     * Line terminators inside the block string advance the lexer's line
     * bookkeeping, so tokens following a multi-line block string report the
     * correct line and column. The block string token itself keeps the line and
     * column it was started with.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character or a missing closing triple-quote.
     */
    protected function lexBlockString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $this->pos  = $start + 3; // skip the triple-quote
        $chunkStart = $this->pos;
        $rawValue   = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $rawValue .= sliceString($this->body, $chunkStart, $this->pos);
                return new Token(
                    TokenKindEnum::BLOCK_STRING,
                    $start,
                    $this->pos + 3,
                    $line,
                    $col,
                    $prev,
                    blockStringValue($rawValue)
                );
            }

            // NOTE(review): isSourceCharacter() is defined elsewhere; given the error
            // raised here it presumably flags characters that are NOT allowed - confirm.
            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (10 === $code) {
                // new line (\n)
                ++$this->pos;
                ++$this->line;
                $this->lineStart = $this->pos;
            } elseif (13 === $code) {
                // carriage return (\r), optionally followed by a new line (\r\n)
                $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                ++$this->line;
                $this->lineStart = $this->pos;
            } elseif ($this->isEscapedTripleQuote($code)) {
                $rawValue   .= sliceString($this->body, $chunkStart, $this->pos) . '"""';
                $this->pos  += 4;
                $chunkStart = $this->pos;
            } else {
                ++$this->pos;
            }
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
586
587
    /**
     * Skips whitespace at the current position.
     *
     * Tabs, spaces, commas and the byte order mark are stepped over. Line
     * terminators (\n, \r and \r\n) are stepped over as well and start a new
     * line: the line counter is incremented and the line start is moved to the
     * character after the terminator.
     */
    protected function skipWhitespace(): void
    {
        while ($this->pos < $this->bodyLength) {
            switch ($this->readCharCode($this->pos)) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->pos;
                    break;

                case 10: // new line (\n)
                    ++$this->pos;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                case 13: // carriage return (\r), possibly followed by a new line (\r\n)
                    $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                default:
                    // First character that is not ignored: stop here.
                    return;
            }
        }
    }
618
619
    /**
     * Returns the character code of the character at the given position of the body.
     *
     * Yields 0 when there is no character at that position. Results are
     * memoised per character in a static cache.
     *
     * @param int $pos Character offset into the body.
     * @return int
     */
    protected function readCharCode(int $pos): int
    {
        $character = \mb_substr($this->body, $pos, 1, self::ENCODING);

        if ('' === $character) {
            return 0;
        }

        if (isset(self::$charCodeCache[$character])) {
            return self::$charCodeCache[$character];
        }

        // ord() is enough for single-byte characters; anything from 128 upwards
        // needs the multi-byte aware lookup.
        $firstByte = \ord($character);
        $charCode  = $firstByte < 128 ? $firstByte : \mb_ord($character, self::ENCODING);

        return self::$charCodeCache[$character] = $charCode;
    }
643
644
    /**
     * Builds the message reporting that an unexpected character was encountered.
     *
     * Three cases are distinguished, in this order: a character flagged by
     * isSourceCharacter() that is not a line terminator, a single quote, and
     * anything else.
     *
     * @param int $code
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        if (isSourceCharacter($code) && !isLineTerminator($code)) {
            $template = 'Cannot contain the invalid character %s.';
        } elseif (39 === $code) {
            // '
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        } else {
            $template = 'Cannot parse the unexpected character %s.';
        }

        return \sprintf($template, printCharCode($code));
    }
663
664
    /**
     * Tells whether a spread (...) starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        $second = $this->readCharCode($this->pos + 1);

        return 46 === $second && 46 === $this->readCharCode($this->pos + 2);
    }
674
675
    /**
     * Tells whether a regular (single-line) string starts at the current position.
     *
     * That is the case for any double quote that does not open a block string
     * ("""). In particular an empty string ("") counts as a string, even though
     * its opening quote is immediately followed by another quote.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        return 34 === $code && !$this->isTripleQuote($code);
    }
683
684
    /**
     * Tells whether a triple-quote (""") starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        if (34 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 34 === $this->readCharCode($this->pos + 2);
    }
694
695
    /**
     * Tells whether an escaped triple-quote (\""") starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        if (92 !== $code) {
            return false;
        }

        // The backslash must be followed by three double quotes.
        for ($offset = 1; $offset <= 3; ++$offset) {
            if (34 !== $this->readCharCode($this->pos + $offset)) {
                return false;
            }
        }

        return true;
    }
706
}
707