Passed
Pull Request — master (#196)
by Christoffer
02:52
created

Lexer::skipWhitespace()   D

Complexity

Conditions 9
Paths 6

Size

Total Lines 25
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 25
rs 4.909
c 0
b 0
f 0
cc 9
eloc 17
nc 6
nop 0
1
<?php
2
3
namespace Digia\GraphQL\Language;
4
5
use Digia\GraphQL\Error\SyntaxErrorException;
6
7
class Lexer implements LexerInterface
8
{
9
    /**
     * Lookup table from a punctuation character code to its token kind.
     *
     * @var array
     */
    protected static $codeTokenKindMap = [
        33  => TokenKindEnum::BANG,      // !
        36  => TokenKindEnum::DOLLAR,    // $
        38  => TokenKindEnum::AMP,       // &
        40  => TokenKindEnum::PAREN_L,   // (
        41  => TokenKindEnum::PAREN_R,   // )
        58  => TokenKindEnum::COLON,     // :
        61  => TokenKindEnum::EQUALS,    // =
        64  => TokenKindEnum::AT,        // @
        91  => TokenKindEnum::BRACKET_L, // [
        93  => TokenKindEnum::BRACKET_R, // ]
        123 => TokenKindEnum::BRACE_L,   // {
        124 => TokenKindEnum::PIPE,      // |
        125 => TokenKindEnum::BRACE_R,   // }
    ];

    /**
     * The source being lexed.
     *
     * @var Source
     */
    protected $source;

    /**
     * The body (contents) of the source.
     *
     * @var string
     */
    protected $body;

    /**
     * The length of the body; positions at or beyond it are treated as end of file.
     *
     * @var int
     */
    protected $bodyLength;

    /**
     * Options passed to the lexer, readable through `getOption()`.
     *
     * @var array
     */
    protected $options = [];

    /**
     * The non-ignored token that was focused before the current one.
     *
     * @var Token
     */
    protected $lastToken;

    /**
     * The non-ignored token that is currently focused.
     *
     * @var Token
     */
    protected $token;

    /**
     * The current read position within the body.
     *
     * @var int
     */
    protected $pos;

    /**
     * The (1-indexed) number of the line the lexer is currently on.
     *
     * @var int
     */
    protected $line;

    /**
     * The offset at which the current line starts; used to derive column numbers.
     *
     * @var int
     */
    protected $lineStart;

    /**
     * Cache of character codes keyed by character, filled lazily by `readCharCode()`
     * and shared by all lexer instances.
     *
     * @var array
     */
    protected static $charCodeCache = [];
97
98
    /**
     * Lexer constructor.
     *
     * Stores the source and the options, and seeds both the current and the last token
     * with a Start Of File token so that the first `lookahead()` has a token to read from.
     *
     * @param Source $source  The source to lex.
     * @param array  $options Lexer options, readable through `getOption()`.
     */
    public function __construct(Source $source, array $options)
    {
        $startOfFileToken = $this->createStartOfFileToken();

        $this->lastToken = $startOfFileToken;
        $this->token     = $startOfFileToken;
        $this->pos       = 0;
        $this->line      = 1;
        $this->lineStart = 0;
        $this->body      = $source->getBody();
        // Positions are character offsets (readCharCode() reads with mb_substr()), so the
        // length has to be counted in characters too. strlen() counts bytes, which over-counts
        // multi-byte input and keeps the end-of-file check in readToken() from ever matching.
        $this->bodyLength = \mb_strlen($this->body, 'UTF-8');
        $this->source     = $source;
        $this->options    = $options;
    }
116
117
    /**
     * Moves the lexer one non-ignored token forward: the current token becomes the
     * last token and the result of `lookahead()` becomes the current token.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function advance(): Token
    {
        $this->lastToken = $this->token;
        $this->token     = $this->lookahead();

        return $this->token;
    }
126
127
    /**
     * Returns the next non-comment token after the current one without changing
     * the lexer's current token. Tokens read along the way are linked via `setNext()`.
     *
     * @inheritdoc
     * @throws SyntaxErrorException
     */
    public function lookahead(): Token
    {
        $current = $this->token;

        // Nothing follows the end of file; hand the EOF token back as is.
        if (TokenKindEnum::EOF === $current->getKind()) {
            return $current;
        }

        do {
            $following = $this->readToken($current);
            $current->setNext($following);
            $current = $following;
        } while (TokenKindEnum::COMMENT === $current->getKind());

        return $current;
    }
145
146
    /**
     * Returns the option stored under the given name, or `$default` when it is
     * missing or null.
     *
     * @inheritdoc
     */
    public function getOption(string $name, $default = null)
    {
        if (isset($this->options[$name])) {
            return $this->options[$name];
        }

        return $default;
    }
153
154
    /**
     * Returns the kind of the currently focused token.
     *
     * @inheritdoc
     */
    public function getTokenKind(): string
    {
        $current = $this->token;

        return $current->getKind();
    }
161
162
    /**
     * Returns the value of the currently focused token.
     *
     * @inheritdoc
     */
    public function getTokenValue(): ?string
    {
        $current = $this->token;

        return $current->getValue();
    }
169
170
    /**
     * Returns the currently focused token.
     *
     * @inheritdoc
     */
    public function getToken(): Token
    {
        $current = $this->token;

        return $current;
    }
177
178
    /**
     * Returns the source this lexer was constructed with.
     *
     * @inheritdoc
     */
    public function getSource(): Source
    {
        $source = $this->source;

        return $source;
    }
185
186
    /**
     * Returns the token that was focused before the current one.
     *
     * @inheritdoc
     */
    public function getLastToken(): Token
    {
        $previous = $this->lastToken;

        return $previous;
    }
193
194
    /**
     * Reads the token that follows the given token.
     *
     * Starts at the end of `$prev`, skips whitespace, and dispatches on the first
     * character code to the matching `lex*()` method. The checks run in a fixed order;
     * a character that matches none of them results in a syntax error.
     *
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException
     */
    protected function readToken(Token $prev): Token
    {
        $this->pos = $prev->getEnd();
        $this->skipWhitespace();

        // Line and (1-indexed) column of the token that is about to be read.
        $line   = $this->line;
        $column = $this->pos - $this->lineStart + 1;

        if ($this->pos >= $this->bodyLength) {
            return $this->createEndOfFileToken($line, $column, $prev);
        }

        $code = $this->readCharCode($this->pos);

        // Punctuation: [!$&:=@|()\[\]{}]{1}
        // The map holds exactly these thirteen character codes.
        if (isset(self::$codeTokenKindMap[$code])) {
            return $this->lexPunctuation($code, $line, $column, $prev);
        }

        // Comment: #[\u0009\u0020-\uFFFF]*
        if (35 === $code) {
            return $this->lexComment($line, $column, $prev);
        }

        // Int:   -?(0|[1-9][0-9]*)
        // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
        if (45 === $code || isNumber($code)) {
            return $this->lexNumber($code, $line, $column, $prev);
        }

        // Name: [_A-Za-z][_0-9A-Za-z]*
        if (isAlphaNumeric($code)) {
            return $this->lexName($line, $column, $prev);
        }

        // Three-character tokens are only attempted when the body is at least that long.
        $hasRoomForThree = $this->bodyLength >= 3;

        // Spread: ...
        if ($hasRoomForThree && $this->isSpread($code)) {
            return $this->lexSpread($line, $column, $prev);
        }

        // String: "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
        if ($this->isString($code)) {
            return $this->lexString($line, $column, $prev);
        }

        // Block String: """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
        if ($hasRoomForThree && $this->isTripleQuote($code)) {
            return $this->lexBlockString($line, $column, $prev);
        }

        throw $this->createSyntaxErrorException();
    }
255
256
    /**
     * Creates a Start Of File (SOF) token.
     *
     * @return Token
     */
    protected function createStartOfFileToken(): Token
    {
        $startOfFile = new Token(TokenKindEnum::SOF);

        return $startOfFile;
    }
263
264
    /**
     * Creates an End Of File (EOF) token that starts and ends at the body length.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function createEndOfFileToken(int $line, int $col, Token $prev): Token
    {
        $end = $this->bodyLength;

        return new Token(TokenKindEnum::EOF, $end, $end, $line, $col, $prev);
    }
276
277
    /**
     * Creates the single-character punctuation token for the given character code.
     *
     * @param int   $code
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When the code has no entry in the punctuation map.
     */
    protected function lexPunctuation(int $code, int $line, int $col, Token $prev): ?Token
    {
        $kind = self::$codeTokenKindMap[$code] ?? null;

        if (null === $kind) {
            throw $this->createSyntaxErrorException();
        }

        $start = $this->pos;

        return new Token($kind, $start, $start + 1, $line, $col, $prev);
    }
295
296
    /**
     * Reads a name token: the character at the current position plus every
     * following character accepted by `isAlphaNumeric()`.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexName(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        // The first character was already accepted by the caller; scan the rest.
        for (++$this->pos; $this->pos !== $this->bodyLength; ++$this->pos) {
            if (!isAlphaNumeric($this->readCharCode($this->pos))) {
                break;
            }
        }

        $end  = $this->pos;
        $name = sliceString($this->body, $start, $end);

        return new Token(TokenKindEnum::NAME, $start, $end, $line, $col, $prev, $name);
    }
320
321
    /**
     * Reads a number token. The token is a FLOAT when a fractional part or an
     * exponent is present and an INT otherwise.
     *
     * @param int   $code The character code at the current position.
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException When a digit is missing or a digit follows a leading zero.
     */
    protected function lexNumber(int $code, int $line, int $col, Token $prev): Token
    {
        $begin = $this->pos;
        $kind  = TokenKindEnum::INT;

        // Optional sign: -
        if (45 === $code) {
            ++$this->pos;
            $code = $this->readCharCode($this->pos);
        }

        // Integer part: either a single 0 or a run of digits.
        if (48 !== $code) {
            $this->skipDigits($code);
            $code = $this->readCharCode($this->pos);
        } else {
            ++$this->pos;
            $code = $this->readCharCode($this->pos);

            if (isNumber($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid number, unexpected digit after 0: %s.', printCharCode($code))
                );
            }
        }

        // Fractional part: . followed by digits
        if (46 === $code) {
            $kind = TokenKindEnum::FLOAT;

            ++$this->pos;
            $this->skipDigits($this->readCharCode($this->pos));
            $code = $this->readCharCode($this->pos);
        }

        // Exponent: e or E, an optional sign, then digits
        if (69 === $code || 101 === $code) {
            $kind = TokenKindEnum::FLOAT;

            ++$this->pos;
            $code = $this->readCharCode($this->pos);

            if (43 === $code || 45 === $code) {
                ++$this->pos;
                $code = $this->readCharCode($this->pos);
            }

            $this->skipDigits($code);
        }

        $end = $this->pos;

        return new Token($kind, $begin, $end, $line, $col, $prev, sliceString($this->body, $begin, $end));
    }
387
388
    /**
     * Advances the position past a run of one or more digits.
     *
     * @param int $code The character code at the current position; it must be a digit.
     * @throws SyntaxErrorException When `$code` is not a digit.
     */
    protected function skipDigits(int $code): void
    {
        if (!isNumber($code)) {
            throw $this->createSyntaxErrorException(
                \sprintf('Invalid number, expected digit but got: %s.', printCharCode($code))
            );
        }

        // The current character is a digit; step forward until the next one is not.
        do {
            ++$this->pos;
        } while (isNumber($this->readCharCode($this->pos)));
    }
408
409
    /**
     * Reads a comment token. The token spans from the `#` up to (not including) the
     * first control character other than tab; its value excludes the leading `#`.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexComment(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;

        while (true) {
            ++$this->pos;
            $code = $this->readCharCode($this->pos);

            // Stop at anything that is not a SourceCharacter, or is a LineTerminator.
            if ($code <= 0x001f && 0x0009 !== $code) {
                break;
            }
        }

        $end  = $this->pos;
        $text = sliceString($this->body, $start + 1, $end);

        return new Token(TokenKindEnum::COMMENT, $start, $end, $line, $col, $prev, $text);
    }
435
436
    /**
     * Creates the three-character spread (`...`) token at the current position.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     */
    protected function lexSpread(int $line, int $col, Token $prev): Token
    {
        $start = $this->pos;
        $end   = $start + 3;

        return new Token(TokenKindEnum::SPREAD, $start, $end, $line, $col, $prev);
    }
448
449
    /**
     * Reads a single-line string token from the source.
     *
     * Scans from the opening quote to the closing quote. The value is assembled from
     * chunks of plain text between escape sequences. A line terminator or the end of
     * the body before the closing quote results in an "Unterminated string." error.
     *
     * NOTE(review): the escapes \b, \f, \n, \r, \t and \uXXXX are appended as the literal
     * backslash sequences (single-quoted PHP strings), not as the characters they denote,
     * while \" \/ and \\ are decoded. This is kept as is because callers may rely on it;
     * confirm whether decoding is expected.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character, an invalid escape sequence
     *                              or a missing closing quote.
     */
    protected function lexString(int $line, int $col, Token $prev): Token
    {
        $start      = $this->pos;
        $chunkStart = ++$this->pos; // skip the opening quote
        $value      = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            if (isLineTerminator($code)) {
                break;
            }

            // Closing Quote (")
            if (34 === $code) {
                $value .= sliceString($this->body, $chunkStart, $this->pos);

                return new Token(TokenKindEnum::STRING, $start, $this->pos + 1, $line, $col, $prev, $value);
            }

            // NOTE(review): the helper's name suggests the opposite of a rejection test;
            // confirm that it returns true for characters that are NOT allowed.
            if (isSourceCharacter($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            ++$this->pos;

            if (92 !== $code) {
                continue;
            }

            // Backslash: flush the plain text before it, then handle the escape character.
            $value .= sliceString($this->body, $chunkStart, $this->pos - 1);

            $code = $this->readCharCode($this->pos);

            switch ($code) {
                case 34: // "
                    $value .= '"';
                    break;
                case 47: // /
                    $value .= '/';
                    break;
                case 92: // \
                    $value .= '\\';
                    break;
                case 98: // b
                    $value .= '\b';
                    break;
                case 102: // f
                    $value .= '\f';
                    break;
                case 110: // n
                    $value .= '\n';
                    break;
                case 114: // r
                    $value .= '\r';
                    break;
                case 116: // t
                    $value .= '\t';
                    break;
                case 117: // u
                    $unicodeString = sliceString($this->body, $this->pos + 1, $this->pos + 5);

                    // Anchored so that the whole slice must be exactly four hex digits.
                    if (!\preg_match('/\A[0-9A-Fa-f]{4}\z/', $unicodeString)) {
                        throw $this->createSyntaxErrorException(
                            \sprintf('Invalid character escape sequence: \\u%s.', $unicodeString)
                        );
                    }

                    $value     .= '\\u' . $unicodeString;
                    $this->pos += 4;

                    break;
                default:
                    // mb_chr() rather than chr(): chr() wraps code points above 255 and would
                    // print a wrong byte for a non-ASCII escape character.
                    throw $this->createSyntaxErrorException(
                        \sprintf('Invalid character escape sequence: \\%s.', \mb_chr($code, 'UTF-8'))
                    );
            }

            ++$this->pos;

            $chunkStart = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
538
539
    /**
     * Reads a block string (triple-quoted) token from the source.
     *
     * Collects the raw text between the opening and closing `"""`, turning each
     * escaped triple quote (`\"""`) into a plain `"""`, and passes the raw text
     * through `blockStringValue()` to obtain the token value.
     *
     * @param int   $line
     * @param int   $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxErrorException On an invalid character or a missing closing triple quote.
     */
    protected function lexBlockString(int $line, int $col, Token $prev): Token
    {
        $begin     = $this->pos;
        $this->pos = $begin + 3; // skip the opening triple quote
        $pending   = $this->pos; // start of the text not yet appended to $raw
        $raw       = '';

        while ($this->pos < $this->bodyLength) {
            $code = $this->readCharCode($this->pos);

            // Closing Triple-Quote (""")
            if ($this->isTripleQuote($code)) {
                $raw .= sliceString($this->body, $pending, $this->pos);
                $end  = $this->pos + 3;

                return new Token(
                    TokenKindEnum::BLOCK_STRING,
                    $begin,
                    $end,
                    $line,
                    $col,
                    $prev,
                    blockStringValue($raw)
                );
            }

            // Line terminators are allowed inside a block string.
            if (isSourceCharacter($code) && !isLineTerminator($code)) {
                throw $this->createSyntaxErrorException(
                    \sprintf('Invalid character within String: %s.', printCharCode($code))
                );
            }

            if (!$this->isEscapedTripleQuote($code)) {
                ++$this->pos;
                continue;
            }

            // Escaped Triple-Quote (\"""): keep the quotes, drop the backslash.
            $raw       .= sliceString($this->body, $pending, $this->pos) . '"""';
            $this->pos += 4;
            $pending    = $this->pos;
        }

        throw $this->createSyntaxErrorException('Unterminated string.');
    }
587
588
    /**
     * Advances the position past ignored characters (tab, space, comma, BOM and
     * line breaks), updating the line number and line start on every line break.
     */
    protected function skipWhitespace(): void
    {
        while ($this->pos < $this->bodyLength) {
            switch ($this->readCharCode($this->pos)) {
                case 9:      // tab
                case 32:     // space
                case 44:     // comma
                case 0xfeff: // BOM
                    ++$this->pos;
                    break;

                case 10: // new line (\n)
                    ++$this->pos;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                case 13: // carriage return (\r), optionally followed by a new line (\r\n)
                    $this->pos += (10 === $this->readCharCode($this->pos + 1)) ? 2 : 1;
                    ++$this->line;
                    $this->lineStart = $this->pos;
                    break;

                default:
                    // First character that is not ignored: stop here.
                    return;
            }
        }
    }
619
620
    /**
     * Returns the code point of the character at the given character offset in the body.
     *
     * Results are memoised per character in the static `$charCodeCache`.
     *
     * @param int $pos Character offset into the body.
     * @return int The code point, or 0 when there is no character at `$pos`
     *             or the character cannot be decoded.
     */
    protected function readCharCode(int $pos): int
    {
        $char = \mb_substr($this->body, $pos, 1, 'UTF-8');

        if ('' === $char) {
            // Out of range. mb_ord('') returns false (and throws a ValueError on PHP 8),
            // which does not satisfy the int return type; report 0 explicitly instead.
            return 0;
        }

        if (!isset(self::$charCodeCache[$char])) {
            // Use the same explicit encoding as mb_substr() above instead of relying on the
            // internal default; an undecodable character (false) is stored as 0.
            self::$charCodeCache[$char] = (int) \mb_ord($char, 'UTF-8');
        }

        return self::$charCodeCache[$char];
    }
634
635
    /**
     * Builds a `SyntaxErrorException` for the current position in the source.
     *
     * When no description is given, one is generated from the character at the
     * current position.
     *
     * @param null|string $description
     * @return SyntaxErrorException
     */
    protected function createSyntaxErrorException(?string $description = null): SyntaxErrorException
    {
        if (null === $description) {
            $description = $this->unexpectedCharacterMessage($this->readCharCode($this->pos));
        }

        return new SyntaxErrorException($this->source, $this->pos, $description);
    }
649
650
    /**
     * Builds the error message for a character that cannot start any token.
     *
     * @param int $code The code of the offending character.
     * @return string
     */
    protected function unexpectedCharacterMessage(int $code): string
    {
        $isInvalid = isSourceCharacter($code) && !isLineTerminator($code);

        if ($isInvalid) {
            return \sprintf('Cannot contain the invalid character %s.', printCharCode($code));
        }

        // A single quote (') gets a dedicated hint.
        if (39 === $code) {
            return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
        }

        return \sprintf('Cannot parse the unexpected character %s.', printCharCode($code));
    }
669
670
    /**
     * Tells whether three dots (`...`) start at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isSpread(int $code): bool
    {
        if (46 !== $code) {
            return false;
        }

        if (46 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 46 === $this->readCharCode($this->pos + 2);
    }
680
681
    /**
     * Tells whether a single-line string starts at the current position, i.e. the
     * character is a double quote that does not open a triple quote (`"""`).
     *
     * Checking only that the next character is not a quote would reject the empty
     * string literal (`""`), so the full triple-quote test is used instead.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isString(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        // Fewer than three characters remain: this cannot be a triple quote, and the
        // check also avoids reading past the end of the body.
        if ($this->pos + 2 >= $this->bodyLength) {
            return true;
        }

        return !$this->isTripleQuote($code);
    }
689
690
    /**
     * Tells whether three double quotes (`"""`) start at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isTripleQuote(int $code): bool
    {
        if (34 !== $code) {
            return false;
        }

        if (34 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        return 34 === $this->readCharCode($this->pos + 2);
    }
700
701
    /**
     * Tells whether an escaped triple quote (`\"""`) starts at the current position.
     *
     * @param int $code The character code at the current position.
     * @return bool
     */
    protected function isEscapedTripleQuote(int $code): bool
    {
        // Must start with a backslash ...
        if (92 !== $code) {
            return false;
        }

        // ... followed by three double quotes, checked left to right.
        if (34 !== $this->readCharCode($this->pos + 1)) {
            return false;
        }

        if (34 !== $this->readCharCode($this->pos + 2)) {
            return false;
        }

        return 34 === $this->readCharCode($this->pos + 3);
    }
712
}
713