NfaBuilder::onBeginProduction()   F
last analyzed

Complexity

Conditions 32
Paths 53

Size

Total Lines 132
Code Lines 104

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
cc 32
eloc 104
nc 53
nop 2
dl 0
loc 132
rs 3.3333
c 2
b 1
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace Remorhaz\UniLex\RegExp\FSM;
6
7
use Remorhaz\IntRangeSets\Range;
8
use Remorhaz\IntRangeSets\RangeSet;
9
use Remorhaz\IntRangeSets\RangeSetInterface;
10
use Remorhaz\UCD\PropertyRangeLoader;
11
use Remorhaz\UCD\PropertyRangeLoaderInterface;
12
use Remorhaz\UniLex\AST\AbstractTranslatorListener;
13
use Remorhaz\UniLex\AST\Node;
14
use Remorhaz\UniLex\AST\Symbol;
15
use Remorhaz\UniLex\AST\Translator;
16
use Remorhaz\UniLex\AST\Tree;
17
use Remorhaz\UniLex\IO\CharBufferInterface;
18
use Remorhaz\UniLex\Exception;
19
use Remorhaz\UniLex\RegExp\AST\NodeType;
20
use Remorhaz\UniLex\RegExp\ParserFactory;
21
use Remorhaz\UniLex\Stack\PushInterface;
22
23
class NfaBuilder extends AbstractTranslatorListener
24
{
25
    private $nfa;
26
27
    private $propertyLoader;
28
29
    private $rangeTransitionMap;
30
31
    private $languageBuilder;
32
33
    private $startState;
34
35
    public function __construct(Nfa $nfa, PropertyRangeLoaderInterface $propertyLoader)
36
    {
37
        $this->nfa = $nfa;
38
        $this->propertyLoader = $propertyLoader;
39
    }
40
41
    /**
42
     * @param CharBufferInterface $buffer
43
     * @return Nfa
44
     * @throws Exception
45
     */
46
    public static function fromBuffer(CharBufferInterface $buffer): Nfa
47
    {
48
        $tree = new Tree();
49
        ParserFactory::createFromBuffer($tree, $buffer)->run();
50
51
        return self::fromTree($tree);
52
    }
53
54
    /**
55
     * @param Tree $tree
56
     * @return Nfa
57
     * @throws Exception
58
     */
59
    public static function fromTree(Tree $tree): Nfa
60
    {
61
        $nfa = new Nfa();
62
        (new Translator($tree, new self($nfa, PropertyRangeLoader::create())))->run();
63
64
        return $nfa;
65
    }
66
67
    /**
68
     * @param int $state
69
     * @throws Exception
70
     */
71
    public function setStartState(int $state): void
72
    {
73
        if (isset($this->startState)) {
74
            throw new Exception("Start state is already set");
75
        }
76
        $this->startState = $state;
77
    }
78
79
    /**
80
     * @param Node $node
81
     * @throws Exception
82
     */
83
    public function onStart(Node $node): void
84
    {
85
        $stateIn = $this->getStartState();
86
        $node->setAttribute('state_in', $stateIn);
87
        $stateOut = $this->createState();
88
        $this
89
            ->nfa
90
            ->getStateMap()
91
            ->addFinishState($stateOut);
92
        $node->setAttribute('state_out', $stateOut);
93
        $node->setAttribute('in_range', false);
94
    }
95
96
    /**
97
     * @return int
98
     * @throws Exception
99
     */
100
    private function getStartState(): int
101
    {
102
        if (!isset($this->startState)) {
103
            $this->startState = $this->createState();
104
            $this
105
                ->nfa
106
                ->getStateMap()
107
                ->addStartState($this->startState);
108
        }
109
110
        return $this->startState;
111
    }
112
113
    /**
114
     * @param Node          $node
115
     * @param PushInterface $stack
116
     * @throws Exception
117
     */
118
    public function onBeginProduction(Node $node, PushInterface $stack): void
119
    {
120
        switch ($node->getName()) {
121
            case NodeType::ASSERT:
122
                throw new Exception("AST nodes of type '{$node->getName()}' are not supported yet");
123
                break;
124
125
            case NodeType::EMPTY:
126
            case NodeType::SYMBOL:
127
            case NodeType::SYMBOL_ANY:
128
            case NodeType::SYMBOL_CTL:
129
            case NodeType::ESC_SIMPLE:
130
                if (!empty($node->getChildList())) {
131
                    throw new Exception("AST node '{$node->getName()}' should not have child nodes");
132
                }
133
                break;
134
135
            case NodeType::SYMBOL_PROP:
136
                $propName = $node->getStringAttribute('name');
137
                $propRangeSet = $this->propertyLoader->getRangeSet($propName);
138
                $rangeSet = $node->getAttribute('not')
139
                    ? $propRangeSet->createSymmetricDifference(
140
                        RangeSet::createUnsafe(new Range(0x00, 0x10FFFF))
141
                    )
142
                    : $propRangeSet;
143
                [$stateIn, $stateOut] = $this->getNodeStates($node);
144
                $this->setRangeTransition($stateIn, $stateOut, $rangeSet);
145
                break;
146
147
            case NodeType::SYMBOL_CLASS:
148
                if (empty($node->getChildList())) {
149
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
150
                }
151
                [$stateIn, $stateOut] = $this->getNodeStates($node);
152
                $symbolList = [];
153
                foreach ($node->getChildIndexList() as $index) {
154
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
155
                }
156
                $stack->push(...$symbolList);
157
                break;
158
159
            case NodeType::SYMBOL_RANGE:
160
                if (count($node->getChildList()) != 2) {
161
                    throw new Exception("AST node '{$node->getName()}' should have exactly two child nodes");
162
                }
163
                [$stateIn, $stateOut] = $this->getNodeStates($node);
164
                $symbolList = [];
165
                foreach ($node->getChildIndexList() as $index) {
166
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
167
                }
168
                $stack->push(...$symbolList);
169
                break;
170
171
            case NodeType::REPEAT:
172
                if (count($node->getChildList()) != 1) {
173
                    throw new Exception("AST node '{$node->getName()}' should have exactly one child node");
174
                }
175
                $min = $node->getAttribute('min');
176
                $symbolList = [];
177
                $stateOut = null;
178
                // Prefix concatenation construction
179
                for ($index = 0; $index < $min; $index++) {
180
                    $stateIn = $stateOut ?? $node->getIntAttribute('state_in');
181
                    $isLastNode =
182
                        !$node->getAttribute('is_max_infinite') &&
183
                        $index + 1 == $node->getAttribute('max');
184
                    $stateOut = $isLastNode
185
                        ? $node->getIntAttribute('state_out')
186
                        : $this->createState();
187
                    $symbolList[] = $this->createSymbolFromClonedNodeChild($node, $stateIn, $stateOut);
188
                }
189
                if ($node->getAttribute('is_max_infinite')) {
190
                    // Postfix Kleene star construction
191
                    $stateIn = $stateOut ?? $node->getAttribute('state_in');
192
                    $stateOut = $node->getAttribute('state_out');
193
                    $symbolList[] = $this->createKleeneStarSymbolFromNode($node, $stateIn, $stateOut);
194
                    $stack->push(...$symbolList);
195
                    break;
196
                }
197
                $max = $node->getAttribute('max');
198
                if ($min > $max) {
199
                    $message = "AST node '{$node->getName()}' has invalid attributes: min({$min}) > max({$max})";
200
                    throw new Exception($message);
201
                }
202
                // Postfix optional concatenation construction
203
                for ($index = $min; $index < $max; $index++) {
204
                    $stateIn = $stateOut ?? $node->getAttribute('state_in');
205
                    $stateOut = $index == $max - 1
206
                        ? $node->getAttribute('state_out')
207
                        : $this->createState();
208
                    $optStateOut = $node->getAttribute('state_out');
209
                    $this->addEpsilonTransition($stateIn, $optStateOut);
210
                    $symbolList[] = $this->createSymbolFromClonedNodeChild($node, $stateIn, $stateOut);
211
                }
212
                $stack->push(...$symbolList);
213
                break;
214
215
            case NodeType::CONCATENATE:
216
                if (empty($node->getChildList())) {
217
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
218
                }
219
                $symbolList = [];
220
                $stateOut = null;
221
                $maxIndex = count($node->getChildList()) - 1;
222
                foreach ($node->getChildIndexList() as $index) {
223
                    $stateIn = $stateOut ?? $node->getIntAttribute('state_in');
224
                    $stateOut = $index == $maxIndex
225
                        ? $node->getIntAttribute('state_out')
226
                        : $this->createState();
227
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
228
                }
229
                $stack->push(...$symbolList);
230
                break;
231
232
            case NodeType::ALTERNATIVE:
233
                if (empty($node->getChildList())) {
234
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
235
                }
236
                $symbolList = [];
237
                [$headerStateIn, $headerStateOut] = $this->getNodeStates($node);
238
                foreach ($node->getChildIndexList() as $index) {
239
                    $stateIn = $this->createState();
240
                    $stateOut = $this->createState();
241
                    $this->addEpsilonTransition($headerStateIn, $stateIn);
242
                    $this->addEpsilonTransition($stateOut, $headerStateOut);
243
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
244
                }
245
                $stack->push(...$symbolList);
246
                break;
247
248
            default:
249
                throw new Exception("Unknown AST node name: {$node->getName()}");
250
        }
251
    }
252
253
    /**
254
     * @param Symbol        $symbol
255
     * @param PushInterface $stack
256
     * @throws Exception
257
     */
258
    public function onSymbol(Symbol $symbol, PushInterface $stack): void
259
    {
260
        $inRange = $symbol->getHeader()->getName() == NodeType::SYMBOL_RANGE
261
            ? true
262
            : $symbol->getHeader()->getAttribute('in_range');
263
        $symbol
264
            ->getSymbol()
265
            ->setAttribute('in_range', $inRange);
266
        $stack->push($symbol->getSymbol());
267
    }
268
269
    /**
270
     * @param Node $node
271
     * @throws Exception
272
     */
273
    public function onFinishProduction(Node $node): void
274
    {
275
        [$stateIn, $stateOut] = $this->getNodeStates($node);
276
        $inRange = $node->getAttribute('in_range');
277
        switch ($node->getName()) {
278
            case NodeType::SYMBOL_RANGE:
279
                $startChar = $node->getChild(0)->getAttribute('range_code');
280
                $finishChar = $node->getChild(1)->getAttribute('range_code');
281
                if ($startChar > $finishChar) {
282
                    throw new Exception("Invalid range: start char is greater than finish char");
283
                }
284
                $this->addRangeTransition($stateIn, $stateOut, $startChar, $finishChar);
285
                break;
286
287
            case NodeType::SYMBOL:
288
                $code = $node->getAttribute('code');
289
                $inRange
290
                    ? $node->setAttribute('range_code', $code)
291
                    : $this->addRangeTransition($stateIn, $stateOut, $code);
292
                break;
293
294
            case NodeType::EMPTY:
295
                if ($inRange) {
296
                    throw new Exception("Invalid range component: no matching chars");
297
                }
298
                $this->addEpsilonTransition($stateIn, $stateOut);
299
                break;
300
301
            case NodeType::SYMBOL_ANY:
302
                if ($inRange) {
303
                    throw new Exception("Invalid range component: any char is matching");
304
                }
305
                $this->addRangeTransition($stateIn, $stateOut, 0x00, 0x10FFFF);
306
                break;
307
308
            case NodeType::SYMBOL_CTL:
309
                $code = $node->getAttribute('code');
310
                $controlCode = $this->getControlCode($code);
311
                $inRange
312
                    ? $node->setAttribute('range_code', $controlCode)
313
                    : $this->addRangeTransition($stateIn, $stateOut, $controlCode);
314
                break;
315
316
            case NodeType::ESC_SIMPLE:
317
                $code = $node->getAttribute('code');
318
                $singleCharMap = [
319
                    0x61 => 0x07, // \a: alert/BEL
320
                    0x65 => 0x1B, // \e: escape/ESC
321
                    0x66 => 0x0C, // \f: form feed/FF
322
                    0x6E => 0x0A, // \n: line feed/LF
323
                    0x72 => 0x0D, // \r: carriage return/CR
324
                    0x74 => 0x09, // \t: tab/HT
325
                ];
326
                if (isset($singleCharMap[$code])) {
327
                    $escapedCode = $singleCharMap[$code];
328
                    $inRange
329
                        ? $node->setAttribute('range_code', $escapedCode)
330
                        : $this->addRangeTransition($stateIn, $stateOut, $escapedCode);
331
                    break;
332
                }
333
                $notImplementedMap = [
334
                    0x41 => "Assertion \\A (subject start)",
335
                    0x42 => "Assertion \\B (not a word boundary)",
336
                    0x43 => "Escape \\C (single code unit)",
337
                    0x44 => "Escape \\D (not a decimal digit)",
338
                    0x45 => "Escape \\E (raw sequence end)",
339
                    0x47 => "Assert \\G (first matching position)",
340
                    0x48 => "Escape \\H (not a horizontal whitespace)",
341
                    0x4B => "Escape \\K (reset matching start)",
342
                    0x4E => "Escape \\N (not a newline)",
343
                    0x51 => "Escape \\Q (raw sequence start)",
344
                    0x52 => "Escape \\R (newline)",
345
                    0x53 => "Escape \\S (not a whitespace)",
346
                    0x56 => "Escape \\V (not a vertical whitespace)",
347
                    0x57 => "Escape \\W (not a \"word\" character)",
348
                    0x58 => "Escape \\X (Unicode extended grapheme cluster)",
349
                    0x5A => "Assertion \\Z (subject end or newline before subject end)",
350
                    0x62 => "Assertion \\b (word boundary)",
351
                    0x64 => "Escape \\d (decimal digit)",
352
                    0x67 => "Escape \\g (back-reference)",
353
                    0x68 => "Escape \\h (horizontal whitespace)",
354
                    0x6B => "Escape \\k (named back-reference)",
355
                    0x73 => "Escape \\s (whitespace)",
356
                    0x76 => "Escape \\v (vertical whitespace)",
357
                    0x77 => "Escape \\w (\"word\" character)",
358
                    0x7A => "Escape \\z (subject end)",
359
                ];
360
                if (isset($notImplementedMap[$code])) {
361
                    throw new Exception("{$notImplementedMap[$code]} is not implemented yet");
362
                }
363
                switch ($code) {
364
                    default:
365
                        $inRange
366
                            ? $node->setAttribute('range_code', $code)
367
                            : $this->addRangeTransition($stateIn, $stateOut, $code);
368
                }
369
                break;
370
371
            case NodeType::SYMBOL_CLASS:
372
                if (!$node->getAttribute('not')) {
373
                    break;
374
                }
375
                $invertedRangeSet = $this
376
                    ->getRangeTransition($stateIn, $stateOut)
377
                    ->createSymmetricDifference(
378
                        RangeSet::createUnsafe(
379
                            new Range(0x00, 0x10FFFF)
380
                        )
381
                    );
382
                $this->setRangeTransition($stateIn, $stateOut, $invertedRangeSet);
383
                break;
384
        }
385
    }
386
387
    public function onFinish(): void
388
    {
389
        $addTransitionToLanguage = function (RangeSetInterface $rangeSet, int $stateIn, int $stateOut) {
390
            $this
391
                ->getLanguageBuilder()
392
                ->addTransition($stateIn, $stateOut, ...$rangeSet->getRanges());
393
        };
394
        $this
395
            ->getRangeTransitionMap()
396
            ->onEachTransition($addTransitionToLanguage);
397
    }
398
399
    private function getLanguageBuilder(): LanguageBuilder
400
    {
401
        if (!isset($this->languageBuilder)) {
402
            $this->languageBuilder = LanguageBuilder::forNfa($this->nfa);
403
        }
404
405
        return $this->languageBuilder;
406
    }
407
408
    /**
409
     * @param Node $node
410
     * @return int[]
411
     * @throws Exception
412
     */
413
    private function getNodeStates(Node $node): array
414
    {
415
        $inState = $node->getAttribute('state_in');
416
        $outState = $node->getAttribute('state_out');
417
418
        return [$inState, $outState];
419
    }
420
421
    /**
422
     * @param int $code
423
     * @return int
424
     * @throws Exception
425
     */
426
    private function getControlCode(int $code): int
427
    {
428
        if ($code < 0x20 || $code > 0x7E) {
429
            throw new Exception("Invalid control character: {$code}");
430
        }
431
432
        // Lowercase ASCII letters are converted to uppercase, then bit 6 is inverted.
433
        return ($code < 0x61 || $code > 0x7A ? $code : $code - 0x20) ^ 0x40;
434
    }
435
436
    /**
437
     * @param Node $node
438
     * @param int  $stateIn
439
     * @param int  $stateOut
440
     * @return Symbol
441
     * @throws Exception
442
     */
443
    private function createKleeneStarSymbolFromNode(Node $node, int $stateIn, int $stateOut): Symbol
444
    {
445
        $innerStateIn = $this->createState();
446
        $innerStateOut = $this->createState();
447
        $this
448
            ->addEpsilonTransition($stateIn, $innerStateIn)
449
            ->addEpsilonTransition($innerStateOut, $stateOut)
450
            ->addEpsilonTransition($stateIn, $stateOut)
451
            ->addEpsilonTransition($innerStateOut, $innerStateIn);
452
453
        return $this->createSymbolFromClonedNodeChild($node, $innerStateIn, $innerStateOut);
454
    }
455
456
    /**
457
     * @param Node $node
458
     * @param int  $stateIn
459
     * @param int  $stateOut
460
     * @param int  $index
461
     * @return Symbol
462
     * @throws Exception
463
     */
464
    private function createSymbolFromClonedNodeChild(Node $node, int $stateIn, int $stateOut, int $index = 0): Symbol
465
    {
466
        $nodeClone = $node
467
            ->getChild($index)
468
            ->getClone()
469
            ->setAttribute('state_in', $stateIn)
470
            ->setAttribute('state_out', $stateOut);
471
        $symbol = new Symbol($node, $index);
472
        $symbol->setSymbol($nodeClone);
473
474
        return $symbol;
475
    }
476
477
    /**
478
     * @param Node $node
479
     * @param int  $stateIn
480
     * @param int  $stateOut
481
     * @param int  $index
482
     * @return Symbol
483
     * @throws Exception
484
     */
485
    private function createSymbolFromNodeChild(Node $node, int $stateIn, int $stateOut, int $index = 0): Symbol
486
    {
487
        $node
488
            ->getChild($index)
489
            ->setAttribute('state_in', $stateIn)
490
            ->setAttribute('state_out', $stateOut);
491
492
        return new Symbol($node, $index);
493
    }
494
495
    /**
496
     * @return int
497
     * @throws Exception
498
     */
499
    private function createState(): int
500
    {
501
        return $this
502
            ->nfa
503
            ->getStateMap()
504
            ->createState();
505
    }
506
507
    private function getRangeTransitionMap(): TransitionMap
508
    {
509
        if (!isset($this->rangeTransitionMap)) {
510
            $this->rangeTransitionMap = new TransitionMap($this->nfa->getStateMap());
511
        }
512
513
        return $this->rangeTransitionMap;
514
    }
515
516
    /**
517
     * @param int      $stateIn
518
     * @param int      $stateOut
519
     * @param int      $start
520
     * @param int|null $finish
521
     * @return NfaBuilder
522
     * @throws Exception
523
     */
524
    private function addRangeTransition(int $stateIn, int $stateOut, int $start, int $finish = null): self
525
    {
526
        $this->setRangeTransition(
527
            $stateIn,
528
            $stateOut,
529
            $this
530
                ->getRangeTransition($stateIn, $stateOut)
531
                ->withRanges(new Range($start, $finish))
532
        );
533
534
        return $this;
535
    }
536
537
    /**
538
     * @param int               $stateIn
539
     * @param int               $stateOut
540
     * @param RangeSetInterface $rangeSet
541
     * @return NfaBuilder
542
     * @throws Exception
543
     */
544
    private function setRangeTransition(int $stateIn, int $stateOut, RangeSetInterface $rangeSet): self
545
    {
546
        $this
547
            ->getRangeTransitionMap()
548
            ->replaceTransition($stateIn, $stateOut, $rangeSet);
549
550
        return $this;
551
    }
552
553
    /**
554
     * @param int $stateIn
555
     * @param int $stateOut
556
     * @return RangeSetInterface
557
     * @throws Exception
558
     */
559
    private function getRangeTransition(int $stateIn, int $stateOut): RangeSetInterface
560
    {
561
        $transitionExists = $this
562
            ->getRangeTransitionMap()
563
            ->transitionExists($stateIn, $stateOut);
564
        if ($transitionExists) {
565
            return $this
566
                ->getRangeTransitionMap()
567
                ->getTransition($stateIn, $stateOut);
568
        }
569
        $rangeSet = RangeSet::create();
570
        $this
571
            ->getRangeTransitionMap()
572
            ->addTransition($stateIn, $stateOut, $rangeSet);
573
574
        return $rangeSet;
575
    }
576
577
    /**
578
     * @param int $stateIn
579
     * @param int $stateOut
580
     * @return NfaBuilder
581
     * @throws Exception
582
     */
583
    private function addEpsilonTransition(int $stateIn, int $stateOut): self
584
    {
585
        $this
586
            ->nfa
587
            ->getEpsilonTransitionMap()
588
            ->addTransition($stateIn, $stateOut, true);
589
590
        return $this;
591
    }
592
}
593