Passed
Push — master ( 2dfc84...94e87c )
by Edward
05:17
created

NfaBuilder::onBeginProduction()   F

Complexity

Conditions 32
Paths 53

Size

Total Lines 132
Code Lines 105

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
cc 32
eloc 105
c 2
b 1
f 0
nc 53
nop 2
dl 0
loc 132
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace Remorhaz\UniLex\RegExp\FSM;
6
7
use Remorhaz\UniLex\AST\AbstractTranslatorListener;
8
use Remorhaz\UniLex\AST\Node;
9
use Remorhaz\UniLex\AST\Symbol;
10
use Remorhaz\UniLex\AST\Translator;
11
use Remorhaz\UniLex\AST\Tree;
12
use Remorhaz\UniLex\IO\CharBufferInterface;
13
use Remorhaz\UniLex\Exception;
14
use Remorhaz\UniLex\RegExp\AST\NodeType;
15
use Remorhaz\UniLex\RegExp\ParserFactory;
16
use Remorhaz\UniLex\RegExp\PropertyLoader;
17
use Remorhaz\UniLex\RegExp\PropertyLoaderInterface;
18
use Remorhaz\UniLex\Stack\PushInterface;
19
20
class NfaBuilder extends AbstractTranslatorListener
21
{
22
23
    private $nfa;
24
25
    private $propertyLoader;
26
27
    private $rangeTransitionMap;
28
29
    private $languageBuilder;
30
31
    private $startState;
32
33
    private $rangeSetCalc;
34
35
    public function __construct(Nfa $nfa, PropertyLoaderInterface $propertyLoader)
36
    {
37
        $this->nfa = $nfa;
38
        $this->propertyLoader = $propertyLoader;
39
        $this->rangeSetCalc = new RangeSetCalc();
40
    }
41
42
    /**
43
     * @param CharBufferInterface $buffer
44
     * @return Nfa
45
     * @throws Exception
46
     */
47
    public static function fromBuffer(CharBufferInterface $buffer): Nfa
48
    {
49
        $tree = new Tree();
50
        ParserFactory::createFromBuffer($tree, $buffer)->run();
51
52
        return self::fromTree($tree);
53
    }
54
55
    /**
56
     * @param Tree $tree
57
     * @return Nfa
58
     * @throws Exception
59
     */
60
    public static function fromTree(Tree $tree): Nfa
61
    {
62
        $nfa = new Nfa();
63
        (new Translator($tree, new self($nfa, PropertyLoader::create())))->run();
64
65
        return $nfa;
66
    }
67
68
    /**
69
     * @param int $state
70
     * @throws Exception
71
     */
72
    public function setStartState(int $state): void
73
    {
74
        if (isset($this->startState)) {
75
            throw new Exception("Start state is already set");
76
        }
77
        $this->startState = $state;
78
    }
79
80
    /**
81
     * @param Node $node
82
     * @throws Exception
83
     */
84
    public function onStart(Node $node): void
85
    {
86
        $stateIn = $this->getStartState();
87
        $node->setAttribute('state_in', $stateIn);
88
        $stateOut = $this->createState();
89
        $this
90
            ->nfa
91
            ->getStateMap()
92
            ->addFinishState($stateOut);
93
        $node->setAttribute('state_out', $stateOut);
94
        $node->setAttribute('in_range', false);
95
    }
96
97
    /**
98
     * @return int
99
     * @throws Exception
100
     */
101
    private function getStartState(): int
102
    {
103
        if (!isset($this->startState)) {
104
            $this->startState = $this->createState();
105
            $this
106
                ->nfa
107
                ->getStateMap()
108
                ->setStartState($this->startState);
109
        }
110
111
        return $this->startState;
112
    }
113
114
    /**
115
     * @param Node          $node
116
     * @param PushInterface $stack
117
     * @throws Exception
118
     */
119
    public function onBeginProduction(Node $node, PushInterface $stack): void
120
    {
121
        switch ($node->getName()) {
122
            case NodeType::ASSERT:
123
                throw new Exception("AST nodes of type '{$node->getName()}' are not supported yet");
124
                break;
125
126
            case NodeType::EMPTY:
127
            case NodeType::SYMBOL:
128
            case NodeType::SYMBOL_ANY:
129
            case NodeType::SYMBOL_CTL:
130
            case NodeType::ESC_SIMPLE:
131
                if (!empty($node->getChildList())) {
132
                    throw new Exception("AST node '{$node->getName()}' should not have child nodes");
133
                }
134
                break;
135
136
            case NodeType::SYMBOL_PROP:
137
                $propName = $node->getStringAttribute('name');
138
                $propRangeSet = $this->propertyLoader->getPropertyRangeSet($propName);
139
                $rangeSet = $node->getAttribute('not')
140
                    ? $this
141
                        ->rangeSetCalc
142
                        ->xor($propRangeSet, RangeSet::import([0x00, 0x10FFFF]))
143
                    : $propRangeSet;
144
                [$stateIn, $stateOut] = $this->getNodeStates($node);
145
                $this->setRangeTransition($stateIn, $stateOut, $rangeSet);
146
                break;
147
148
            case NodeType::SYMBOL_CLASS:
149
                if (empty($node->getChildList())) {
150
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
151
                }
152
                [$stateIn, $stateOut] = $this->getNodeStates($node);
153
                $symbolList = [];
154
                foreach ($node->getChildIndexList() as $index) {
155
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
156
                }
157
                $stack->push(...$symbolList);
158
                break;
159
160
            case NodeType::SYMBOL_RANGE:
161
                if (count($node->getChildList()) != 2) {
162
                    throw new Exception("AST node '{$node->getName()}' should have exactly two child nodes");
163
                }
164
                [$stateIn, $stateOut] = $this->getNodeStates($node);
165
                $symbolList = [];
166
                foreach ($node->getChildIndexList() as $index) {
167
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
168
                }
169
                $stack->push(...$symbolList);
170
                break;
171
172
            case NodeType::REPEAT:
173
                if (count($node->getChildList()) != 1) {
174
                    throw new Exception("AST node '{$node->getName()}' should have exactly one child node");
175
                }
176
                $min = $node->getAttribute('min');
177
                $symbolList = [];
178
                $stateOut = null;
179
                // Prefix concatenation construction
180
                for ($index = 0; $index < $min; $index++) {
181
                    $stateIn = $stateOut ?? $node->getIntAttribute('state_in');
182
                    $isLastNode =
183
                        !$node->getAttribute('is_max_infinite') &&
184
                        $index + 1 == $node->getAttribute('max');
185
                    $stateOut = $isLastNode
186
                        ? $node->getIntAttribute('state_out')
187
                        : $this->createState();
188
                    $symbolList[] = $this->createSymbolFromClonedNodeChild($node, $stateIn, $stateOut);
189
                }
190
                if ($node->getAttribute('is_max_infinite')) {
191
                    // Postfix Kleene star construction
192
                    $stateIn = $stateOut ?? $node->getAttribute('state_in');
193
                    $stateOut = $node->getAttribute('state_out');
194
                    $symbolList[] = $this->createKleeneStarSymbolFromNode($node, $stateIn, $stateOut);
195
                    $stack->push(...$symbolList);
196
                    break;
197
                }
198
                $max = $node->getAttribute('max');
199
                if ($min > $max) {
200
                    $message = "AST node '{$node->getName()}' has invalid attributes: min({$min}) > max({$max})";
201
                    throw new Exception($message);
202
                }
203
                // Postfix optional concatenation construction
204
                for ($index = $min; $index < $max; $index++) {
205
                    $stateIn = $stateOut ?? $node->getAttribute('state_in');
206
                    $stateOut = $index == $max - 1
207
                        ? $node->getAttribute('state_out')
208
                        : $this->createState();
209
                    $optStateOut = $node->getAttribute('state_out');
210
                    $this->addEpsilonTransition($stateIn, $optStateOut);
211
                    $symbolList[] = $this->createSymbolFromClonedNodeChild($node, $stateIn, $stateOut);
212
                }
213
                $stack->push(...$symbolList);
214
                break;
215
216
            case NodeType::CONCATENATE:
217
                if (empty($node->getChildList())) {
218
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
219
                }
220
                $symbolList = [];
221
                $stateOut = null;
222
                $maxIndex = count($node->getChildList()) - 1;
223
                foreach ($node->getChildIndexList() as $index) {
224
                    $stateIn = $stateOut ?? $node->getIntAttribute('state_in');
225
                    $stateOut = $index == $maxIndex
226
                        ? $node->getIntAttribute('state_out')
227
                        : $this->createState();
228
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
229
                }
230
                $stack->push(...$symbolList);
231
                break;
232
233
            case NodeType::ALTERNATIVE:
234
                if (empty($node->getChildList())) {
235
                    throw new Exception("AST node '{$node->getName()}' should have child nodes");
236
                }
237
                $symbolList = [];
238
                [$headerStateIn, $headerStateOut] = $this->getNodeStates($node);
239
                foreach ($node->getChildIndexList() as $index) {
240
                    $stateIn = $this->createState();
241
                    $stateOut = $this->createState();
242
                    $this->addEpsilonTransition($headerStateIn, $stateIn);
243
                    $this->addEpsilonTransition($stateOut, $headerStateOut);
244
                    $symbolList[] = $this->createSymbolFromNodeChild($node, $stateIn, $stateOut, $index);
245
                }
246
                $stack->push(...$symbolList);
247
                break;
248
249
            default:
250
                throw new Exception("Unknown AST node name: {$node->getName()}");
251
        }
252
    }
253
254
    /**
255
     * @param Symbol        $symbol
256
     * @param PushInterface $stack
257
     * @throws Exception
258
     */
259
    public function onSymbol(Symbol $symbol, PushInterface $stack): void
260
    {
261
        $inRange = $symbol->getHeader()->getName() == NodeType::SYMBOL_RANGE
262
            ? true
263
            : $symbol->getHeader()->getAttribute('in_range');
264
        $symbol
265
            ->getSymbol()
266
            ->setAttribute('in_range', $inRange);
267
        $stack->push($symbol->getSymbol());
268
    }
269
270
    /**
271
     * @param Node $node
272
     * @throws Exception
273
     */
274
    public function onFinishProduction(Node $node): void
275
    {
276
        [$stateIn, $stateOut] = $this->getNodeStates($node);
277
        $inRange = $node->getAttribute('in_range');
278
        switch ($node->getName()) {
279
            case NodeType::SYMBOL_RANGE:
280
                $startChar = $node->getChild(0)->getAttribute('range_code');
281
                $finishChar = $node->getChild(1)->getAttribute('range_code');
282
                if ($startChar > $finishChar) {
283
                    throw new Exception("Invalid range: start char is greater than finish char");
284
                }
285
                $this->addRangeTransition($stateIn, $stateOut, $startChar, $finishChar);
286
                break;
287
288
            case NodeType::SYMBOL:
289
                $code = $node->getAttribute('code');
290
                $inRange
291
                    ? $node->setAttribute('range_code', $code)
292
                    : $this->addRangeTransition($stateIn, $stateOut, $code);
293
                break;
294
295
            case NodeType::EMPTY:
296
                if ($inRange) {
297
                    throw new Exception("Invalid range component: no matching chars");
298
                }
299
                $this->addEpsilonTransition($stateIn, $stateOut);
300
                break;
301
302
            case NodeType::SYMBOL_ANY:
303
                if ($inRange) {
304
                    throw new Exception("Invalid range component: any char is matching");
305
                }
306
                $this->addRangeTransition($stateIn, $stateOut, 0x00, 0x10FFFF);
307
                break;
308
309
            case NodeType::SYMBOL_CTL:
310
                $code = $node->getAttribute('code');
311
                $controlCode = $this->getControlCode($code);
312
                $inRange
313
                    ? $node->setAttribute('range_code', $controlCode)
314
                    : $this->addRangeTransition($stateIn, $stateOut, $controlCode);
315
                break;
316
317
            case NodeType::ESC_SIMPLE:
318
                $code = $node->getAttribute('code');
319
                $singleCharMap = [
320
                    0x61 => 0x07, // \a: alert/BEL
321
                    0x65 => 0x1B, // \e: escape/ESC
322
                    0x66 => 0x0C, // \f: form feed/FF
323
                    0x6E => 0x0A, // \n: line feed/LF
324
                    0x72 => 0x0D, // \r: carriage return/CR
325
                    0x74 => 0x09, // \t: tab/HT
326
                ];
327
                if (isset($singleCharMap[$code])) {
328
                    $escapedCode = $singleCharMap[$code];
329
                    $inRange
330
                        ? $node->setAttribute('range_code', $escapedCode)
331
                        : $this->addRangeTransition($stateIn, $stateOut, $escapedCode);
332
                    break;
333
                }
334
                $notImplementedMap = [
335
                    0x41 => "Assertion \\A (subject start)",
336
                    0x42 => "Assertion \\B (not a word boundary)",
337
                    0x43 => "Escape \\C (single code unit)",
338
                    0x44 => "Escape \\D (not a decimal digit)",
339
                    0x45 => "Escape \\E (raw sequence end)",
340
                    0x47 => "Assert \\G (first matching position)",
341
                    0x48 => "Escape \\H (not a horizontal whitespace)",
342
                    0x4B => "Escape \\K (reset matching start)",
343
                    0x4E => "Escape \\N (not a newline)",
344
                    0x51 => "Escape \\Q (raw sequence start)",
345
                    0x52 => "Escape \\R (newline)",
346
                    0x53 => "Escape \\S (not a whitespace)",
347
                    0x56 => "Escape \\V (not a vertical whitespace)",
348
                    0x57 => "Escape \\W (not a \"word\" character)",
349
                    0x58 => "Escape \\X (Unicode extended grapheme cluster)",
350
                    0x5A => "Assertion \\Z (subject end or newline before subject end)",
351
                    0x62 => "Assertion \\b (word boundary)",
352
                    0x64 => "Escape \\d (decimal digit)",
353
                    0x67 => "Escape \\g (back-reference)",
354
                    0x68 => "Escape \\h (horizontal whitespace)",
355
                    0x6B => "Escape \\k (named back-reference)",
356
                    0x73 => "Escape \\s (whitespace)",
357
                    0x76 => "Escape \\v (vertical whitespace)",
358
                    0x77 => "Escape \\w (\"word\" character)",
359
                    0x7A => "Escape \\z (subject end)",
360
                ];
361
                if (isset($notImplementedMap[$code])) {
362
                    throw new Exception("{$notImplementedMap[$code]} is not implemented yet");
363
                }
364
                switch ($code) {
365
                    default:
366
                        $inRange
367
                            ? $node->setAttribute('range_code', $code)
368
                            : $this->addRangeTransition($stateIn, $stateOut, $code);
369
                }
370
                break;
371
372
            case NodeType::SYMBOL_CLASS:
373
                if (!$node->getAttribute('not')) {
374
                    break;
375
                }
376
                $invertedRangeSet = $this
377
                    ->rangeSetCalc
378
                    ->xor($this->getRangeTransition($stateIn, $stateOut), RangeSet::import([0x00, 0x10FFFF]));
379
                $this->setRangeTransition($stateIn, $stateOut, $invertedRangeSet);
380
                break;
381
        }
382
    }
383
384
    public function onFinish(): void
385
    {
386
        $addTransitionToLanguage = function (RangeSet $rangeSet, int $stateIn, int $stateOut) {
387
            $this
388
                ->getLanguageBuilder()
389
                ->addTransition($stateIn, $stateOut, ...$rangeSet->getRanges());
390
        };
391
        $this
392
            ->getRangeTransitionMap()
393
            ->onEachTransition($addTransitionToLanguage);
394
    }
395
396
    private function getLanguageBuilder(): LanguageBuilder
397
    {
398
        if (!isset($this->languageBuilder)) {
399
            $this->languageBuilder = LanguageBuilder::forNfa($this->nfa);
400
        }
401
402
        return $this->languageBuilder;
403
    }
404
405
    /**
406
     * @param Node $node
407
     * @return int[]
408
     * @throws Exception
409
     */
410
    private function getNodeStates(Node $node): array
411
    {
412
        $inState = $node->getAttribute('state_in');
413
        $outState = $node->getAttribute('state_out');
414
415
        return [$inState, $outState];
416
    }
417
418
    /**
419
     * @param int $code
420
     * @return int
421
     * @throws Exception
422
     */
423
    private function getControlCode(int $code): int
424
    {
425
        if ($code < 0x20 || $code > 0x7E) {
426
            throw new Exception("Invalid control character: {$code}");
427
        }
428
429
        // Lowercase ASCII letters are converted to uppercase, then bit 6 is inverted.
430
        return ($code < 0x61 || $code > 0x7A ? $code : $code - 0x20) ^ 0x40;
431
    }
432
433
    /**
434
     * @param Node $node
435
     * @param int  $stateIn
436
     * @param int  $stateOut
437
     * @return Symbol
438
     * @throws Exception
439
     */
440
    private function createKleeneStarSymbolFromNode(Node $node, int $stateIn, int $stateOut): Symbol
441
    {
442
        $innerStateIn = $this->createState();
443
        $innerStateOut = $this->createState();
444
        $this
445
            ->addEpsilonTransition($stateIn, $innerStateIn)
446
            ->addEpsilonTransition($innerStateOut, $stateOut)
447
            ->addEpsilonTransition($stateIn, $stateOut)
448
            ->addEpsilonTransition($innerStateOut, $innerStateIn);
449
450
        return $this->createSymbolFromClonedNodeChild($node, $innerStateIn, $innerStateOut);
451
    }
452
453
    /**
454
     * @param Node $node
455
     * @param int  $stateIn
456
     * @param int  $stateOut
457
     * @param int  $index
458
     * @return Symbol
459
     * @throws Exception
460
     */
461
    private function createSymbolFromClonedNodeChild(Node $node, int $stateIn, int $stateOut, int $index = 0): Symbol
462
    {
463
        $nodeClone = $node
464
            ->getChild($index)
465
            ->getClone()
466
            ->setAttribute('state_in', $stateIn)
467
            ->setAttribute('state_out', $stateOut);
468
        $symbol = new Symbol($node, $index);
469
        $symbol->setSymbol($nodeClone);
470
471
        return $symbol;
472
    }
473
474
    /**
475
     * @param Node $node
476
     * @param int  $stateIn
477
     * @param int  $stateOut
478
     * @param int  $index
479
     * @return Symbol
480
     * @throws Exception
481
     */
482
    private function createSymbolFromNodeChild(Node $node, int $stateIn, int $stateOut, int $index = 0): Symbol
483
    {
484
        $node
485
            ->getChild($index)
486
            ->setAttribute('state_in', $stateIn)
487
            ->setAttribute('state_out', $stateOut);
488
489
        return new Symbol($node, $index);
490
    }
491
492
    /**
493
     * @return int
494
     * @throws Exception
495
     */
496
    private function createState(): int
497
    {
498
        return $this
499
            ->nfa
500
            ->getStateMap()
501
            ->createState();
502
    }
503
504
    private function getRangeTransitionMap(): TransitionMap
505
    {
506
        if (!isset($this->rangeTransitionMap)) {
507
            $this->rangeTransitionMap = new TransitionMap($this->nfa->getStateMap());
508
        }
509
510
        return $this->rangeTransitionMap;
511
    }
512
513
    /**
514
     * @param int      $stateIn
515
     * @param int      $stateOut
516
     * @param int      $start
517
     * @param int|null $finish
518
     * @return NfaBuilder
519
     * @throws Exception
520
     */
521
    private function addRangeTransition(int $stateIn, int $stateOut, int $start, int $finish = null): self
522
    {
523
        $this
524
            ->getRangeTransition($stateIn, $stateOut)
525
            ->addRange(new Range($start, $finish ?? $start));
526
527
        return $this;
528
    }
529
530
    /**
531
     * @param int      $stateIn
532
     * @param int      $stateOut
533
     * @param RangeSet $rangeSet
534
     * @return NfaBuilder
535
     * @throws Exception
536
     */
537
    private function setRangeTransition(int $stateIn, int $stateOut, RangeSet $rangeSet): self
538
    {
539
        $this
540
            ->getRangeTransitionMap()
541
            ->replaceTransition($stateIn, $stateOut, $rangeSet);
542
543
        return $this;
544
    }
545
546
    /**
547
     * @param int $stateIn
548
     * @param int $stateOut
549
     * @return RangeSet
550
     * @throws Exception
551
     */
552
    private function getRangeTransition(int $stateIn, int $stateOut): RangeSet
553
    {
554
        $transitionExists = $this
555
            ->getRangeTransitionMap()
556
            ->transitionExists($stateIn, $stateOut);
557
        if ($transitionExists) {
558
            return $this
559
                ->getRangeTransitionMap()
560
                ->getTransition($stateIn, $stateOut);
561
        }
562
        $rangeSet = new RangeSet();
563
        $this
564
            ->getRangeTransitionMap()
565
            ->addTransition($stateIn, $stateOut, $rangeSet);
566
567
        return $rangeSet;
568
    }
569
570
    /**
571
     * @param int $stateIn
572
     * @param int $stateOut
573
     * @return NfaBuilder
574
     * @throws Exception
575
     */
576
    private function addEpsilonTransition(int $stateIn, int $stateOut): self
577
    {
578
        $this
579
            ->nfa
580
            ->getEpsilonTransitionMap()
581
            ->addTransition($stateIn, $stateOut, true);
582
583
        return $this;
584
    }
585
}
586