Passed
Push — master ( c01dfe...f39ce2 )
by Edward
04:04
created

Utf8TokenMatcher   A

Complexity

Total Complexity 36

Size/Duplication

Total Lines 211
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 163
dl 0
loc 211
rs 9.52
c 1
b 0
f 0
wmc 36

1 Method

Rating   Name   Duplication   Size   Complexity  
F match() 0 208 36
1
<?php
2
3
/**
4
 * Unicode UTF-8 token matcher.
5
 *
6
 * Auto-generated file, please don't edit manually.
7
 * Generated by UniLex.
8
 */
9
10
declare(strict_types=1);
11
12
namespace Remorhaz\UniLex\Unicode\Grammar;
13
14
use Remorhaz\UniLex\IO\CharBufferInterface;
15
use Remorhaz\UniLex\Lexer\TokenFactoryInterface;
16
use Remorhaz\UniLex\Lexer\TokenMatcherTemplate;
17
18
/**
 * Goto-based state machine that reads one UTF-8 encoded symbol (1 to 6 bytes,
 * i.e. including the legacy 5- and 6-byte forms) from a character buffer and
 * produces a single token for it.
 *
 * NOTE(review): the file header states this file is generated by UniLex, so
 * any change made here presumably also belongs in the generator's input.
 */
class Utf8TokenMatcher extends TokenMatcherTemplate
{
    /**
     * Matches the next byte sequence in the buffer.
     *
     * On a well-formed sequence a TokenType::SYMBOL token is set, with the
     * decoded value stored in TokenAttribute::UNICODE_CHAR. On a malformed
     * sequence the buffer is advanced by one more symbol and a
     * TokenType::INVALID_BYTES token is set.
     *
     * @param CharBufferInterface   $buffer       Source of byte values.
     * @param TokenFactoryInterface $tokenFactory Passed to createContext().
     *
     * @return bool True when a token (SYMBOL or INVALID_BYTES) was set; false
     *              when the error state is reached with the buffer at its end.
     */
    public function match(CharBufferInterface $buffer, TokenFactoryInterface $tokenFactory): bool
    {
        $context = $this->createContext($buffer, $tokenFactory);
        // Register every pattern this matcher can recognise; the strings are
        // reused below as keys for allowRegExps() and the switch in state2.
        $context->setRegExps(
            '[\\x00-\\x7F]',
            '[\\xC0-\\xDF][\\x80-\\xBF]',
            '[\\xE0-\\xEF][\\x80-\\xBF]{2}',
            '[\\xF0-\\xF7][\\x80-\\xBF]{3}',
            '[\\xF8-\\xFB][\\x80-\\xBF]{4}',
            '[\\xFC-\\xFD][\\x80-\\xBF]{5}'
        );
        goto state1;

        // state1: inspect the lead byte; its range decides how many
        // continuation bytes must follow (state3..state7 consume one each).
        state1:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x00 <= $char && $char <= 0x7F) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\x00-\\x7F]');
            goto state2;
        }
        if (0xC0 <= $char && $char <= 0xDF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xC0-\\xDF][\\x80-\\xBF]');
            goto state3;
        }
        if (0xE0 <= $char && $char <= 0xEF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xE0-\\xEF][\\x80-\\xBF]{2}');
            goto state4;
        }
        if (0xF0 <= $char && $char <= 0xF7) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xF0-\\xF7][\\x80-\\xBF]{3}');
            goto state5;
        }
        if (0xF8 <= $char && $char <= 0xFB) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xF8-\\xFB][\\x80-\\xBF]{4}');
            goto state6;
        }
        if (0xFC == $char || 0xFD == $char) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xFC-\\xFD][\\x80-\\xBF]{5}');
            goto state7;
        }
        // Bytes 0x80-0xBF, 0xFE and 0xFF cannot start a sequence.
        goto error;

        // state2: accepting state. getRegExp() presumably yields the single
        // pattern still allowed (verify against the context class); the bytes
        // are taken from the tail of getSymbolList() and combined by
        // stripping the marker bits and shifting six bits per trailing byte.
        state2:
        switch ($context->getRegExp()) {
            case '[\\x00-\\x7F]':
                // Single byte: $char still holds the byte read in state1.
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $char);

                return true;

            case '[\\xC0-\\xDF][\\x80-\\xBF]':
                $charList = array_slice($context->getSymbolList(), -2);
                $symbol = ($charList[0] & 0x1F) << 6;
                $symbol |= ($charList[1] & 0x3F);
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $symbol);

                return true;

            case '[\\xE0-\\xEF][\\x80-\\xBF]{2}':
                $charList = array_slice($context->getSymbolList(), -3);
                $symbol = ($charList[0] & 0x0F) << 12;
                $symbol |= ($charList[1] & 0x3F) << 6;
                $symbol |= ($charList[2] & 0x3F);
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $symbol);

                return true;

            case '[\\xF0-\\xF7][\\x80-\\xBF]{3}':
                $charList = array_slice($context->getSymbolList(), -4);
                $symbol = ($charList[0] & 0x07) << 18;
                $symbol |= ($charList[1] & 0x3F) << 12;
                $symbol |= ($charList[2] & 0x3F) << 6;
                $symbol |= ($charList[3] & 0x3F);
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $symbol);

                return true;

            case '[\\xF8-\\xFB][\\x80-\\xBF]{4}':
                $charList = array_slice($context->getSymbolList(), -5);
                $symbol = ($charList[0] & 0x03) << 24;
                $symbol |= ($charList[1] & 0x3F) << 18;
                $symbol |= ($charList[2] & 0x3F) << 12;
                $symbol |= ($charList[3] & 0x3F) << 6;
                $symbol |= ($charList[4] & 0x3F);
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $symbol);

                return true;

            case '[\\xFC-\\xFD][\\x80-\\xBF]{5}':
                $charList = array_slice($context->getSymbolList(), -6);
                $symbol = ($charList[0] & 0x01) << 30;
                // Every trailing byte carries six payload bits, so the mask
                // is 0x3F; a narrower mask would drop bits 26-29.
                $symbol |= ($charList[1] & 0x3F) << 24;
                $symbol |= ($charList[2] & 0x3F) << 18;
                $symbol |= ($charList[3] & 0x3F) << 12;
                $symbol |= ($charList[4] & 0x3F) << 6;
                $symbol |= ($charList[5] & 0x3F);
                $context
                    ->setNewToken(TokenType::SYMBOL)
                    ->setTokenAttribute(TokenAttribute::UNICODE_CHAR, $symbol);

                return true;

            default:
                goto error;
        }

        // state3: exactly one trailing byte (0x80-0xBF) is still expected.
        state3:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x80 <= $char && $char <= 0xBF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps(
                '[\\xC0-\\xDF][\\x80-\\xBF]',
                '[\\xE0-\\xEF][\\x80-\\xBF]{2}',
                '[\\xF0-\\xF7][\\x80-\\xBF]{3}',
                '[\\xF8-\\xFB][\\x80-\\xBF]{4}',
                '[\\xFC-\\xFD][\\x80-\\xBF]{5}'
            );
            goto state2;
        }
        goto error;

        // state4: two trailing bytes are still expected.
        state4:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x80 <= $char && $char <= 0xBF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps(
                '[\\xE0-\\xEF][\\x80-\\xBF]{2}',
                '[\\xF0-\\xF7][\\x80-\\xBF]{3}',
                '[\\xF8-\\xFB][\\x80-\\xBF]{4}',
                '[\\xFC-\\xFD][\\x80-\\xBF]{5}'
            );
            goto state3;
        }
        goto error;

        // state5: three trailing bytes are still expected.
        state5:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x80 <= $char && $char <= 0xBF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps(
                '[\\xF0-\\xF7][\\x80-\\xBF]{3}',
                '[\\xF8-\\xFB][\\x80-\\xBF]{4}',
                '[\\xFC-\\xFD][\\x80-\\xBF]{5}'
            );
            goto state4;
        }
        goto error;

        // state6: four trailing bytes are still expected.
        state6:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x80 <= $char && $char <= 0xBF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps(
                '[\\xF8-\\xFB][\\x80-\\xBF]{4}',
                '[\\xFC-\\xFD][\\x80-\\xBF]{5}'
            );
            goto state5;
        }
        goto error;

        // state7: five trailing bytes are still expected.
        state7:
        if ($context->getBuffer()->isEnd()) {
            goto error;
        }
        $char = $context->getBuffer()->getSymbol();
        if (0x80 <= $char && $char <= 0xBF) {
            $context->getBuffer()->nextSymbol();
            $context->allowRegExps('[\\xFC-\\xFD][\\x80-\\xBF]{5}');
            goto state6;
        }
        goto error;

        // error: nothing matched. With the buffer at its end no token is
        // produced; otherwise the offending symbol is stepped over and an
        // INVALID_BYTES token is set.
        error:
        if ($context->getBuffer()->isEnd()) {
            return false;
        }
        $context->getBuffer()->nextSymbol();
        $context->setNewToken(TokenType::INVALID_BYTES);
        return true;
    }
}
231