Completed
Pull Request — master (#1184)
by Alexey
14:33
created

Lexer::validateInputInUnicodeMode()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 3
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 5
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
/**
6
 * Hoa
7
 *
8
 *
9
 *
10
 *
11
 * BSD 3-Clause License
12
 *
13
 * Copyright © 2007-2017, Hoa community. All rights reserved.
14
 *
15
 * Redistribution and use in source and binary forms, with or without
16
 * modification, are permitted provided that the following conditions are met:
17
 *
18
 * 1. Redistributions of source code must retain the above copyright notice, this
19
 *    list of conditions and the following disclaimer.
20
 *
21
 * 2. Redistributions in binary form must reproduce the above copyright notice,
22
 *    this list of conditions and the following disclaimer in the documentation
23
 *    and/or other materials provided with the distribution.
24
 *
25
 * 3. Neither the name of the copyright holder nor the names of its
26
 *    contributors may be used to endorse or promote products derived from
27
 *    this software without specific prior written permission.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
33
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
35
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
37
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39
 */
40
41
namespace JMS\Serializer\Type\Compiler\Llk;
42
43
use JMS\Serializer\Type\Compiler;
44
use JMS\Serializer\Type\Compiler\Exception\UnrecognizedToken;
45
46
/**
 * Class \JMS\Serializer\Type\Compiler\Llk\Lexer.
 *
 * Lexical analyser: splits a string into an ordered sequence of lexemes
 * (tokens). Tokens are described by regular expressions grouped by namespace;
 * a token may switch the lexer to another namespace, or go back to a previous
 * one with the special `__shift__` target.
 */
final class Lexer
{
    /**
     * Name of the namespace the lexer is currently in.
     *
     * @var string|null
     */
    protected $_lexerState  = null;

    /**
     * Text being tokenized.
     *
     * @var string|null
     */
    protected $_text        = null;

    /**
     * Tokens, indexed by namespace then by lexeme name. Once lexMe() has
     * normalised them, every entry is a `[regex, next namespace|null]` pair.
     *
     * @var array
     */
    protected $_tokens      = [];

    /**
     * Stack of previously visited namespaces. Only allocated when at least
     * one token targets `__shift__`.
     *
     * @var \SplStack|null
     */
    protected $_nsStack     = null;

    /**
     * PCRE options appended after the closing delimiter of every regex.
     *
     * Kept as a string (never null) so it can be passed safely to string
     * functions under strict types.
     *
     * @var string
     */
    protected $_pcreOptions = '';

    /**
     * @param array $pragmas Pragmas. Unicode mode (PCRE `u` option) is enabled
     *                       unless `lexer.unicode` is set to something other
     *                       than `true`.
     */
    public function __construct(array $pragmas = [])
    {
        if (!isset($pragmas['lexer.unicode']) || true === $pragmas['lexer.unicode']) {
            $this->_pcreOptions .= 'u';
        }
    }

    /**
     * Text tokenizer: splits the text in parameter in an ordered sequence of
     * tokens, terminated by a synthetic `EOF` token.
     *
     * Being a generator, nothing (including input validation) runs before the
     * first iteration.
     *
     * @param string  $text   Text to tokenize.
     * @param array[] $tokens Token definitions: namespace => [lexeme => regex].
     *                        A lexeme written `name:namespace` switches the
     *                        lexer to `namespace` after being matched.
     *
     * @return \Generator|array[]
     *
     * @throws UnrecognizedToken
     *
     * @psalm-return \Generator<int, array{token: string, value: string, length: int|false, namespace: string, keep: true, offset: int}>
     */
    public function lexMe(string $text, array $tokens): \Generator
    {
        $this->validateInputInUnicodeMode($text);

        $this->_text       = $text;
        $this->_tokens     = $tokens;
        $this->_nsStack    = null;
        $this->_lexerState = 'default';
        $offset            = 0;
        $maxOffset         = strlen($this->_text);
        $stack             = false;

        // Normalise every namespace into lexeme => [regex, next namespace|null].
        // Iterating by value with an explicit write-back avoids leaving a
        // dangling reference alive inside the generator.
        foreach ($this->_tokens as $namespaceName => $lexemes) {
            $normalized = [];

            foreach ($lexemes as $fullLexeme => $regex) {
                if (false === strpos($fullLexeme, ':')) {
                    $normalized[$fullLexeme] = [$regex, null];

                    continue;
                }

                [$lexeme, $namespace] = explode(':', $fullLexeme, 2);

                // Must stay a real boolean: a bitwise `|=` would turn it into
                // an int and the strict comparison below would never succeed.
                $stack = $stack || '__shift__' === substr($namespace, 0, 9);

                $normalized[$lexeme] = [$regex, $namespace];
            }

            $this->_tokens[$namespaceName] = $normalized;
        }

        if (true === $stack) {
            $this->_nsStack = new \SplStack();
        }

        while ($offset < $maxOffset) {
            $nextToken = $this->nextToken($offset);

            if (null === $nextToken) {
                throw new Compiler\Exception\UnrecognizedToken(
                    'Unrecognized token "%s" at line 1 and column %d:' .
                    "\n" . '%s' . "\n" .
                    str_repeat(' ', mb_strlen(substr($text, 0, $offset))) . '↑',
                    0,
                    [
                        mb_substr(substr($text, $offset), 0, 1),
                        $offset + 1,
                        $text,
                    ],
                    1,
                    $offset
                );
            }

            if (true === $nextToken['keep']) {
                $nextToken['offset'] = $offset;

                yield $nextToken;
            }

            // Offsets are byte offsets, hence strlen() and not mb_strlen().
            $offset += strlen($nextToken['value']);
        }

        yield [
            'token'     => 'EOF',
            'value'     => 'EOF',
            'length'    => 0,
            'namespace' => 'default',
            'keep'      => true,
            'offset'    => $offset,
        ];
    }

    /**
     * Compute the next token recognized at the given offset, trying the
     * lexemes of the current namespace in declaration order, and apply the
     * namespace transition attached to the matched lexeme (if any).
     *
     * @param int $offset Byte offset in the text.
     *
     * @return (array|bool|int|string)[]|array[]|null Null when no lexeme matches.
     *
     * @throws \JMS\Serializer\Type\Compiler\Exception\Lexer
     * @throws \JMS\Serializer\Type\Compiler\Exception\InternalError
     *
     * @psalm-return array{token: string, value: string, length: int|false, namespace: string, keep: bool}|null
     */
    protected function nextToken(int $offset)
    {
        // Read by value: a reference here would silently create a null entry
        // for an unknown namespace.
        $tokenArray = $this->_tokens[$this->_lexerState] ?? [];

        foreach ($tokenArray as $lexeme => $bucket) {
            [$regex, $nextState] = $bucket;

            if (null === $nextState) {
                $nextState = $this->_lexerState;
            }

            $out = $this->matchLexeme($lexeme, $regex, $offset);

            if (null === $out) {
                continue;
            }

            $out['namespace'] = $this->_lexerState;
            // Tokens named "skip" are consumed but never yielded.
            $out['keep']      = 'skip' !== $lexeme;

            if ($nextState !== $this->_lexerState) {
                $shift = false;

                // `__shift__` (optionally `__shift__ * N`) goes back N
                // namespaces instead of entering a new one.
                if (null !== $this->_nsStack &&
                    0 !== preg_match('#^__shift__(?:\s*\*\s*(\d+))?$#', $nextState, $matches)) {
                    $i = isset($matches[1]) ? intval($matches[1]) : 1;

                    if ($i > ($c = count($this->_nsStack))) {
                        throw new Compiler\Exception\Lexer(
                            'Cannot shift namespace %d-times, from token ' .
                            '%s in namespace %s, because the stack ' .
                            'contains only %d namespaces.',
                            1,
                            [
                                $i,
                                $lexeme,
                                $this->_lexerState,
                                $c,
                            ]
                        );
                    }

                    $previousNamespace = null;

                    while (1 <= $i--) {
                        $previousNamespace = $this->_nsStack->pop();
                    }

                    $nextState = $previousNamespace;
                    $shift     = true;
                }

                if (!isset($this->_tokens[$nextState])) {
                    throw new Compiler\Exception\Lexer(
                        'Namespace %s does not exist, called by token %s ' .
                        'in namespace %s.',
                        2,
                        [
                            $nextState,
                            $lexeme,
                            $this->_lexerState,
                        ]
                    );
                }

                // Remember where we come from, unless we are going back.
                if (null !== $this->_nsStack && false === $shift) {
                    $this->_nsStack->push($this->_lexerState);
                }

                $this->_lexerState = $nextState;
            }

            return $out;
        }

        return null;
    }

    /**
     * Check if a given lexeme is matched at the given offset of the text.
     *
     * @param string $lexeme Name of the lexeme.
     * @param string $regex  Regular expression describing the lexeme.
     * @param int    $offset Byte offset in the text.
     *
     * @return (int|false|string)[]|null Null when the lexeme does not match.
     *
     * @throws \JMS\Serializer\Type\Compiler\Exception\Lexer         When the lexeme matches an empty string.
     * @throws \JMS\Serializer\Type\Compiler\Exception\InternalError When PCRE fails.
     *
     * @psalm-return array{token: string, value: string, length: int|false}|null
     */
    protected function matchLexeme(string $lexeme, string $regex, int $offset)
    {
        // `#` is the delimiter used below, so it has to be escaped in the body.
        $_regex = str_replace('#', '\#', $regex);

        // `\G` anchors the match at $offset. Warnings are silenced on purpose:
        // a PCRE failure is reported through the exception below instead.
        $preg = @preg_match(
            '#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
            $this->_text,
            $matches,
            0,
            $offset
        );

        if (0 === $preg) {
            return null;
        }

        if (false === $preg) {
            throw new Compiler\Exception\InternalError(
                'Lexer encountered a PCRE error on a lexeme "%s", full regex: "%s". Please report this issue to the maintainers.',
                preg_last_error(),
                [$lexeme, $_regex]
            );
        }

        // An empty match would never advance the offset (infinite loop).
        if ('' === $matches[0]) {
            throw new Compiler\Exception\Lexer(
                'A lexeme must not match an empty value, which is the ' .
                'case of "%s" (%s).',
                3,
                [$lexeme, $regex]
            );
        }

        return [
            'token'  => $lexeme,
            'value'  => $matches[0],
            'length' => mb_strlen($matches[0]),
        ];
    }

    /**
     * Reject text that is not valid UTF-8 when the lexer runs in unicode mode.
     *
     * @param string $text Text about to be tokenized.
     *
     * @throws \JMS\Serializer\Type\Compiler\Exception\Lexer
     */
    private function validateInputInUnicodeMode(string $text): void
    {
        // With the `u` option, preg_match() returns false on malformed UTF-8.
        if (false !== strpos($this->_pcreOptions, 'u') && false === preg_match('##u', $text)) {
            throw new Compiler\Exception\Lexer(
                'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
            );
        }
    }
}
339