Passed
Push — master ( fd9a54...1b375d )
by Alexander
06:02 queued 01:51
created

src/AbstractTokenizer.php (1 issue)

Labels
Severity
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Yiisoft\Db\Sqlite;
6
7
use SplStack;
8
use Yiisoft\Db\Exception\InvalidArgumentException;
9
10
use function is_array;
11
use function is_string;
12
use function mb_strlen;
13
use function mb_strpos;
14
use function mb_strtoupper;
15
use function mb_substr;
16
use function reset;
17
use function usort;
18
19
/**
20
 * Splits an SQL query into individual SQL tokens.
21
 *
22
 * You can use it to obtain additional information from SQL code.
23
 *
24
 * Usage example:
25
 *
26
 * ```php
27
 * $tokenizer = new SqlTokenizer("SELECT * FROM {{%user}} WHERE [[id]] = 1");
28
 * $root = $tokenizer->tokenize();
29
 * $sqlTokens = $root->getChildren();
30
 * ```
31
 *
32
 * Tokens are instances of {@see SqlToken}.
33
 */
34
abstract class AbstractTokenizer
35
{
36
    /**
37
     * @var int SQL code string length.
38
     */
39
    protected int $length = 0;
40
41
    /**
42
     * @var int SQL code string current offset.
43
     */
44
    protected int $offset = 0;
45
46
    /**
47
     * @var SplStack Of active tokens.
48
     *
49
     * @psalm-var SplStack<SqlToken>
50
     *
51
     * @psalm-suppress PropertyNotSetInConstructor
52
     */
53
    private SplStack $tokenStack;
54
55
    /**
56
     * @var array|SqlToken Active token. It's usually a top of the token stack.
57
     *
58
     * @psalm-var SqlToken|SqlToken[]
59
     *
60
     * @psalm-suppress PropertyNotSetInConstructor
61
     */
62
    private array|SqlToken $currentToken;
63
64
    /**
65
     * @var array Cached substrings.
66
     *
67
     * @psalm-var string[]
68
     */
69
    private array $substrings = [];
70
71
    /**
72
     * @var string Buffer for the current token.
73
     */
74
    private string $buffer = '';
75
76 157
    /**
     * @param string $sql SQL code that {@see tokenize()} splits into tokens.
     */
    public function __construct(
        private string $sql
    ) {
    }
79
80
    /**
     * Tokenizes the SQL code passed to the constructor and returns a code type token.
     *
     * The length, offset, substring cache and buffer are reset on entry. The returned root token gets a statement
     * token as its first child, and a new statement token is started after each `;` operator. A trailing statement
     * token without children is removed before returning.
     *
     * @throws InvalidArgumentException If the SQL code is invalid, e.g. when a matcher reports a match whose length
     * isn't greater than 0.
     *
     * @return SqlToken Code type token.
     *
     * @psalm-suppress MixedPropertyTypeCoercion
     */
    public function tokenize(): SqlToken
    {
        // Offsets and lengths are measured in UTF-8 characters, not bytes.
        $this->length = mb_strlen($this->sql, 'UTF-8');
        $this->offset = 0;
        $this->substrings = [];
        $this->buffer = '';

        $token = (new SqlToken())->type(SqlToken::TYPE_CODE)->content($this->sql);

        $this->tokenStack = new SplStack();
        $this->tokenStack->push($token);

        // The first statement token; every following token is attached to it until a `;` is met.
        $token[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

        $this->tokenStack->push($token[0]);
        /** @psalm-var SqlToken */
        $this->currentToken = $this->tokenStack->top();
        $length = 0;

        while (!$this->isEof()) {
            // Whitespace and comments end the buffered word and are skipped without producing a token.
            if ($this->isWhitespace($length) || $this->isComment($length)) {
                $this->addTokenFromBuffer();
                $this->advance($length);

                continue;
            }

            /** @psalm-suppress ConflictingReferenceConstraint */
            if ($this->tokenizeOperator($length) || $this->tokenizeDelimitedString($length)) {
                $this->advance($length);

                continue;
            }

            // Anything else is collected character by character until a delimiter is met.
            $this->buffer .= $this->substring(1);
            $this->advance(1);
        }

        $this->addTokenFromBuffer();

        if ($token->getHasChildren()) {
            // Read the last child only once so that the `instanceof` check below also covers the method call.
            $lastChild = $token[-1];

            if ($lastChild instanceof SqlToken && !$lastChild->getHasChildren()) {
                unset($token[-1]);
            }
        }

        return $token;
    }
139
140
    /**
     * Returns whether there's a space or blank at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     * {@see tokenize()} then flushes the buffered text as a token and advances the offset by `$length` without
     * creating a token for the matched string itself.
     *
     * @param int $length Length of the matched string, in UTF-8 characters. It must be greater than 0 on a match.
     *
     * @return bool Whether there's a space or blank at the current offset.
     */
149
    abstract protected function isWhitespace(int &$length): bool;
150
151
    /**
     * Returns whether there's a commentary at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     * {@see tokenize()} only calls it when {@see isWhitespace()} returned `false`, and on a match it skips the
     * commentary without creating a token for it.
     *
     * @param int $length Length of the matched string, in UTF-8 characters. It must be greater than 0 on a match.
     *
     * @return bool Whether there's a commentary at the current offset.
     */
160
    abstract protected function isComment(int &$length): bool;
161
162
    /**
     * Returns whether there's an operator at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * The special handling of `(`, `)` and `;` is chosen by the matched substring of the SQL code, not by `$content`.
     * For `)` the token content is always `)`, so `$content` is ignored in that case.
     *
     * @param int $length Length of the matched string, in UTF-8 characters. It must be greater than 0 on a match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an operator at the current offset.
     */
173
    abstract protected function isOperator(int &$length, string|null &$content): bool;
174
175
    /**
     * Returns whether there's an identifier at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * It's checked before {@see isStringLiteral()}, and both receive the same `$content` variable, so an
     * implementation shouldn't leave a value in `$content` when it returns `false`.
     *
     * @param int $length Length of the matched string, in UTF-8 characters. It must be greater than 0 on a match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an identifier at the current offset.
     */
186
    abstract protected function isIdentifier(int &$length, string|null &$content): bool;
187
188
    /**
     * Returns whether there's a string literal at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * It's only called when {@see isIdentifier()} returned `false` for the same offset.
     *
     * @param int $length Length of the matched string, in UTF-8 characters. It must be greater than 0 on a match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's a string literal at the current offset.
     */
199
    abstract protected function isStringLiteral(int &$length, string|null &$content): bool;
200
201
    /**
     * Returns whether the given string is a keyword.
     *
     * The method may set `$content` to a string that will be used as a token content.
     *
     * It's called with the text buffered between delimiters. The resulting token gets the keyword type when the
     * method returns `true` and the generic token type otherwise.
     *
     * @param string $string String to match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether the given string is a keyword.
     */
211
    abstract protected function isKeyword(string $string, string|null &$content): bool;
212
213
    /**
     * Returns whether the SQL code at the current offset starts with one of the given strings, testing the longest
     * strings first.
     *
     * @param array $with Strings to test. An already prepared lookup map in the form of
     * `[length => [string => true]]` is accepted as well and is used as is. The array is passed by value, so the
     * lookup map built from a plain list of strings is local to the call.
     * @param bool $caseSensitive Whether to perform a case-sensitive comparison. If it's `false`, both the given
     * strings and the SQL code are compared in upper case.
     * @param int $length Length of the matched string. It's set only when there is a match.
     * @param string|null $content Matched string (in upper case for a case-insensitive comparison). It's set only
     * when there is a match.
     *
     * @return bool Whether there is a match.
     *
     * @psalm-param array<array-key, string> $with
     */
    protected function startsWithAnyLongest(
        array $with,
        bool $caseSensitive,
        int &$length,
        string|null &$content = null
    ): bool {
        if ($with === []) {
            return false;
        }

        if (!is_array(reset($with))) {
            // Longest strings go first, so the map below is iterated from the longest length to the shortest one.
            usort(
                $with,
                static fn (string $string1, string $string2): int =>
                    mb_strlen($string2, 'UTF-8') <=> mb_strlen($string1, 'UTF-8')
            );

            $map = [];

            foreach ($with as $string) {
                $key = $caseSensitive ? $string : mb_strtoupper($string, 'UTF-8');
                $map[mb_strlen($string, 'UTF-8')][$key] = true;
            }

            $with = $map;
        }

        /** @psalm-var array<int, array> $with */
        foreach ($with as $testLength => $testValues) {
            // Use a local candidate so the caller's `$content` isn't overwritten by a string that doesn't match.
            $candidate = $this->substring($testLength, $caseSensitive);

            if (isset($testValues[$candidate])) {
                $length = $testLength;
                $content = $candidate;

                return true;
            }
        }

        return false;
    }
259
260
    /**
     * Returns a string of the given length starting with the specified offset.
     *
     * Results are cached per offset, length and case mode. The cache is cleared each time the current offset is
     * advanced.
     *
     * @param int $length String length to return, in UTF-8 characters.
     * @param bool $caseSensitive If it's `false`, the string will be uppercase.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return string Result string, it's empty if the requested range goes beyond the end of the SQL code.
     */
    protected function substring(int $length, bool $caseSensitive = true, int|null $offset = null): string
    {
        $offset ??= $this->offset;

        if ($offset + $length > $this->length) {
            return '';
        }

        $cacheKey = $offset . ',' . $length;
        $originalKey = $cacheKey . ',1';

        $this->substrings[$originalKey] ??= mb_substr($this->sql, $offset, $length, 'UTF-8');

        if ($caseSensitive) {
            return $this->substrings[$originalKey];
        }

        // The uppercase variant is derived from the cached original and cached under its own key.
        return $this->substrings[$cacheKey . ',0'] ??= mb_strtoupper($this->substrings[$originalKey], 'UTF-8');
    }
291
292
    /**
     * Returns an index after the given string in the SQL code starting with the specified offset.
     *
     * @param string $string String to find.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return int Index after the given string, or the SQL code length if the string isn't found or can't fit
     * between the offset and the end of the SQL code.
     */
    protected function indexAfter(string $string, int|null $offset = null): int
    {
        $offset ??= $this->offset;
        $stringLength = mb_strlen($string, 'UTF-8');

        // The string can't fit into the rest of the SQL code, so there is no need to search.
        if ($offset + $stringLength > $this->length) {
            return $this->length;
        }

        $position = mb_strpos($this->sql, $string, $offset, 'UTF-8');

        return $position === false ? $this->length : $position + $stringLength;
    }
320
321
    /**
     * Determines whether there is a delimited string at the current offset and adds it to the token children.
     *
     * An identifier is checked first; a string literal is checked only when there's no identifier.
     *
     * @param int $length Length of the matched string. It's set by the matcher that reported the match.
     *
     * @return bool Whether an identifier or a string literal token was added.
     */
    private function tokenizeDelimitedString(int &$length): bool
    {
        if ($this->isIdentifier($length, $content)) {
            $type = SqlToken::TYPE_IDENTIFIER;
        } elseif ($this->isStringLiteral($length, $content)) {
            $type = SqlToken::TYPE_STRING_LITERAL;
        } else {
            return false;
        }

        // Text collected before the delimited string becomes a token of its own.
        $this->addTokenFromBuffer();

        $startOffset = $this->offset;
        $text = is_string($content) ? $content : $this->substring($length);

        $this->currentToken[] = (new SqlToken())
            ->type($type)
            ->content($text)
            ->startOffset($startOffset)
            ->endOffset($startOffset + $length);

        return true;
    }
343
344
    /**
     * Determines whether there is an operator at the current offset and adds it to the token children.
     *
     * Besides adding the operator token, `(` opens a parenthesis token that becomes the current token, `)` returns to
     * the previous token of the stack, and `;` closes the current statement and opens a new one.
     *
     * @param int $length Length of the matched operator. It's set by {@see isOperator()}.
     *
     * @return bool Whether there is an operator at the current offset.
     */
    private function tokenizeOperator(int &$length): bool
    {
        if (!$this->isOperator($length, $content)) {
            return false;
        }

        // Text collected before the operator becomes a token of its own.
        $this->addTokenFromBuffer();

        $symbol = $this->substring($length);
        $startOffset = $this->offset;
        $endOffset = $startOffset + $length;
        $text = is_string($content) ? $content : $symbol;

        $createOperator = static fn (string $operatorContent) => (new SqlToken())
            ->type(SqlToken::TYPE_OPERATOR)
            ->content($operatorContent)
            ->startOffset($startOffset)
            ->endOffset($endOffset);

        if ($symbol === '(') {
            $this->currentToken[] = $createOperator($text);
            $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_PARENTHESIS);

            // The just added parenthesis token collects everything up to the matching `)`.
            if ($this->currentToken[-1] !== null) {
                $this->tokenStack->push($this->currentToken[-1]);
            }

            $this->currentToken = $this->tokenStack->top();

            return true;
        }

        if ($symbol === ')') {
            $this->tokenStack->pop();
            $this->currentToken = $this->tokenStack->top();
            // The closing operator always has `)` as its content.
            $this->currentToken[] = $createOperator(')');

            return true;
        }

        if ($symbol === ';') {
            // A `;` met while the current token has no children yet is skipped without adding a token.
            if ($this->currentToken instanceof SqlToken && !$this->currentToken->getHasChildren()) {
                return true;
            }

            $this->currentToken[] = $createOperator($text);

            $this->tokenStack->pop();
            $this->currentToken = $this->tokenStack->top();
            $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

            if ($this->currentToken[-1] instanceof SqlToken) {
                $this->tokenStack->push($this->currentToken[-1]);
            }

            $this->currentToken = $this->tokenStack->top();

            return true;
        }

        $this->currentToken[] = $createOperator($text);

        return true;
    }
415
416
    /**
     * Determines a type of text in the buffer, tokenizes it and adds it to the token children.
     *
     * Nothing happens when the buffer is empty. Otherwise the buffer is emptied after the token is added.
     */
    private function addTokenFromBuffer(): void
    {
        $text = $this->buffer;

        if ($text === '') {
            return;
        }

        $type = $this->isKeyword($text, $content) ? SqlToken::TYPE_KEYWORD : SqlToken::TYPE_TOKEN;

        // The buffered text ends right before the current offset.
        $endOffset = $this->offset;

        $this->currentToken[] = (new SqlToken())
            ->type($type)
            ->content(is_string($content) ? $content : $text)
            ->startOffset($endOffset - mb_strlen($text, 'UTF-8'))
            ->endOffset($endOffset);

        $this->buffer = '';
    }
435
436
    /**
     * Adds the specified length to the current offset and clears the substring cache.
     *
     * @param int $length Number of characters to move forward.
     *
     * @throws InvalidArgumentException If the length is less than or equal to 0.
     */
    private function advance(int $length): void
    {
        if ($length > 0) {
            $this->offset += $length;
            $this->substrings = [];

            return;
        }

        throw new InvalidArgumentException('Length must be greater than 0.');
    }
450
451
    /**
     * Returns whether the SQL code is completely traversed.
     *
     * @return bool `true` if the current offset has reached the SQL code length.
     */
    private function isEof(): bool
    {
        return $this->length <= $this->offset;
    }
458
}
459