1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Yiisoft\Db\Sqlite; |
||
6 | |||
7 | use SplStack; |
||
8 | use Yiisoft\Db\Exception\InvalidArgumentException; |
||
9 | |||
10 | use function is_array; |
||
11 | use function is_string; |
||
12 | use function mb_strlen; |
||
13 | use function mb_strpos; |
||
14 | use function mb_strtoupper; |
||
15 | use function mb_substr; |
||
16 | use function reset; |
||
17 | use function usort; |
||
18 | |||
/**
 * Splits an SQL query into individual SQL tokens.
 *
 * You can use it to obtain additional information from SQL code.
 *
 * Usage example:
 *
 * ```php
 * $tokenizer = new SqlTokenizer("SELECT * FROM {{%user}} WHERE [[id]] = 1");
 * $root = $tokenizer->tokenize();
 * $sqlTokens = $root->getChildren();
 * ```
 *
 * Tokens are instances of {@see SqlToken}.
 */
abstract class AbstractTokenizer
{
    /**
     * @var int SQL code string length.
     */
    protected int $length = 0;

    /**
     * @var int SQL code string current offset.
     */
    protected int $offset = 0;

    /**
     * @var SplStack Of active tokens.
     *
     * @psalm-var SplStack<SqlToken>
     *
     * @psalm-suppress PropertyNotSetInConstructor
     */
    private SplStack $tokenStack;

    /**
     * @var array|SqlToken Active token. It's usually a top of the token stack.
     *
     * @psalm-var SqlToken|SqlToken[]
     *
     * @psalm-suppress PropertyNotSetInConstructor
     */
    private array|SqlToken $currentToken;

    /**
     * @var array Cached substrings, keyed by `offset,length,caseFlag`. Emptied on every {@see advance()}.
     *
     * @psalm-var string[]
     */
    private array $substrings = [];

    /**
     * @var string Buffer for the current token.
     */
    private string $buffer = '';

    /**
     * @param string $sql SQL code to tokenize.
     */
    public function __construct(private string $sql)
    {
    }

    /**
     * Tokenizes and returns a code type token.
     *
     * All internal state (length, offset, substring cache, buffer, token stack) is reset at the start, so the
     * method can be called repeatedly on the same instance.
     *
     * @throws InvalidArgumentException If the SQL code is invalid.
     *
     * @return SqlToken Code type token.
     *
     * @psalm-suppress MixedPropertyTypeCoercion
     */
    public function tokenize(): SqlToken
    {
        $this->length = mb_strlen($this->sql, 'UTF-8');
        $this->offset = 0;
        $this->substrings = [];
        $this->buffer = '';

        // Root token holding the whole SQL code; statements become its children.
        $token = (new SqlToken())->type(SqlToken::TYPE_CODE)->content($this->sql);

        $this->tokenStack = new SplStack();
        $this->tokenStack->push($token);

        // The first statement is opened up front and made the active token.
        $token[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

        $this->tokenStack->push($token[0]);
        $this->currentToken = $this->tokenStack->top();
        $length = 0;

        while (!$this->isEof()) {
            // Whitespace and comments only terminate the buffered token; they produce no token themselves.
            if ($this->isWhitespace($length) || $this->isComment($length)) {
                $this->addTokenFromBuffer();
                $this->advance($length);

                continue;
            }

            /** @psalm-suppress ConflictingReferenceConstraint */
            if ($this->tokenizeOperator($length) || $this->tokenizeDelimitedString($length)) {
                $this->advance($length);

                continue;
            }

            // Anything else is collected one character at a time until a boundary is met.
            $this->buffer .= $this->substring(1);
            $this->advance(1);
        }

        $this->addTokenFromBuffer();

        // Drop the child at index -1 (presumably the last one) when it has no children of its own,
        // e.g. the empty statement opened after a trailing ";".
        if (
            $token->getHasChildren() &&
            $token[-1] instanceof SqlToken &&
            !$token[-1]->getHasChildren()
        ) {
            unset($token[-1]);
        }

        return $token;
    }

    /**
     * Returns whether there's a space or blank at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     *
     * @param int $length Length of the matched string.
     *
     * @return bool Whether there's a space or blank at the current offset.
     */
    abstract protected function isWhitespace(int &$length): bool;

    /**
     * Returns whether there's a commentary at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     *
     * @param int $length Length of the matched string.
     *
     * @return bool Whether there's a commentary at the current offset.
     */
    abstract protected function isComment(int &$length): bool;

    /**
     * Returns whether there's an operator at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an operator at the current offset.
     */
    abstract protected function isOperator(int &$length, string|null &$content): bool;

    /**
     * Returns whether there's an identifier at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an identifier at the current offset.
     */
    abstract protected function isIdentifier(int &$length, string|null &$content): bool;

    /**
     * Returns whether there's a string literal at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's a string literal at the current offset.
     */
    abstract protected function isStringLiteral(int &$length, string|null &$content): bool;

    /**
     * Returns whether the given string is a keyword.
     *
     * The method may set `$content` to a string that will be used as a token content.
     *
     * @param string $string String to match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether the given string is a keyword.
     */
    abstract protected function isKeyword(string $string, string|null &$content): bool;

    /**
     * Returns whether the longest common prefix equals to the SQL code of the same length at the current offset.
     *
     * @param array $with Strings to test. The array is received by value, so the lookup map built from it is local
     * to this call and the caller's array is left untouched. If the first element is itself an array, `$with` is
     * treated as an already built `length => [string => true]` map and used as is.
     * @param bool $caseSensitive Whether to perform a case-sensitive comparison.
     * @param int $length Length of the matched string. Only written when there is a match.
     * @param string|null $content Matched string. It's overwritten with every candidate substring that is tried,
     * so its value is meaningful only when the method returns `true`.
     *
     * @return bool Whether there is a match.
     *
     * @psalm-param array<array-key, string> $with
     */
    protected function startsWithAnyLongest(
        array $with,
        bool $caseSensitive,
        int &$length,
        string|null &$content = null
    ): bool {
        if ($with === []) {
            return false;
        }

        if (!is_array(reset($with))) {
            // Longest candidates first: the map below keeps this insertion order, so the longest match wins.
            usort(
                $with,
                static fn (string $string1, string $string2): int =>
                    mb_strlen($string2, 'UTF-8') <=> mb_strlen($string1, 'UTF-8')
            );

            $map = [];

            foreach ($with as $string) {
                $map[mb_strlen($string, 'UTF-8')][$caseSensitive ? $string : mb_strtoupper($string, 'UTF-8')] = true;
            }

            $with = $map;
        }

        /** @psalm-var array<int, array> $with */
        foreach ($with as $testLength => $testValues) {
            $content = $this->substring($testLength, $caseSensitive);

            if (isset($testValues[$content])) {
                $length = $testLength;
                return true;
            }
        }

        return false;
    }

    /**
     * Returns a string of the given length starting with the specified offset.
     *
     * @param int $length String length to return.
     * @param bool $caseSensitive If it's `false`, the string will be uppercase.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return string Result string. It's empty if the requested range runs past the end of the SQL code.
     */
    protected function substring(int $length, bool $caseSensitive = true, int|null $offset = null): string
    {
        $offset ??= $this->offset;

        if ($offset + $length > $this->length) {
            return '';
        }

        $cacheKey = $offset . ',' . $length;

        // The ",1" entry is the original substring, the ",0" entry is its uppercase form.
        if (!isset($this->substrings[$cacheKey . ',1'])) {
            $this->substrings[$cacheKey . ',1'] = mb_substr($this->sql, $offset, $length, 'UTF-8');
        }

        if (!$caseSensitive && !isset($this->substrings[$cacheKey . ',0'])) {
            $this->substrings[$cacheKey . ',0'] = mb_strtoupper($this->substrings[$cacheKey . ',1'], 'UTF-8');
        }

        return $this->substrings[$cacheKey . ',' . (int) $caseSensitive];
    }

    /**
     * Returns an index after the given string in the SQL code starting with the specified offset.
     *
     * @param string $string String to find.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return int Index after the given string or end of string index.
     */
    protected function indexAfter(string $string, int|null $offset = null): int
    {
        $offset ??= $this->offset;

        $stringLength = mb_strlen($string, 'UTF-8');

        // The string cannot fit into the remaining SQL code.
        if ($offset + $stringLength > $this->length) {
            return $this->length;
        }

        $position = mb_strpos($this->sql, $string, $offset, 'UTF-8');

        return $position === false ? $this->length : $position + $stringLength;
    }

    /**
     * Determines whether there is a delimited string at the current offset and adds it to the token children.
     *
     * An identifier is checked first; a string literal is only tested when no identifier matched.
     *
     * @param int $length Length of the matched string, set by the matcher that succeeded.
     *
     * @return bool Whether an identifier or string literal token was added.
     */
    private function tokenizeDelimitedString(int &$length): bool
    {
        $content = null;
        $isIdentifier = $this->isIdentifier($length, $content);
        $isStringLiteral = !$isIdentifier && $this->isStringLiteral($length, $content);

        if (!$isIdentifier && !$isStringLiteral) {
            return false;
        }

        // Whatever was buffered before the delimited string is a token of its own.
        $this->addTokenFromBuffer();

        $this->currentToken[] = (new SqlToken())
            ->type($isIdentifier ? SqlToken::TYPE_IDENTIFIER : SqlToken::TYPE_STRING_LITERAL)
            ->content(is_string($content) ? $content : $this->substring($length))
            ->startOffset($this->offset)
            ->endOffset($this->offset + $length);

        return true;
    }

    /**
     * Determines whether there is an operator at the current offset and adds it to the token children.
     *
     * Besides adding the operator token, `(` opens a nested parenthesis token, `)` closes the current one and `;`
     * finishes the current statement and opens a new one.
     *
     * @param int $length Length of the matched string, set by {@see isOperator()}.
     *
     * @return bool Whether an operator was found at the current offset.
     */
    private function tokenizeOperator(int &$length): bool
    {
        $content = null;

        if (!$this->isOperator($length, $content)) {
            return false;
        }

        $this->addTokenFromBuffer();

        $operator = $this->substring($length);
        $tokenContent = is_string($content) ? $content : $operator;

        switch ($operator) {
            case '(':
                $this->currentToken[] = (new SqlToken())
                    ->type(SqlToken::TYPE_OPERATOR)
                    ->content($tokenContent)
                    ->startOffset($this->offset)
                    ->endOffset($this->offset + $length);
                $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_PARENTHESIS);

                // The parenthesis token just added becomes the active token.
                if ($this->currentToken[-1] !== null) {
                    $this->tokenStack->push($this->currentToken[-1]);
                }

                $this->currentToken = $this->tokenStack->top();

                break;

            case ')':
                // NOTE(review): there is no guard against unbalanced ")" here; SplStack::pop() and
                // SplStack::top() throw \RuntimeException once the stack is exhausted.
                $this->tokenStack->pop();
                $this->currentToken = $this->tokenStack->top();
                $this->currentToken[] = (new SqlToken())
                    ->type(SqlToken::TYPE_OPERATOR)
                    ->content(')')
                    ->startOffset($this->offset)
                    ->endOffset($this->offset + $length);

                break;

            case ';':
                // A ";" with nothing before it in the current token is skipped, no empty statement is closed.
                if ($this->currentToken instanceof SqlToken && !$this->currentToken->getHasChildren()) {
                    break;
                }

                $this->currentToken[] = (new SqlToken())
                    ->type(SqlToken::TYPE_OPERATOR)
                    ->content($tokenContent)
                    ->startOffset($this->offset)
                    ->endOffset($this->offset + $length);

                // Leave the finished statement and open a fresh one on its parent.
                $this->tokenStack->pop();
                $this->currentToken = $this->tokenStack->top();
                $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

                if ($this->currentToken[-1] instanceof SqlToken) {
                    $this->tokenStack->push($this->currentToken[-1]);
                }

                $this->currentToken = $this->tokenStack->top();

                break;

            default:
                $this->currentToken[] = (new SqlToken())
                    ->type(SqlToken::TYPE_OPERATOR)
                    ->content($tokenContent)
                    ->startOffset($this->offset)
                    ->endOffset($this->offset + $length);

                break;
        }

        return true;
    }

    /**
     * Determines a type of text in the buffer, tokenizes it and adds it to the token children.
     *
     * Does nothing when the buffer is empty. The buffer is cleared afterwards.
     */
    private function addTokenFromBuffer(): void
    {
        if ($this->buffer === '') {
            return;
        }

        $content = null;
        $isKeyword = $this->isKeyword($this->buffer, $content);

        // The buffer ends right at the current offset, so its start is the offset minus its length.
        $this->currentToken[] = (new SqlToken())
            ->type($isKeyword ? SqlToken::TYPE_KEYWORD : SqlToken::TYPE_TOKEN)
            ->content(is_string($content) ? $content : $this->buffer)
            ->startOffset($this->offset - mb_strlen($this->buffer, 'UTF-8'))
            ->endOffset($this->offset);

        $this->buffer = '';
    }

    /**
     * Adds the specified length to the current offset.
     *
     * The substring cache is emptied as well.
     *
     * @param int $length Number of characters to move forward.
     *
     * @throws InvalidArgumentException If the length is less than or equal to 0.
     */
    private function advance(int $length): void
    {
        if ($length <= 0) {
            throw new InvalidArgumentException('Length must be greater than 0.');
        }

        $this->offset += $length;
        $this->substrings = [];
    }

    /**
     * Returns whether the SQL code is completely traversed.
     */
    private function isEof(): bool
    {
        return $this->offset >= $this->length;
    }
}
||
458 |
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.