1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Yiisoft\Db\Sqlite; |
||
6 | |||
7 | use SplStack; |
||
8 | use Yiisoft\Db\Exception\InvalidArgumentException; |
||
9 | |||
10 | use function is_array; |
||
11 | use function is_string; |
||
12 | use function mb_strlen; |
||
13 | use function mb_strpos; |
||
14 | use function mb_strtoupper; |
||
15 | use function mb_substr; |
||
16 | use function reset; |
||
17 | use function usort; |
||
18 | |||
/**
 * Splits an SQL query into individual SQL tokens.
 *
 * You can use it to obtain additional information from SQL code.
 *
 * Usage example:
 *
 * ```php
 * $tokenizer = new SqlTokenizer("SELECT * FROM {{%user}} WHERE [[id]] = 1");
 * $root = $tokenizer->tokenize();
 * $sqlTokens = $root->getChildren();
 * ```
 *
 * Tokens are instances of {@see SqlToken}.
 */
abstract class AbstractTokenizer
{
    /**
     * @var int SQL code string length.
     */
    protected int $length = 0;

    /**
     * @var int SQL code string current offset.
     */
    protected int $offset = 0;

    /**
     * @var SplStack Of active tokens.
     *
     * @psalm-var SplStack<SqlToken>
     *
     * @psalm-suppress PropertyNotSetInConstructor
     */
    private SplStack $tokenStack;

    /**
     * @var array|SqlToken Active token. It's usually a top of the token stack.
     *
     * @psalm-var SqlToken|SqlToken[]
     *
     * @psalm-suppress PropertyNotSetInConstructor
     */
    private array|SqlToken $currentToken;

    /**
     * @var array Cached substrings, keyed by `"<offset>,<length>,<1|0>"` where the last part is `1` for the
     * original-case value and `0` for the upper-cased one. The cache is emptied on every {@see advance()}.
     *
     * @psalm-var string[]
     */
    private array $substrings = [];

    /**
     * @var string Buffer for the current token.
     */
    private string $buffer = '';

    /**
     * @param string $sql SQL code to tokenize.
     */
    public function __construct(private string $sql)
    {
    }

    /**
     * Tokenizes and returns a code type token.
     *
     * The returned root token has type {@see SqlToken::TYPE_CODE}; its children are statement tokens that in turn
     * receive the keyword, operator, identifier, string literal and parenthesis tokens found in the SQL code.
     *
     * @throws InvalidArgumentException If the SQL code is invalid.
     *
     * @return SqlToken Code type token.
     *
     * @psalm-suppress MixedPropertyTypeCoercion
     */
    public function tokenize(): SqlToken
    {
        // Reset the scanning state so that the method can be called more than once.
        $this->length = mb_strlen($this->sql, 'UTF-8');
        $this->offset = 0;
        $this->substrings = [];
        $this->buffer = '';

        $token = (new SqlToken())->type(SqlToken::TYPE_CODE)->content($this->sql);

        $this->tokenStack = new SplStack();
        $this->tokenStack->push($token);

        // The root token always starts with one (initially empty) statement.
        $token[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

        $this->tokenStack->push($token[0]);
        /** @psalm-var SqlToken */
        $this->currentToken = $this->tokenStack->top();
        $length = 0;

        while (!$this->isEof()) {
            // Whitespace and comments terminate the buffered word but produce no token themselves.
            if ($this->isWhitespace($length) || $this->isComment($length)) {
                $this->addTokenFromBuffer();
                $this->advance($length);

                continue;
            }

            /** @psalm-suppress ConflictingReferenceConstraint */
            if ($this->tokenizeOperator($length) || $this->tokenizeDelimitedString($length)) {
                $this->advance($length);

                continue;
            }

            // Anything else is accumulated one character at a time until a boundary is reached.
            $this->buffer .= $this->substring(1);
            $this->advance(1);
        }

        $this->addTokenFromBuffer();

        // Drop a trailing child that has no children of its own (e.g. the empty statement opened after a final `;`).
        if (
            $token->getHasChildren() &&
            $token[-1] instanceof SqlToken &&
            !$token[-1]->getHasChildren()
        ) {
            unset($token[-1]);
        }

        return $token;
    }

    /**
     * Returns whether there's a space or blank at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     *
     * @param int $length Length of the matched string.
     *
     * @return bool Whether there's a space or blank at the current offset.
     */
    abstract protected function isWhitespace(int &$length): bool;

    /**
     * Returns whether there's a commentary at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
     *
     * @param int $length Length of the matched string.
     *
     * @return bool Whether there's a commentary at the current offset.
     */
    abstract protected function isComment(int &$length): bool;

    /**
     * Returns whether there's an operator at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an operator at the current offset.
     */
    abstract protected function isOperator(int &$length, string|null &$content): bool;

    /**
     * Returns whether there's an identifier at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's an identifier at the current offset.
     */
    abstract protected function isIdentifier(int &$length, string|null &$content): bool;

    /**
     * Returns whether there's a string literal at the current offset.
     *
     * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string. It may
     * also set `$content` to a string that will be used as a token content.
     *
     * @param int $length Length of the matched string.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether there's a string literal at the current offset.
     */
    abstract protected function isStringLiteral(int &$length, string|null &$content): bool;

    /**
     * Returns whether the given string is a keyword.
     *
     * The method may set `$content` to a string that will be used as a token content.
     *
     * @param string $string String to match.
     * @param string|null $content Optional content instead of the matched string.
     *
     * @return bool Whether the given string is a keyword.
     */
    abstract protected function isKeyword(string $string, string|null &$content): bool;

    /**
     * Returns whether the longest of the given strings equals the SQL code of the same length at the current offset.
     *
     * Candidates are grouped by length and tested from the longest to the shortest; the first hit wins.
     *
     * @param array $with Strings to test. The array is received by value, so the length-indexed lookup map built
     * from it is local to this call and the caller's array is left untouched. If the first element is itself an
     * array, `$with` is used as is, i.e. it is treated as an already built `[length => [string => true]]` map.
     * @param bool $caseSensitive Whether to perform a case-sensitive comparison. When `false`, both the candidates
     * and the SQL code are upper-cased before comparing.
     * @param int $length Length of the matched string. Only written when there is a match.
     * @param string|null $content Matched string. It is overwritten with every tested substring, so after a miss it
     * holds the last substring that was tested.
     *
     * @return bool Whether there is a match.
     *
     * @psalm-param array<array-key, string> $with
     */
    protected function startsWithAnyLongest(
        array $with,
        bool $caseSensitive,
        int &$length,
        string|null &$content = null
    ): bool {
        if ($with === []) {
            return false;
        }

        if (!is_array(reset($with))) {
            // Longest candidates first, so that the map below is iterated from the longest length downwards.
            usort(
                $with,
                static fn (string $string1, string $string2): int =>
                    mb_strlen($string2, 'UTF-8') <=> mb_strlen($string1, 'UTF-8')
            );

            $map = [];

            foreach ($with as $string) {
                $key = $caseSensitive ? $string : mb_strtoupper($string, 'UTF-8');
                $map[mb_strlen($string, 'UTF-8')][$key] = true;
            }

            $with = $map;
        }

        /** @psalm-var array<int, array> $with */
        foreach ($with as $testLength => $testValues) {
            $content = $this->substring($testLength, $caseSensitive);

            if (isset($testValues[$content])) {
                $length = $testLength;
                return true;
            }
        }

        return false;
    }

    /**
     * Returns a string of the given length starting with the specified offset.
     *
     * Results are cached until the current offset is advanced.
     *
     * @param int $length String length to return.
     * @param bool $caseSensitive If it's `false`, the string will be uppercase.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return string Result string. It is empty when the requested range runs past the end of the SQL code.
     */
    protected function substring(int $length, bool $caseSensitive = true, int|null $offset = null): string
    {
        $offset ??= $this->offset;

        if ($offset + $length > $this->length) {
            return '';
        }

        $cacheKey = $offset . ',' . $length;

        // The original-case value is always cached first: the upper-cased variant is derived from it.
        if (!isset($this->substrings[$cacheKey . ',1'])) {
            $this->substrings[$cacheKey . ',1'] = mb_substr($this->sql, $offset, $length, 'UTF-8');
        }

        if (!$caseSensitive && !isset($this->substrings[$cacheKey . ',0'])) {
            $this->substrings[$cacheKey . ',0'] = mb_strtoupper($this->substrings[$cacheKey . ',1'], 'UTF-8');
        }

        return $this->substrings[$cacheKey . ',' . (int) $caseSensitive];
    }

    /**
     * Returns an index after the given string in the SQL code starting with the specified offset.
     *
     * @param string $string String to find.
     * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
     *
     * @return int Index after the given string or end of string index.
     */
    protected function indexAfter(string $string, int|null $offset = null): int
    {
        $offset ??= $this->offset;
        $needleLength = mb_strlen($string, 'UTF-8');

        // The needle cannot fit into what is left of the SQL code.
        if ($offset + $needleLength > $this->length) {
            return $this->length;
        }

        $position = mb_strpos($this->sql, $string, $offset, 'UTF-8');

        return $position === false ? $this->length : $position + $needleLength;
    }

    /**
     * Determines whether there is a delimited string at the current offset and adds it to the token children.
     *
     * An identifier is tried first; a string literal is only tried when no identifier matched.
     *
     * @param int $length Receives the length of the matched string.
     *
     * @return bool Whether an identifier or string literal token has been added.
     */
    private function tokenizeDelimitedString(int &$length): bool
    {
        $content = null;
        $isIdentifier = $this->isIdentifier($length, $content);
        $isStringLiteral = !$isIdentifier && $this->isStringLiteral($length, $content);

        if (!$isIdentifier && !$isStringLiteral) {
            return false;
        }

        $this->addTokenFromBuffer();

        $this->currentToken[] = (new SqlToken())
            ->type($isIdentifier ? SqlToken::TYPE_IDENTIFIER : SqlToken::TYPE_STRING_LITERAL)
            ->content(is_string($content) ? $content : $this->substring($length))
            ->startOffset($this->offset)
            ->endOffset($this->offset + $length);

        return true;
    }

    /**
     * Determines whether there is an operator at the current offset and adds it to the token children.
     *
     * `(` opens a nested parenthesis token, `)` returns to the enclosing token and `;` closes the current statement
     * and opens a new one. Every other operator is simply appended to the active token.
     *
     * @param int $length Receives the length of the matched string.
     *
     * @return bool Whether there is an operator at the current offset.
     */
    private function tokenizeOperator(int &$length): bool
    {
        $content = null;

        if (!$this->isOperator($length, $content)) {
            return false;
        }

        $this->addTokenFromBuffer();

        $operator = $this->substring($length);
        $tokenContent = is_string($content) ? $content : $operator;

        switch ($operator) {
            case '(':
                $this->currentToken[] = $this->createOperatorToken($tokenContent, $length);
                $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_PARENTHESIS);

                if ($this->currentToken[-1] !== null) {
                    $this->tokenStack->push($this->currentToken[-1]);
                }

                $this->currentToken = $this->tokenStack->top();

                break;

            case ')':
                // NOTE(review): with more closing than opening parentheses the stack can run empty here, in which
                // case SplStack raises a RuntimeException rather than the documented InvalidArgumentException.
                $this->tokenStack->pop();
                $this->currentToken = $this->tokenStack->top();
                $this->currentToken[] = $this->createOperatorToken(')', $length);

                break;
            case ';':
                // A semicolon right after another statement terminator would only create an empty statement.
                if ($this->currentToken instanceof SqlToken && !$this->currentToken->getHasChildren()) {
                    break;
                }

                $this->currentToken[] = $this->createOperatorToken($tokenContent, $length);
                $this->tokenStack->pop();
                $this->currentToken = $this->tokenStack->top();
                $this->currentToken[] = (new SqlToken())->type(SqlToken::TYPE_STATEMENT);

                if ($this->currentToken[-1] instanceof SqlToken) {
                    $this->tokenStack->push($this->currentToken[-1]);
                }

                $this->currentToken = $this->tokenStack->top();

                break;
            default:
                $this->currentToken[] = $this->createOperatorToken($tokenContent, $length);

                break;
        }

        return true;
    }

    /**
     * Creates an operator token that starts at the current offset.
     *
     * @param string $content Token content.
     * @param int $length Length of the operator in the SQL code, used to compute the end offset.
     */
    private function createOperatorToken(string $content, int $length): SqlToken
    {
        return (new SqlToken())
            ->type(SqlToken::TYPE_OPERATOR)
            ->content($content)
            ->startOffset($this->offset)
            ->endOffset($this->offset + $length);
    }

    /**
     * Determines a type of text in the buffer, tokenizes it and adds it to the token children.
     *
     * Does nothing when the buffer is empty. The buffer is expected to end at the current offset, which is why this
     * method has to be called before the offset is advanced past the following token.
     */
    private function addTokenFromBuffer(): void
    {
        if ($this->buffer === '') {
            return;
        }

        $content = null;
        $isKeyword = $this->isKeyword($this->buffer, $content);

        $this->currentToken[] = (new SqlToken())
            ->type($isKeyword ? SqlToken::TYPE_KEYWORD : SqlToken::TYPE_TOKEN)
            ->content(is_string($content) ? $content : $this->buffer)
            ->startOffset($this->offset - mb_strlen($this->buffer, 'UTF-8'))
            ->endOffset($this->offset);

        $this->buffer = '';
    }

    /**
     * Adds the specified length to the current offset and drops the substring cache.
     *
     * @throws InvalidArgumentException If the length is less than or equal to 0.
     */
    private function advance(int $length): void
    {
        if ($length <= 0) {
            throw new InvalidArgumentException('Length must be greater than 0.');
        }

        $this->offset += $length;
        $this->substrings = [];
    }

    /**
     * Returns whether the SQL code is completely traversed.
     */
    private function isEof(): bool
    {
        return $this->offset >= $this->length;
    }
}
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.