1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace PhpMyAdmin\SqlParser; |
||
6 | |||
7 | use Exception; |
||
8 | use PhpMyAdmin\SqlParser\Exceptions\LexerException; |
||
9 | |||
10 | use function in_array; |
||
11 | use function mb_strlen; |
||
12 | use function sprintf; |
||
13 | use function str_ends_with; |
||
14 | use function strlen; |
||
15 | use function substr; |
||
16 | |||
17 | /** |
||
18 | * Defines the lexer of the library. |
||
19 | * |
||
20 | * This is one of the most important components, along with the parser. |
||
21 | * |
||
22 | * Depends on context to extract lexemes. |
||
23 | * |
||
24 | * Performs lexical analysis over a SQL statement and splits it in multiple tokens. |
||
25 | * |
||
26 | * The output of the lexer is affected by the context of the SQL statement. |
||
27 | * |
||
28 | * @see Context |
||
29 | */ |
||
30 | class Lexer |
||
31 | { |
||
32 | /** |
||
33 | * Whether errors should throw exceptions or just be stored. |
||
34 | */ |
||
35 | private bool $strict = false; |
||
36 | |||
37 | /** |
||
38 | * List of errors that occurred during lexing. |
||
39 | * |
||
40 | * Usually, the lexing does not stop once an error occurred because that |
||
41 | * error might be false positive or a partial result (even a bad one) |
||
42 | * might be needed. |
||
43 | * |
||
44 | * @var Exception[] |
||
45 | */ |
||
46 | public array $errors = []; |
||
47 | |||
48 | /** |
||
49 | * A list of keywords that indicate that the function keyword |
||
50 | * is not used as a function |
||
51 | */ |
||
52 | private const KEYWORD_NAME_INDICATORS = [ |
||
53 | 'FROM', |
||
54 | 'SET', |
||
55 | 'WHERE', |
||
56 | ]; |
||
57 | |||
58 | /** |
||
59 | * A list of operators that indicate that the function keyword |
||
60 | * is not used as a function |
||
61 | */ |
||
62 | private const OPERATOR_NAME_INDICATORS = [ |
||
63 | ',', |
||
64 | '.', |
||
65 | ]; |
||
66 | |||
67 | /** |
||
68 | * The string to be parsed. |
||
69 | */ |
||
70 | public string|UtfString $str = ''; |
||
71 | |||
72 | /** |
||
73 | * The length of `$str`. |
||
74 | * |
||
75 | * By storing its length, a lot of time is saved, because parsing methods |
||
76 | * would call `strlen` every time. |
||
77 | */ |
||
78 | public int $len = 0; |
||
79 | |||
80 | /** |
||
81 | * The index of the last parsed character. |
||
82 | */ |
||
83 | public int $last = 0; |
||
84 | |||
85 | /** |
||
86 | * Tokens extracted from given strings. |
||
87 | */ |
||
88 | public TokensList $list; |
||
89 | |||
90 | /** |
||
91 | * The default delimiter. This is used, by default, in all new instances. |
||
92 | */ |
||
93 | public static string $defaultDelimiter = ';'; |
||
94 | |||
95 | /** |
||
96 | * Statements delimiter. |
||
97 | * This may change during lexing. |
||
98 | */ |
||
99 | public string $delimiter; |
||
100 | |||
101 | /** |
||
102 | * The length of the delimiter. |
||
103 | * |
||
104 | * Because `parseDelimiter` can be called a lot, it would perform a lot of |
||
105 | * calls to `strlen`, which might affect performance when the delimiter is |
||
106 | * big. |
||
107 | */ |
||
108 | public int $delimiterLen; |
||
109 | |||
/**
 * Creates the lexer and immediately tokenizes the given query.
 *
 * The context is loaded first when no keywords are registered yet, the input
 * is wrapped in a `UtfString` when it contains multi-byte characters, and
 * `lex()` is called before the constructor returns.
 *
 * @param string|UtfString $str       the query to be lexed
 * @param bool             $strict    whether strict mode should be enabled or not
 * @param string|null      $delimiter the delimiter to be used; `null` or an empty
 *                                    string selects `static::$defaultDelimiter`
 */
public function __construct(string|UtfString $str, bool $strict = false, string|null $delimiter = null)
{
    if (Context::$keywords === []) {
        Context::load();
    }

    // `strlen` is used instead of `mb_strlen` because the lexer needs to
    // parse each byte of the input.
    $len = $str instanceof UtfString ? $str->length() : strlen($str);

    // For multi-byte strings, a new instance of `UtfString` is initialized.
    if (! $str instanceof UtfString && $len !== mb_strlen($str, 'UTF-8')) {
        $str = new UtfString($str);
    }

    $this->str = $str;
    // The length must be re-read when the string was just wrapped above.
    $this->len = $str instanceof UtfString ? $str->length() : $len;

    $this->strict = $strict;

    // Setting the delimiter. An explicit comparison is used instead of
    // `empty()`, which would also discard the non-empty delimiter '0'.
    $this->setDelimiter($delimiter !== null && $delimiter !== '' ? $delimiter : static::$defaultDelimiter);

    $this->lex();
}
||
141 | |||
/**
 * Replaces the statements delimiter and refreshes its cached byte length.
 *
 * @param string $delimiter the new delimiter
 */
public function setDelimiter(string $delimiter): void
{
    $byteLength = strlen($delimiter);

    $this->delimiter = $delimiter;
    $this->delimiterLen = $byteLength;
}
||
152 | |||
/**
 * Parses the string and extracts lexemes.
 *
 * Asks `parse()` for one token per loop iteration, appends the tokens to a
 * new `TokensList`, handles `DELIMITER` statements inline, appends a final
 * empty delimiter token and stores the list in `$this->list` before running
 * the two ambiguity-resolution passes.
 *
 * @throws LexerException in strict mode, through `error()`.
 */
public function lex(): void
{
    // TODO: Sometimes, static::parse* functions make unnecessary calls to
    // is* functions. For a better performance, some rules can be deduced
    // from context.
    // For example, in `parseBool` there is no need to compare the token
    // every time with `true` and `false`. The first step would be to
    // compare with 'true' only and just after that add another letter from
    // context and compare again with `false`.
    // Another example is `parseComment`.

    $tokens = new TokensList();

    // Last processed token.
    $previous = null;

    for ($this->last = 0, $start = 0; $this->last < $this->len; $start = ++$this->last) {
        $current = $this->parse();

        if ($current === null) {
            // @assert($this->last === $start);
            $char = $this->str[$this->last];
            $current = new Token($char);
            $this->error('Unexpected character.', $char, $this->last);
        } elseif ($previous !== null) {
            $isVariable = $current->type === TokenType::Symbol
                && ($current->flags & Token::FLAG_SYMBOL_VARIABLE);
            $followsQuotedName = $previous->type === TokenType::String
                || (
                    $previous->type === TokenType::Symbol
                    && ($previous->flags & Token::FLAG_SYMBOL_BACKTICK)
                );

            if ($isVariable && $followsQuotedName) {
                // Handles ```... FROM 'user'@'%' ...```: the variable is
                // folded into the previous token instead of being stored.
                $previous->token .= $current->token;
                $previous->type = TokenType::Symbol;
                $previous->flags = Token::FLAG_SYMBOL_USER;
                $previous->value .= '@' . $current->value;
                continue;
            }

            if (
                $current->type === TokenType::Keyword
                && $previous->type === TokenType::Operator
                && $previous->value === '.'
            ) {
                // Handles ```... tbl.FROM ...```. In this case, FROM is not
                // a reserved word.
                $current->type = TokenType::None;
                $current->flags = 0;
                $current->value = $current->token;
            }
        }

        $current->position = $start;
        $tokens->tokens[$tokens->count++] = $current;

        // Anything that is not the `DELIMITER` word needs no further work.
        if ($current->type !== TokenType::None || $current->value !== 'DELIMITER') {
            $previous = $current;
            continue;
        }

        // Handling delimiters.
        if ($this->last + 1 >= $this->len) {
            $this->error('Expected whitespace(s) before delimiter.', '', $this->last + 1);
            continue;
        }

        // Skipping last R (from `delimiteR`) and whitespaces between
        // the keyword `DELIMITER` and the actual delimiter.
        $position = ++$this->last;
        $whitespace = $this->parseWhitespace();

        if ($whitespace !== null) {
            $whitespace->position = $position;
            $tokens->tokens[$tokens->count++] = $whitespace;
        }

        // Preparing the token that holds the new delimiter.
        if ($this->last + 1 >= $this->len) {
            $this->error('Expected delimiter.', '', $this->last + 1);
            continue;
        }

        $position = $this->last + 1;

        // Parsing the delimiter: at most 15 non-whitespace characters.
        $this->delimiter = '';
        for (
            $taken = 0;
            ++$this->last < $this->len
            && ! Context::isWhitespace($this->str[$this->last])
            && $taken < 15;
            ++$taken
        ) {
            $this->delimiter .= $this->str[$this->last];
        }

        if ($this->delimiter === '') {
            $this->error('Expected delimiter.', '', $this->last);
            $this->delimiter = ';';
        }

        // Step back so the loop increment lands on the first unread character.
        --$this->last;

        // Saving the delimiter and its token.
        $this->delimiterLen = strlen($this->delimiter);
        $delimiterToken = new Token($this->delimiter, TokenType::Delimiter);
        $delimiterToken->position = $position;
        $tokens->tokens[$tokens->count++] = $delimiterToken;

        $previous = $delimiterToken;
    }

    // Adding a final delimiter to mark the ending.
    $tokens->tokens[$tokens->count++] = new Token('', TokenType::Delimiter);

    // Saving the tokens list.
    $this->list = $tokens;

    $this->solveAmbiguityOnStarOperator();
    $this->solveAmbiguityOnFunctionKeywords();
}
||
279 | |||
/**
 * Resolves the ambiguity when dealing with the "*" operator.
 *
 * In SQL statements, the "*" operator can be an arithmetic operator (like in 2*3) or an SQL wildcard (like in
 * SELECT a.* FROM ...). Each "*" operator token is classified by the token that `getNext()` returns right
 * after it. The "*" is flagged as an SQL wildcard when that token is any of:
 * - "FROM" (the FROM keyword like in "SELECT * FROM...");
 * - "USING" (the USING keyword like in "DELETE table_name.* USING...");
 * - "," (a comma separator like in "SELECT *, field FROM...");
 * - ")" (a closing parenthesis like in "COUNT(*)").
 * In every other case the flags of the "*" token are left as they are. The list cursor is restored afterwards.
 */
private function solveAmbiguityOnStarOperator(): void
{
    $savedIdx = $this->list->idx;

    while (($star = $this->list->getNextOfTypeAndValue(TokenType::Operator, '*')) !== null) {
        // getNext() already gets rid of whitespaces and comments.
        $follower = $this->list->getNext();

        if ($follower === null) {
            continue;
        }

        // Values that mark the star as a wildcard, per type of the following token.
        $wildcardMarkers = match ($follower->type) {
            TokenType::Keyword => ['FROM', 'USING'],
            TokenType::Operator => [',', ')'],
            default => [],
        };

        if (in_array($follower->value, $wildcardMarkers, true)) {
            $star->flags = Token::FLAG_OPERATOR_SQL;
        }
    }

    $this->list->idx = $savedIdx;
}
||
316 | |||
/**
 * Resolves the ambiguity when dealing with the functions keywords.
 *
 * In SQL statements, the function keywords might be used as table names or columns names.
 * Each function keyword token is classified by the token that `getNext()` returns right
 * after it. The function keyword is treated as a column name or table name if that token
 * is any of:
 *
 * - "FROM" (the FROM keyword like in "SELECT Country x, AverageSalary avg FROM...");
 * - "WHERE" (the WHERE keyword like in "DELETE FROM emp x WHERE x.salary = 20");
 * - "SET" (the SET keyword like in "UPDATE Country x, City y set x.Name=x.Name");
 * - "," (a comma separator like 'x,' in "UPDATE Country x, City y set x.Name=x.Name");
 * - "." (a dot separator like in "x.asset_id FROM (SELECT evt.asset_id FROM evt)");
 * - a token whose value is the empty string (presumably the closing delimiter added by
 *   `lex()`, i.e. the keyword ends the query as an alias like in
 *   "avg.col FROM (SELECT ev.col FROM ev) avg" - TODO confirm against `Token`).
 *
 * When one of those conditions holds, the token is turned into a plain `TokenType::None`
 * token. Otherwise its type and flags are kept. The list cursor is restored afterwards.
 */
private function solveAmbiguityOnFunctionKeywords(): void
{
    $iBak = $this->list->idx;
    $keywordFunction = TokenType::Keyword->value | Token::FLAG_KEYWORD_FUNCTION;
    while (($keywordToken = $this->list->getNextOfTypeAndFlag(TokenType::Keyword, $keywordFunction)) !== null) {
        $next = $this->list->getNext();

        // Same guard as in solveAmbiguityOnStarOperator(): without a following
        // token there is nothing to decide on, so the keyword is left untouched.
        if ($next === null) {
            continue;
        }

        $usedAsName = (
            $next->type === TokenType::Keyword
            && in_array($next->value, self::KEYWORD_NAME_INDICATORS, true)
        ) || (
            $next->type === TokenType::Operator
            && in_array($next->value, self::OPERATOR_NAME_INDICATORS, true)
        ) || $next->value === '';

        if (! $usedAsName) {
            continue;
        }

        $keywordToken->type = TokenType::None;
        $keywordToken->flags = Token::FLAG_NONE;
        $keywordToken->keyword = $keywordToken->value;
    }

    $this->list->idx = $iBak;
}
||
361 | |||
/**
 * Creates a new error log.
 *
 * The message is passed through `Translator::gettext()` and wrapped in a
 * `LexerException`, which is thrown in strict mode and appended to
 * `$this->errors` otherwise.
 *
 * @param string $msg  the error message
 * @param string $str  the character that produced the error
 * @param int    $pos  the position of the character
 * @param int    $code the code of the error
 *
 * @throws LexerException throws the exception, if strict mode is enabled.
 */
public function error(string $msg, string $str = '', int $pos = 0, int $code = 0): void
{
    $exception = new LexerException(Translator::gettext($msg), $str, $pos, $code);

    if (! $this->strict) {
        $this->errors[] = $exception;

        return;
    }

    throw $exception;
}
||
387 | |||
/**
 * Parses a keyword.
 *
 * Grows a candidate one character at a time (runs of whitespace count as a
 * single character) and remembers the longest candidate that
 * `Context::isKeyword()` accepts and that is followed by a separator or by
 * the end of the input.
 *
 * @return Token|null the keyword token, with `$this->last` on its last
 *                    character, or null with `$this->last` unchanged
 */
public function parseKeyword(): Token|null
{
    $candidate = '';

    // Longest keyword found so far and the index of its last character.
    $match = null;
    $matchEnd = $this->last;

    // Whether the previously consumed character was a whitespace.
    $afterSpace = false;

    for (
        $length = 1;
        $length < Context::KEYWORD_MAX_LENGTH && $this->last < $this->len;
        ++$length, ++$this->last
    ) {
        $char = $this->str[$this->last];

        // Composed keywords shouldn't have more than one whitespace between
        // keywords.
        if (Context::isWhitespace($char)) {
            if ($afterSpace) {
                --$length; // The size of the keyword didn't increase.
                continue;
            }

            $afterSpace = true;
        } else {
            $afterSpace = false;
        }

        $candidate .= $char;
        $flags = Context::isKeyword($candidate);

        $atBoundary = $this->last + 1 === $this->len
            || Context::isSeparator($this->str[$this->last + 1]);

        // We don't break on a match so we find the longest keyword.
        // For example, `OR` and `ORDER` have a common prefix `OR`.
        // If we stopped at `OR`, the parsing would be invalid.
        if ($atBoundary && $flags) {
            $match = new Token($candidate, TokenType::Keyword, $flags);
            $matchEnd = $this->last;
        }
    }

    $this->last = $matchEnd;

    return $match;
}
||
443 | |||
/**
 * Parses a label.
 *
 * Collects characters until a ':' is met after at least one other character.
 * Whitespace after the first character is kept in the token but does not
 * count towards `Context::LABEL_MAX_LENGTH`; any other separator aborts.
 *
 * @return Token|null the label token (ending with ':'), with `$this->last` on
 *                    the ':', or null with `$this->last` unchanged
 */
public function parseLabel(): Token|null
{
    $start = $this->last;
    $label = '';

    for (
        $length = 1;
        $length < Context::LABEL_MAX_LENGTH && $this->last < $this->len;
        ++$length, ++$this->last
    ) {
        $char = $this->str[$this->last];
        $notFirst = $length > 1;

        if ($char === ':' && $notFirst) {
            // End of label; `$this->last` already points at the colon.
            return new Token($label . $char, TokenType::Label);
        }

        if (Context::isWhitespace($char) && $notFirst) {
            // Whitespace between label and :
            // The size of the keyword didn't increase.
            --$length;
        } elseif (Context::isSeparator($char)) {
            // Any other separator
            break;
        }

        $label .= $char;
    }

    // No label found: rewind to where parsing started.
    $this->last = $start;

    return null;
}
||
485 | |||
/**
 * Parses an operator.
 *
 * Grows a candidate one character at a time, up to
 * `Context::OPERATOR_MAX_LENGTH`, and keeps the longest candidate that
 * `Context::isOperator()` accepts.
 *
 * @return Token|null the operator token, with `$this->last` on its last
 *                    character, or null with `$this->last` unchanged
 */
public function parseOperator(): Token|null
{
    $candidate = '';

    // Longest operator found so far and the index of its last character.
    $match = null;
    $matchEnd = $this->last;

    for (
        $length = 1;
        $length < Context::OPERATOR_MAX_LENGTH && $this->last < $this->len;
        ++$length, ++$this->last
    ) {
        $candidate .= $this->str[$this->last];
        $flags = Context::isOperator($candidate);

        if ($flags) {
            $match = new Token($candidate, TokenType::Operator, $flags);
            $matchEnd = $this->last;
        }
    }

    $this->last = $matchEnd;

    return $match;
}
||
519 | |||
/**
 * Parses a whitespace.
 *
 * @return Token|null a token holding the whole run of consecutive whitespace
 *                    characters, with `$this->last` on the last one, or null
 *                    when the current character is not a whitespace
 */
public function parseWhitespace(): Token|null
{
    $first = $this->str[$this->last];

    if (! Context::isWhitespace($first)) {
        return null;
    }

    $run = $first;

    // Scan ahead with a local cursor; it stops on the first character that
    // is not a whitespace (or at the end of the input).
    for (
        $cursor = $this->last + 1;
        $cursor < $this->len && Context::isWhitespace($this->str[$cursor]);
        ++$cursor
    ) {
        $run .= $this->str[$cursor];
    }

    $this->last = $cursor - 1;

    return new Token($run, TokenType::Whitespace);
}
||
539 | |||
/**
 * Parses a comment.
 *
 * Three forms are tried in order, each validated by `Context::isComment()`:
 * a one-character marker running to the end of the line (bash style), a
 * two-character marker (C style, including the MySQL-specific `/*!` opening
 * and a stray closing marker), and a marker of up to three characters
 * running to the end of the line (SQL style).
 *
 * @return Token|null the comment token, or null with `$this->last` restored
 *                    when no comment starts at the current position
 */
public function parseComment(): Token|null
{
    $iBak = $this->last;
    $token = $this->str[$this->last];

    // Bash style comments. (#comment\n)
    if (Context::isComment($token)) {
        while (++$this->last < $this->len && $this->str[$this->last] !== "\n") {
            $token .= $this->str[$this->last];
        }

        // Include trailing \n as whitespace token
        if ($this->last < $this->len) {
            --$this->last;
        }

        return new Token($token, TokenType::Comment, Token::FLAG_COMMENT_BASH);
    }

    // C style comments. (/*comment*\/)
    if (++$this->last < $this->len) {
        $token .= $this->str[$this->last];
        if (Context::isComment($token)) {
            $flags = Token::FLAG_COMMENT_C;

            if ($token === '*/') {
                // There might be a conflict with "*" operator here, when string is "*/*".
                // This can occur in the following statements:
                // - "SELECT */* comment */ FROM ..."
                // - "SELECT 2*/* comment */3 AS `six`;"
                // Only a closing marker can be involved in that conflict; an
                // opening marker followed by a star ("/** ... */", "/**/")
                // is an ordinary comment and must not be rejected here.
                $next = $this->last + 1;
                if (($next < $this->len) && $this->str[$next] === '*') {
                    // Conflict in "*/*": first "*" was not for ending a comment.
                    // Stop here and let other parsing method define the true behavior of that first star.
                    $this->last = $iBak;

                    return null;
                }

                // This comment already ended. It may be a part of a
                // previous MySQL specific command.
                return new Token($token, TokenType::Comment, $flags);
            }

            // Checking if this is a MySQL-specific command.
            if ($this->last + 1 < $this->len && $this->str[$this->last + 1] === '!') {
                $flags |= Token::FLAG_COMMENT_MYSQL_CMD;
                $token .= $this->str[++$this->last];

                // The digits that directly follow the `!` belong to the marker.
                while (
                    ++$this->last < $this->len
                    && $this->str[$this->last] >= '0'
                    && $this->str[$this->last] <= '9'
                ) {
                    $token .= $this->str[$this->last];
                }

                --$this->last;

                // We split this comment and parse only its beginning
                // here.
                return new Token($token, TokenType::Comment, $flags);
            }

            // Parsing the comment: stop on the '/' that directly follows a '*'.
            while (
                ++$this->last < $this->len
                && (
                    $this->str[$this->last - 1] !== '*'
                    || $this->str[$this->last] !== '/'
                )
            ) {
                $token .= $this->str[$this->last];
            }

            // Adding the ending.
            if ($this->last < $this->len) {
                $token .= $this->str[$this->last];
            }

            return new Token($token, TokenType::Comment, $flags);
        }
    }

    // SQL style comments. (-- comment\n)
    if (++$this->last < $this->len) {
        $token .= $this->str[$this->last];
        $end = false;
    } else {
        // The input ends before a third character could be read.
        --$this->last;
        $end = true;
    }

    if (Context::isComment($token, $end)) {
        // Checking if this comment did not end already (```--\n```).
        if ($this->str[$this->last] !== "\n") {
            while (++$this->last < $this->len && $this->str[$this->last] !== "\n") {
                $token .= $this->str[$this->last];
            }
        }

        // Include trailing \n as whitespace token
        if ($this->last < $this->len) {
            --$this->last;
        }

        return new Token($token, TokenType::Comment, Token::FLAG_COMMENT_SQL);
    }

    $this->last = $iBak;

    return null;
}
||
656 | |||
/**
 * Parses a boolean.
 *
 * Reads four characters and, when `Context::isBool()` rejects them, a fifth
 * one before testing again.
 *
 * @return Token|null the boolean token (flag 1 for the five-character form),
 *                    with `$this->last` on its last character, or null with
 *                    `$this->last` unchanged
 */
public function parseBool(): Token|null
{
    if ($this->last + 3 >= $this->len) {
        // At least `min(strlen('TRUE'), strlen('FALSE'))` characters are
        // required.
        return null;
    }

    $start = $this->last;

    // _TRUE_ or _FALS_e
    $word = '';
    for ($offset = 0; $offset < 4; ++$offset) {
        $word .= $this->str[$start + $offset];
    }

    $this->last = $start + 3;

    if (Context::isBool($word)) {
        return new Token($word, TokenType::Bool);
    }

    if (++$this->last < $this->len) {
        $word .= $this->str[$this->last]; // fals_E_
        if (Context::isBool($word)) {
            return new Token($word, TokenType::Bool, 1);
        }
    }

    $this->last = $start;

    return null;
}
||
687 | |||
/**
 * Parses a number.
 *
 * @return Token|null the number token, with `$this->last` on its last
 *                    character, or null with `$this->last` unchanged
 */
public function parseNumber(): Token|null
{
    // A rudimentary state machine is being used to parse numbers due to
    // the various forms of their notation.
    //
    // Below are the states of the machines and the conditions to change
    // the state.
    //
    //      1 --------------------[ + or - ]-------------------> 1
    //      1 ----------------[ 0x (lowercase x) ]-------------> 2
    //      1 --------------------[ 0 to 9 ]-------------------> 3
    //      1 -----------------------[ . ]---------------------> 4
    //      1 -----------------------[ b ]---------------------> 7
    //
    //      2 --------------------[ 0 to F ]-------------------> 2
    //
    //      3 --------------------[ 0 to 9 ]-------------------> 3
    //      3 -----------------------[ . ]---------------------> 4
    //      3 --------------------[ e or E ]-------------------> 5
    //
    //      4 --------------------[ 0 to 9 ]-------------------> 4
    //      4 --------------------[ e or E ]-------------------> 5
    //
    //      5 ---------------[ + or - or 0 to 9 ]--------------> 6
    //
    //      6 --------------------[ 0 to 9 ]-------------------> 6
    //
    //      7 -----------------------[ ' ]---------------------> 8
    //
    //      8 --------------------[ 0 or 1 ]-------------------> 8
    //      8 -----------------------[ ' ]---------------------> 9
    //
    // State 1 may be reached by negative numbers.
    // State 2 is reached only by hex numbers.
    // State 4 is reached only by float numbers.
    // State 5 is reached only by numbers in approximate form.
    // State 7 is reached only by numbers in bit representation.
    //
    // Valid final states are: 2, 3, 4 (unless the token is a lone `.`), 6
    // and 9. Any parsing that finished in a state other than these is
    // invalid.
    // Also, negative states are invalid states: once entered, no branch
    // below matches any more and the rest of the input is consumed.
    $start = $this->last;
    $number = '';
    $flags = 0;
    $state = 1;

    for (; $this->last < $this->len; ++$this->last) {
        $char = $this->str[$this->last];
        $isDigit = $char >= '0' && $char <= '9';
        $isLetter = ($char >= 'a' && $char <= 'z') || ($char >= 'A' && $char <= 'Z');
        $isExponentMark = $char === 'e' || $char === 'E';

        if ($state === 1) {
            if ($char === '-') {
                $flags |= Token::FLAG_NUMBER_NEGATIVE;
            } elseif (
                $this->last + 1 < $this->len
                && $char === '0'
                && $this->str[$this->last + 1] === 'x'
            ) {
                // Consume the `0` here; the `x` is appended at the end of
                // this iteration.
                $number .= $char;
                $char = $this->str[++$this->last];
                $state = 2;
            } elseif ($isDigit) {
                $state = 3;
            } elseif ($char === '.') {
                $state = 4;
            } elseif ($char === 'b') {
                $state = 7;
            } elseif ($char !== '+') {
                // `+` is a valid character in a number.
                break;
            }
        } elseif ($state === 2) {
            $flags |= Token::FLAG_NUMBER_HEX;
            $isHexDigit = $isDigit
                || ($char >= 'A' && $char <= 'F')
                || ($char >= 'a' && $char <= 'f');
            if (! $isHexDigit) {
                break;
            }
        } elseif ($state === 3) {
            if ($char === '.') {
                $state = 4;
            } elseif ($isExponentMark) {
                $state = 5;
            } elseif ($isLetter) {
                // A number can't be directly followed by a letter
                $state = -$state;
            } elseif (! $isDigit) {
                // Just digits and `.`, `e` and `E` are valid characters.
                break;
            }
        } elseif ($state === 4) {
            $flags |= Token::FLAG_NUMBER_FLOAT;
            if ($isExponentMark) {
                $state = 5;
            } elseif ($isLetter) {
                // A number can't be directly followed by a letter
                $state = -$state;
            } elseif (! $isDigit) {
                // Just digits, `e` and `E` are valid characters.
                break;
            }
        } elseif ($state === 5) {
            $flags |= Token::FLAG_NUMBER_APPROXIMATE;
            if ($char === '+' || $char === '-' || $isDigit) {
                $state = 6;
            } elseif ($isLetter) {
                // A number can't be directly followed by a letter
                $state = -$state;
            } else {
                break;
            }
        } elseif ($state === 6) {
            if (! $isDigit) {
                // Just digits are valid characters.
                break;
            }
        } elseif ($state === 7) {
            $flags |= Token::FLAG_NUMBER_BINARY;
            if ($char !== '\'') {
                break;
            }

            $state = 8;
        } elseif ($state === 8) {
            if ($char === '\'') {
                $state = 9;
            } elseif ($char !== '0' && $char !== '1') {
                break;
            }
        } elseif ($state === 9) {
            break;
        }

        $number .= $char;
    }

    $isFinalState = in_array($state, [2, 3, 6, 9], true)
        || ($state === 4 && $number !== '.');

    if ($isFinalState) {
        // The loop stopped one character past the number.
        --$this->last;

        return new Token($number, TokenType::Number, $flags);
    }

    $this->last = $start;

    return null;
}
||
846 | |||
/**
 * Parses a quoted string that starts at the current position.
 *
 * The current character opens a string when `Context::isString()` accepts
 * it or when it equals `$quote`; otherwise `null` is returned. A doubled
 * quote character and, except inside backquotes, a backslash followed by
 * any character are copied into the token as a pair and never terminate
 * the string.
 *
 * If the input ends before the closing quote is found, an error is
 * reported and the token is returned without a closing quote.
 *
 * @param string $quote additional starting symbol
 *
 * @throws LexerException
 */
public function parseString(string $quote = ''): Token|null
{
    $opening = $this->str[$this->last];
    $flags = Context::isString($opening);

    if (! $flags && $opening !== $quote) {
        return null;
    }

    $value = $opening;
    $closed = false;

    while (++$this->last < $this->len) {
        $current = $this->str[$this->last];
        $hasNext = $this->last + 1 < $this->len;

        // Both escape forms need one more character after the current one.
        $isDoubledQuote = $hasNext
            && $current === $opening
            && $this->str[$this->last + 1] === $opening;
        $isBackslashEscape = $hasNext && $current === '\\' && $opening !== '`';

        if ($isDoubledQuote || $isBackslashEscape) {
            // Keep the pair verbatim and step over its second character.
            $value .= $current;
            $value .= $this->str[++$this->last];

            continue;
        }

        if ($current === $opening) {
            $closed = true;

            break;
        }

        $value .= $current;
    }

    if ($closed) {
        // The loop stopped on the closing quote; make it part of the token.
        $value .= $this->str[$this->last];
    } else {
        $this->error(
            sprintf(
                Translator::gettext('Ending quote %1$s was expected.'),
                $opening,
            ),
            '',
            $this->last,
        );
    }

    return new Token($value, TokenType::String, $flags ?? Token::FLAG_NONE);
}
||
898 | |||
/**
 * Parses a symbol.
 *
 * Returns `null` when `Context::isSymbol()` does not recognise the current
 * character. Otherwise the token is built from an optional prefix and a
 * name:
 *
 *  - variable marker: kept in the token; a directly following `@` is kept
 *    too and marks the symbol as a system variable (e.g. `@@hostname`);
 *  - parameter marker: kept in the token; unless it is `?`, the name is
 *    read from the next character;
 *  - any other symbol: the token starts empty and the name is read from the
 *    current position.
 *
 * The name is read with `parseString('`')` and, failing that, with
 * `parseUnknown()`. A missing name is reported as an error unless the
 * symbol is a parameter.
 *
 * @throws LexerException
 */
public function parseSymbol(): Token|null
{
    $marker = $this->str[$this->last];
    $flags = Context::isSymbol($marker);

    if (! $flags) {
        return null;
    }

    $isParameter = ($flags & Token::FLAG_SYMBOL_PARAMETER) !== 0;
    $symbol = $marker;

    if (($flags & Token::FLAG_SYMBOL_VARIABLE) !== 0) {
        if ($this->last + 1 < $this->len) {
            // Step past the marker; the name (or a second `@`) starts here.
            ++$this->last;

            if ($this->str[$this->last] === '@') {
                // This is a system variable (e.g. `@@hostname`).
                $symbol .= $this->str[$this->last];
                ++$this->last;
                $flags |= Token::FLAG_SYMBOL_SYSTEM;
            }
        }
    } elseif ($isParameter) {
        if ($marker !== '?' && $this->last + 1 < $this->len) {
            ++$this->last;
        }
    } else {
        // The name parser below starts on this very character.
        $symbol = '';
    }

    $name = null;

    if ($this->last < $this->len) {
        $name = $this->parseString('`') ?? $this->parseUnknown();

        if ($name === null && ! $isParameter) {
            $this->error('Variable name was expected.', $this->str[$this->last], $this->last);
        }
    }

    if ($name !== null) {
        $symbol .= $name->token;
    }

    return new Token($symbol, TokenType::Symbol, $flags);
}
||
947 | |||
/**
 * Parses unknown parts of the query.
 *
 * Collects characters from the current position up to, but not including,
 * the next separator (as decided by `Context::isSeparator()`) or the end of
 * the input. If the collected text comes to end with the current delimiter,
 * the delimiter is removed from the token and the position is moved back so
 * that the delimiter is not consumed.
 *
 * Returns `null` when the current character is itself a separator. On
 * success the position is left on the last character of the token.
 */
public function parseUnknown(): Token|null
{
    $word = $this->str[$this->last];
    if (Context::isSeparator($word)) {
        return null;
    }

    for (++$this->last; $this->last < $this->len; ++$this->last) {
        $char = $this->str[$this->last];
        if (Context::isSeparator($char)) {
            break;
        }

        $word .= $char;

        if (! str_ends_with($word, $this->delimiter)) {
            continue;
        }

        // The word ran into the delimiter: drop it from the token and
        // rewind to the first character of the delimiter.
        $word = substr($word, 0, -$this->delimiterLen);
        $this->last -= $this->delimiterLen - 1;

        break;
    }

    // The loop always stops one position past the end of the token.
    --$this->last;

    return new Token($word);
}
||
973 | |||
/**
 * Parses the delimiter of the query.
 *
 * Compares the input, character by character from the current position,
 * against the current delimiter. Returns `null` on the first mismatch or
 * when the input ends before the whole delimiter has been compared. On a
 * match the position is moved to the last character of the delimiter.
 */
public function parseDelimiter(): Token|null
{
    $idx = 0;

    while ($idx < $this->delimiterLen && $this->last + $idx < $this->len) {
        if ($this->delimiter[$idx] !== $this->str[$this->last + $idx]) {
            return null;
        }

        ++$idx;
    }

    // The loop also stops when the input runs out. A delimiter that was only
    // partially present at the end of the input (e.g. a single `/` when the
    // delimiter is `//`) is not a delimiter and must not move the position
    // past the end of the input.
    if ($idx < $this->delimiterLen) {
        return null;
    }

    $this->last += $this->delimiterLen - 1;

    return new Token($this->delimiter, TokenType::Delimiter);
}
||
993 | |||
/**
 * Tries every token parser in a fixed order and returns the first token
 * produced.
 */
private function parse(): Token|null
{
    // The parsers are best ordered by ascending complexity and descending
    // occurrence rate, subject to the conflicts below:
    //
    // 1. `parseDelimiter`, `parseUnknown`, `parseKeyword`, `parseNumber`
    //    compete for the delimiter. The delimiter may be a keyword, a
    //    number or almost any character, so it has to be among the first
    //    tokens that are tried.
    //
    // 2. `parseNumber` and `parseOperator` compete for `+` and `-`.
    //
    // 3. `parseComment` and `parseOperator` compete for `/`
    //    (as in ```/*comment*/``` or ```a / b```).
    //
    // 4. `parseBool` and `parseKeyword` compete for `TRUE` and `FALSE`.
    //
    // 5. `parseKeyword` and `parseUnknown` compete for words;
    //    `parseUnknown` does not know about keywords.

    $token = $this->parseDelimiter()
        ?? $this->parseWhitespace()
        ?? $this->parseNumber();
    if ($token !== null) {
        return $token;
    }

    $token = $this->parseComment()
        ?? $this->parseOperator()
        ?? $this->parseBool();
    if ($token !== null) {
        return $token;
    }

    $token = $this->parseString()
        ?? $this->parseSymbol()
        ?? $this->parseKeyword();
    if ($token !== null) {
        return $token;
    }

    return $this->parseLabel() ?? $this->parseUnknown();
}
||
1031 | } |
||
1032 |