1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace PhpMyAdmin\SqlParser; |
6
|
|
|
|
7
|
|
|
use ArrayAccess; |
8
|
|
|
use Exception; |
9
|
|
|
use Stringable; |
10
|
|
|
|
11
|
|
|
use function mb_check_encoding; |
12
|
|
|
use function mb_strlen; |
13
|
|
|
use function mb_substr; |
14
|
|
|
use function ord; |
15
|
|
|
use function strlen; |
16
|
|
|
use function substr; |
17
|
|
|
|
18
|
|
|
/**
 * Array-like, character-indexed view over a UTF-8 encoded string.
 *
 * PHP's native subscript operator on a string returns a single byte. In UTF-8 a character may occupy
 * one to four bytes, so subscripting a plain string can yield a fragment of a character. This class
 * wraps the raw string and implements ArrayAccess so that `$utf[$i]` returns the i-th character.
 *
 * The object keeps a cursor (a byte index and the matching character index) that is moved by every
 * read, which makes sequential and near-sequential access cheap.
 *
 * Instances are read-only through the array interface: writing or unsetting an offset throws.
 *
 * @implements ArrayAccess<int, string>
 */
class UtfString implements ArrayAccess, Stringable
{
    /**
     * The raw, multi-byte string.
     *
     * @var string
     */
    public $str = '';

    /**
     * Byte index of the cursor, i.e. the byte at which the character number `$charIdx` starts.
     *
     * For ASCII-only strings the byte index is equal to the character index.
     *
     * @var int
     */
    public $byteIdx = 0;

    /**
     * Character index of the cursor.
     *
     * For non-ASCII strings some characters occupy more than one byte, so this value is lower than
     * or equal to the byte index.
     *
     * @var int
     */
    public $charIdx = 0;

    /**
     * The length of the string (in bytes).
     *
     * @var int
     */
    public $byteLen = 0;

    /**
     * The length of the string (in characters); 0 when the string is not valid UTF-8.
     *
     * @var int
     */
    public $charLen = 0;

    /**
     * Stores the string and pre-computes its byte length and character length.
     *
     * When the input is not valid UTF-8 the character length is set to 0, so every offset is
     * reported as missing and every read returns null.
     *
     * @param string $str the string
     */
    public function __construct($str)
    {
        $this->str = $str;
        $this->byteLen = mb_strlen($str, '8bit');
        if (! mb_check_encoding($str, 'UTF-8')) {
            $this->charLen = 0;
        } else {
            $this->charLen = mb_strlen($str, 'UTF-8');
        }
    }

    /**
     * Checks if the given offset exists.
     *
     * @param int $offset the character offset to be checked
     */
    public function offsetExists($offset): bool
    {
        return ($offset >= 0) && ($offset < $this->charLen);
    }

    /**
     * Gets the character at the given offset, moving the internal cursor to it.
     *
     * @param int $offset the character offset to be returned
     *
     * @return string|null the character, or null when the offset is out of range
     */
    public function offsetGet($offset): string|null
    {
        // This method is part of hot code, so the cursor is moved relative to its previous position
        // instead of rescanning the string from the start.
        if (($offset < 0) || ($offset >= $this->charLen)) {
            return null;
        }

        $delta = $offset - $this->charIdx;

        if ($delta > 0) {
            // Fast forwarding: a UTF-8 character takes at most 4 bytes, so the next $delta characters
            // are fully contained in the next 4 * $delta bytes. Their byte size is the cursor advance.
            // The encoding is passed explicitly so that the result does not depend on the value of
            // mb_internal_encoding(); the constructor counts characters as UTF-8 as well.
            $window = substr($this->str, $this->byteIdx, 4 * $delta);
            $this->byteIdx += strlen(mb_substr($window, 0, $delta, 'UTF-8'));
            $this->charIdx += $delta;
        } elseif ($delta < 0) {
            // Rewinding.
            while ($delta++ < 0) {
                // Step back byte by byte, skipping continuation bytes (0b10xxxxxx, i.e. 128..191),
                // and stop on an ASCII byte or on the first octet of a multi-byte character.
                do {
                    $byte = ord($this->str[--$this->byteIdx]);
                } while (($byte >= 128) && ($byte < 192));

                --$this->charIdx;
            }
        }

        // The character at the cursor lies within the next 4 bytes; extract exactly that character.
        return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1, 'UTF-8');
    }

    /**
     * Sets the value of a character.
     *
     * @param int    $offset the offset to be set
     * @param string $value  the value to be set
     *
     * @throws Exception always; writing is not implemented.
     */
    public function offsetSet($offset, $value): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Unsets an index.
     *
     * @param int $offset the offset to be unset
     *
     * @throws Exception always; unsetting is not implemented.
     */
    public function offsetUnset($offset): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Returns the length in characters of the string.
     */
    public function length(): int
    {
        return $this->charLen;
    }

    /**
     * Returns the contained string.
     */
    public function __toString(): string
    {
        return $this->str;
    }
}
179
|
|
|
|