1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace StringObject; |
4
|
|
|
|
5
|
|
|
/**
 * String object that exposes its raw byte string as a sequence of UTF-8
 * characters.
 *
 * The raw bytes are read from the inherited `$this->raw` property (declared
 * outside this class). Bytes that do not form a well-formed UTF-8 sequence
 * are kept as single-byte characters; single bytes in the range 0x80-0x9F are
 * interpreted through the Windows-1252 table when a code point is requested.
 */
class UStrObj extends StrObj
{
    /**
     * Lazily built cache of the raw string split into characters.
     *
     * @var string[]
     */
    protected $chars = [];

    /**
     * Handler given to the constructor. It is stored here but not used by
     * any method of this class.
     *
     * @var mixed
     */
    protected $uhandler;

    /**
     * Bit masks that extract the payload bits from the lead byte of a UTF-8
     * sequence, keyed by the sequence length in bytes.
     *
     * @var int[]
     */
    protected static $masks = [
        2 => 0b00011111,
        3 => 0b00001111,
        4 => 0b00000111,
    ];

    /**
     * Windows-1252 bytes 0x80-0x9F mapped to Unicode code points.
     * Bytes 129, 141, 143, 144 and 157 have no entry.
     *
     * @var int[]
     */
    protected static $winc1umap = [
        128 => 0x20AC,
        130 => 0x201A,
        131 => 0x0192,
        132 => 0x201E,
        133 => 0x2026,
        134 => 0x2020,
        135 => 0x2021,
        136 => 0x02C6,
        137 => 0x2030,
        138 => 0x0160,
        139 => 0x2039,
        140 => 0x0152,
        142 => 0x017D,
        145 => 0x2018,
        146 => 0x2019,
        147 => 0x201C,
        148 => 0x201D,
        149 => 0x2022,
        150 => 0x2013,
        151 => 0x2014,
        152 => 0x02DC,
        153 => 0x2122,
        154 => 0x0161,
        155 => 0x203A,
        156 => 0x0153,
        158 => 0x017E,
        159 => 0x0178,
    ];

    /**
     * Same targets as $winc1umap, keyed by the two-byte UTF-8 encoding
     * (0xC2 0x80 - 0xC2 0x9F) of the C1 code points U+0080-U+009F.
     * Not referenced by any method of this class.
     *
     * @var int[]
     */
    protected static $c1umap = [
        0xC280 => 0x20AC,
        0xC282 => 0x201A,
        0xC283 => 0x0192,
        0xC284 => 0x201E,
        0xC285 => 0x2026,
        0xC286 => 0x2020,
        0xC287 => 0x2021,
        0xC288 => 0x02C6,
        0xC289 => 0x2030,
        0xC28A => 0x0160,
        0xC28B => 0x2039,
        0xC28C => 0x0152,
        0xC28E => 0x017D,
        0xC291 => 0x2018,
        0xC292 => 0x2019,
        0xC293 => 0x201C,
        0xC294 => 0x201D,
        0xC295 => 0x2022,
        0xC296 => 0x2013,
        0xC297 => 0x2014,
        0xC298 => 0x02DC,
        0xC299 => 0x2122,
        0xC29A => 0x0161,
        0xC29B => 0x203A,
        0xC29C => 0x0153,
        0xC29E => 0x017E,
        0xC29F => 0x0178,
    ];

    /**
     * @param mixed $thing    Value forwarded unchanged to the parent constructor.
     * @param mixed $uhandler Handler object kept in $this->uhandler.
     */
    public function __construct($thing, $uhandler)
    {
        parent::__construct($thing);
        $this->uhandler = $uhandler;
    }

    /**
     * Splits the string into an array.
     *
     * - Empty delimiter (the default): one element per UTF-8 character.
     * - Integer delimiter: byte-wise chunks of that size (str_split).
     * - Any other delimiter: explode() on the raw bytes, optionally limited.
     *
     * @param string|int $delim Delimiter string, chunk size, or '' for characters.
     * @param int|null   $limit Optional explode() limit.
     *
     * @return string[]
     */
    public function toArray($delim = '', $limit = null)
    {
        // empty('0') is true, but '0' is a perfectly valid delimiter, so it
        // must not be mistaken for "no delimiter".
        if ($delim !== '0' && empty($delim)) {
            $this->loadToArray();
            return $this->chars;
        }

        if (\is_int($delim)) {
            return \str_split($this->raw, $delim);
        }

        // explode() is only given a limit when the caller supplied one.
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }

        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the character at the given character index.
     *
     * @param int $index Zero-based character (not byte) index.
     *
     * @return string
     */
    public function charAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index];
    }

    /**
     * Returns the code point of the character at the given character index.
     *
     * Single-byte characters return their byte value, except bytes
     * 0x80-0x9F which are translated through the Windows-1252 table when an
     * entry exists. Multi-byte characters are decoded as UTF-8; the decoded
     * value is returned as-is, even if it is above U+10FFFF or overlong.
     *
     * @param int $index Zero-based character (not byte) index.
     *
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->loadToArray();

        $char = $this->chars[$index];
        $count = \strlen($char);

        if ($count === 1) {
            return self::winC1ToUnicode(\ord($char));
        }

        $bytes = \array_map('ord', \str_split($char));

        // Payload bits of the lead byte, then 6 bits from each continuation byte.
        $codepoint = $bytes[0] & self::$masks[$count];
        for ($i = 1; $i < $count; $i++) {
            $codepoint = ($codepoint << 6) + ($bytes[$i] & 0b00111111);
        }

        return $codepoint;
    }

    /**
     * Fills $this->chars from the raw byte string, once.
     *
     * A lead byte followed by the number of continuation bytes it announces
     * becomes one multi-byte character; anything else is stored as a
     * single-byte character.
     *
     * @return void
     */
    private function loadToArray()
    {
        if (!empty($this->chars)) {
            return;
        }

        $offset = 0;
        $len = \strlen($this->raw);

        while ($offset < $len) {
            $data = $this->raw[$offset];
            // charLength() works on the numeric byte value, not on the character.
            $bytes = self::charLength(\ord($data));
            // A sequence that would run past the end of the string is invalid.
            $valid = ($offset + $bytes <= $len);

            for ($pos = 1; $pos < $bytes && $valid; $pos++) {
                $byte = $this->raw[$offset + $pos];
                $ord = \ord($byte);

                // Continuation bytes must look like 0b10xxxxxx.
                if ($ord < 0b10000000 || $ord > 0b10111111) {
                    $valid = false;
                    break;
                }
                $data .= $byte;
            }

            if ($bytes === 1 || !$valid) {
                // Plain or malformed byte: emit it alone and resynchronise
                // on the very next byte.
                $this->chars[] = $this->raw[$offset++];
                continue;
            }

            $this->chars[] = $data;
            $offset += $bytes;
        }
    }

    /**
     * Decodes the character that contains the byte at the given byte offset.
     *
     * @param int $offset Byte offset into the raw string.
     *
     * @return array [start byte offset, length in bytes, code point or byte value]
     */
    protected function parseUtf8CharAt($offset)
    {
        list($start, $length, $valid, $current) = $this->findUtf8CharAt($offset);

        if ($length === 1) {
            // $current is unchanged for bytes outside 0x80-0x9F and for the
            // unassigned Windows-1252 slots.
            return [$start, $length, self::winC1ToUnicode($current)];
        }

        $byte = \ord($this->raw[$start]);

        if ($valid === false) {
            if ($length === 2 && $byte === 0b11000000) {
                // overlong ascii
                // NOTE(review): findUtf8CharAt() as written only reports this
                // case when $offset is the continuation byte, so the ternary
                // appears to always yield the lead byte; 0xC1 is not handled
                // here either. Logic preserved - confirm the intent.
                return [$start + 1, 1, ($offset === $start) ? \ord($this->raw[$start + 1]) : $byte];
            }
            return [$offset, 1, $current];
        }

        // Lead-byte payload; anything that is not 3 or 4 bytes long uses the
        // 2-byte mask.
        $mask = isset(self::$masks[$length]) ? self::$masks[$length] : self::$masks[2];
        $bigcode = $byte & $mask;

        for ($next = 1; $next < $length; $next++) {
            $bigcode <<= 6;
            $bigcode += \ord($this->raw[$start + $next]) & 0b00111111;
        }

        if ($bigcode > 0x10FFFF) {
            // Beyond the Unicode range: fall back to the single byte asked for.
            return [$offset, 1, $current];
        }

        return [$start, $length, $bigcode];
    }

    /**
     * Determines if the byte at the given offset is part of a valid UTF8 char,
     * and returns its actual starting offset, length in bytes, validity,
     * and the byte at the original offset.
     *
     * @param int $offset Byte offset into the raw string.
     *
     * @return array [start byte offset, length in bytes, bool valid, int byte at $offset]
     */
    protected function findUtf8CharAt($offset)
    {
        $byte = \ord($this->raw[$offset]);

        if ($byte <= 0b01111111) {
            // ASCII passthru, 1 byte long
            return [$offset, 1, true, $byte];
        }

        if ($byte <= 0b10111111) {
            // either part of a UTF8 char, or an invalid UTF8 codepoint.
            // try to find start of UTF8 char
            $original = $offset;

            while ($offset > 0 && $original - $offset < 4) {
                $prev = \ord($this->raw[--$offset]);

                if ($prev <= 0b01111111) {
                    // prev is plain ASCII so current char can't be valid
                    return [$original, 1, false, $byte];
                }

                if ($prev <= 0b10111111) {
                    // prev is also part of a UTF8 char, so keep looking
                    continue;
                }

                if ($prev === 0xC0 || $prev === 0xC1) {
                    // prev is an invalid UTF8 starter for overlong ASCII
                    return [$offset, 2, false, $byte];
                }

                if ($prev <= 0b11110100) {
                    // prev is valid start byte, validate length to check this char
                    $length = self::charLength($prev);

                    if ($original < $offset + $length) {
                        return [$offset, $length, true, $byte];
                    }
                }

                return [$original, 1, false, $byte];
            }

            // Reached the start of the string or looked back far enough
            // without finding a lead byte.
            return [$original, 1, false, $byte];
        }

        if ($byte <= 0b11110100) {
            // valid UTF8 start byte, find the rest, determine if length is valid
            $actual = $length = self::charLength($byte);

            for ($i = 1; $i < $length; $i++) {
                if ($offset + $i >= $this->length()) {
                    // NOTE(review): a sequence cut off by the end of the string
                    // reports $i - 1 bytes (0 when the lead byte is the last
                    // byte), unlike the $i used below. Preserved as-is.
                    $actual = $i - 1;
                    break;
                }

                $last = \ord($this->raw[$offset + $i]);
                if ($last < 0b10000000 || $last > 0b10111111) {
                    // Not a continuation byte: the sequence stops here.
                    $actual = $i;
                    break;
                }
            }

            if ($actual !== $length) {
                return [$offset, $actual, false, $byte];
            }

            return [$offset, $length, true, $byte];
        }

        // if 245 to 255, Windows-1252 passthru
        return [$offset, 1, false, $byte];
    }

    /**
     * Returns the sequence length announced by a UTF-8 lead byte:
     * 4 for 0b11110xxx, 3 for 0b1110xxxx, 2 for 0b110xxxxx, otherwise 1.
     *
     * @param int $byte Numeric byte value (0-255), not a character.
     *
     * @return int
     */
    protected static function charLength($byte)
    {
        if ($byte >> 3 === 0b00011110) {
            return 4;
        }
        if ($byte >> 4 === 0b00001110) {
            return 3;
        }
        if ($byte >> 5 === 0b00000110) {
            return 2;
        }
        return 1;
    }

    /**
     * Translates a single byte through the Windows-1252 table.
     *
     * The table only has keys in 0x80-0x9F, so every other byte - and the
     * five unassigned slots in that range - is returned unchanged.
     *
     * @param int $byte Numeric byte value.
     *
     * @return int Unicode code point, or $byte when no mapping exists.
     */
    private static function winC1ToUnicode($byte)
    {
        return isset(self::$winc1umap[$byte]) ? self::$winc1umap[$byte] : $byte;
    }
}
302
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.