1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace StringObject; |
4
|
|
|
|
5
|
|
|
/**
 * String object that exposes its content as a list of UTF-8 characters.
 *
 * The raw byte string (read from $this->raw, which this class does not declare
 * itself -- presumably inherited from StrObj; verify against the parent) is
 * lazily decoded into $chars. Bytes that do not form a valid UTF-8 sequence
 * are re-read one byte at a time: 0x80-0x9F go through the cp1252 table in
 * $winc1umap, any other high byte is used directly as a code point.
 */
class UStrObj extends StrObj
{
    /**
     * Decoded characters, filled on demand by loadToArray().
     * Each entry is a two-element array: [UTF-8 encoded character, code point].
     *
     * @var array
     */
    protected $chars = [];

    /**
     * Not referenced anywhere in this class.
     * NOTE(review): purpose cannot be determined from this file.
     */
    protected $uhandler;

    /**
     * Per-length decoding parameters, keyed by the sequence length in bytes.
     *
     * 'datamask'  selects the payload bits of the lead byte.
     * 'threshold' is the lowest code point accepted for that length; a decoded
     *             value below it is treated as an overlong encoding.
     *
     * @var array
     */
    protected static $spec = [
        2 => ['datamask' => 0b00011111, 'threshold' => 0x80],
        3 => ['datamask' => 0b00001111, 'threshold' => 0x800],
        4 => ['datamask' => 0b00000111, 'threshold' => 0x10000],
        5 => ['datamask' => 0b00000011, 'threshold' => 0x200000],
        6 => ['datamask' => 0b00000001, 'threshold' => 0x4000000],
    ];

    /**
     * Maps the bytes 0x80-0x9F (interpreted as cp1252) to Unicode code points.
     * Positions marked "invalid" map to U+FFFD, the replacement character.
     *
     * @var array
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Passes the given value straight through to the parent constructor.
     *
     * @param mixed $thing value forwarded unchanged to StrObj::__construct()
     */
    public function __construct($thing)
    {
        parent::__construct($thing);
    }

    /**
     * Returns the string as an array.
     *
     * - Empty $delim: the decoded character list ([char, code point] pairs).
     * - Integer $delim: the raw string cut into pieces of $delim bytes.
     * - Otherwise: the raw string exploded on $delim, optionally limited.
     *
     * NOTE(review): the integer and string branches operate on the raw bytes,
     * not on decoded characters, so an integer $delim can cut a multibyte
     * character in half. Also, empty() treats the delimiter '0' as "no
     * delimiter". Confirm whether either is intended before changing it.
     *
     * @param string|int $delim delimiter string, chunk size, or empty value
     * @param int|null   $limit passed to explode() when not null
     *
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        $this->loadToArray();

        if (empty($delim)) {
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the UTF-8 encoded character at the given character index.
     *
     * No bounds check is performed; an index outside the decoded list reads an
     * undefined array offset.
     *
     * @param int $index zero-based position in the decoded character list
     *
     * @return string
     */
    public function charAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index][0];
    }

    /**
     * Returns the code point of the character at the given character index.
     *
     * No bounds check is performed; an index outside the decoded list reads an
     * undefined array offset.
     *
     * @param int $index zero-based position in the decoded character list
     *
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index][1];
    }

    /**
     * Decodes $this->raw into $this->chars, once.
     *
     * The raw string is walked byte by byte. A lead byte opens a multibyte
     * sequence; if a following byte is not a continuation byte, or the decoded
     * value is overlong, a surrogate or above U+10FFFF, the walk rewinds to the
     * lead byte and re-reads it as a single legacy byte (cp1252 for 0x80-0x9F,
     * otherwise the byte value itself is the code point).
     *
     * A decoded U+FEFF is dropped when it starts at offset 0 and stored as
     * U+2060 (word joiner) anywhere else.
     *
     * @return void
     */
    private function loadToArray()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false;   // true while consuming continuation bytes
        $invalid = false;  // true right after a rewind: next lead byte is read as a legacy byte
        $cache = '';       // bytes of the sequence collected so far
        $ordcache = 0;     // code point assembled so far
        $originOffset = 0; // offset of the current sequence's lead byte
        $bytes = 0;        // expected length of the current sequence

        for ($offset = 0; $offset < $len; $offset++) {
            // Square brackets: the {} string-offset syntax no longer parses on PHP 8.
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // plausible UTF-8 multibyte start with enough bytes left
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['datamask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < 0x80) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // high byte outside a valid sequence: map 0x80-0x9F from
                    // cp1252, otherwise take the byte value as the code point
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. this *should be* a continuation byte
            if (($ord & 0b11000000) !== 0b10000000) {
                // it is not, so the whole sequence is invalid:
                // rewind and force the lead byte to be read as a legacy byte
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1;
                continue;
            }

            // shift this byte's six payload bits into their position
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // last byte of the sequence consumed; validate the result
                $inside = false;

                $overlong = ($ordcache < self::$spec[$bytes]['threshold']);
                // Explicit range check. The previous form
                // `$ordcache & 0xFFFFF800 === 0xD800` parsed as
                // `$ordcache & (0xFFFFF800 === 0xD800)` and was always 0.
                $surrogate = ($ordcache >= 0xD800 && $ordcache <= 0xDFFF);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
                // then clear out the temp vars for the next sequence
                $invalid = false;
                $cache = '';
                $ordcache = 0;
            }
        }
    }

    /**
     * Encodes a code point as a UTF-8 byte string.
     *
     * Code points above U+10FFFF are encoded as U+FFFD (replacement
     * character). Surrogates are not filtered here.
     *
     * @param int $cpt Unicode code point
     *
     * @return string one to four bytes
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < 0x80) {
            return \chr($cpt);
        }

        if ($cpt < 0x800) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt < 0x10000) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt <= 0x10FFFF) {
            // The lead byte carries bits 18-20 of the code point; it used to be
            // hard-coded to 0xF4, which is only right for U+100000 and above.
            $data = [
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = [0xEF, 0xBF, 0xBD]; // U+FFFD
        }

        return \implode('', \array_map('chr', $data));
    }

    /**
     * Returns the sequence length announced by a lead byte.
     *
     * Checks the high-bit prefix of the byte: 110xxxxx gives 2, 1110xxxx gives
     * 3, and so on up to 1111110x giving 6. Every other byte (ASCII,
     * continuation bytes, 0xFE, 0xFF) yields 1.
     *
     * @param integer $byte byte value, 0-255
     *
     * @return int length in bytes, 1-6
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }
}
231
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.