1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace devtoolboxuk\soteria\voku\Resources; |
4
|
|
|
|
5
|
|
|
class Utf8 extends Resources |
|
|
|
|
6
|
|
|
{ |
7
|
|
|
|
8
|
|
|
private $system; |
9
|
|
|
private $ENCODINGS; |
10
|
|
|
private $_supported = []; |
11
|
|
|
private $BROKEN_UTF8_FIX; |
12
|
|
|
private $ORD; |
13
|
|
|
private $CHR; |
14
|
|
|
private $WIN1252_TO_UTF8; |
15
|
|
|
|
16
|
|
|
private $BOM = [ |
17
|
|
|
"\xef\xbb\xbf" => 3, // UTF-8 BOM |
18
|
|
|
'' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
19
|
|
|
"\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
20
|
|
|
' þÿ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
21
|
|
|
"\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
22
|
|
|
'ÿþ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
23
|
|
|
"\xfe\xff" => 2, // UTF-16 (BE) BOM |
24
|
|
|
'þÿ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
25
|
|
|
"\xff\xfe" => 2, // UTF-16 (LE) BOM |
26
|
|
|
'ÿþ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
27
|
|
|
]; |
28
|
|
|
|
29
|
|
|
private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
30
|
|
|
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
31
|
|
|
8234 => "\xE2\x80\xAA", |
32
|
|
|
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
33
|
|
|
8235 => "\xE2\x80\xAB", |
34
|
|
|
// POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
35
|
|
|
8236 => "\xE2\x80\xAC", |
36
|
|
|
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
37
|
|
|
8237 => "\xE2\x80\xAD", |
38
|
|
|
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
39
|
|
|
8238 => "\xE2\x80\xAE", |
40
|
|
|
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
41
|
|
|
8294 => "\xE2\x81\xA6", |
42
|
|
|
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
43
|
|
|
8295 => "\xE2\x81\xA7", |
44
|
|
|
// FIRST STRONG ISOLATE // (use -> dir = "auto") |
45
|
|
|
8296 => "\xE2\x81\xA8", |
46
|
|
|
// POP DIRECTIONAL ISOLATE |
47
|
|
|
8297 => "\xE2\x81\xA9", |
48
|
|
|
]; |
49
|
|
|
|
50
|
|
|
/** |
51
|
|
|
* @var array |
52
|
|
|
*/ |
53
|
|
|
private $WHITESPACE_TABLE = [ |
54
|
|
|
'SPACE' => "\x20", |
55
|
|
|
'NO-BREAK SPACE' => "\xc2\xa0", |
56
|
|
|
'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
57
|
|
|
'EN QUAD' => "\xe2\x80\x80", |
58
|
|
|
'EM QUAD' => "\xe2\x80\x81", |
59
|
|
|
'EN SPACE' => "\xe2\x80\x82", |
60
|
|
|
'EM SPACE' => "\xe2\x80\x83", |
61
|
|
|
'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
62
|
|
|
'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
63
|
|
|
'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
64
|
|
|
'FIGURE SPACE' => "\xe2\x80\x87", |
65
|
|
|
'PUNCTUATION SPACE' => "\xe2\x80\x88", |
66
|
|
|
'THIN SPACE' => "\xe2\x80\x89", |
67
|
|
|
'HAIR SPACE' => "\xe2\x80\x8a", |
68
|
|
|
'LINE SEPARATOR' => "\xe2\x80\xa8", |
69
|
|
|
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
70
|
|
|
'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
71
|
|
|
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
72
|
|
|
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
73
|
|
|
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
74
|
|
|
]; |
75
|
|
|
|
76
|
6 |
|
/**
 * Builds the helper: creates the System probe object and then runs the
 * extension/feature detection that fills $this->_supported.
 */
public function __construct()
{
    $this->system = new System();
    $this->checkForSupport();
}
81
|
|
|
|
82
|
6 |
|
/**
 * Records which optional PHP extensions/features are available.
 *
 * Each answer comes from the System helper and is stored in
 * $this->_supported. The probe is skipped when the
 * 'already_checked_via_portable_utf8' marker is already set. When mbstring
 * (or the Symfony polyfill) is reported as present, the mbstring internal
 * encoding is switched to UTF-8.
 *
 * @return void
 */
private function checkForSupport()
{
    // Guard clause: nothing to do once the marker has been written.
    if (isset($this->_supported['already_checked_via_portable_utf8'])) {
        return;
    }

    $this->_supported['already_checked_via_portable_utf8'] = true;

    $probe = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $this->_supported['mbstring'] = $probe->mbstring_loaded();
    $this->_supported['mbstring_func_overload'] = $probe->mbstring_overloaded();
    if ($this->_supported['mbstring'] === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->_supported['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->_supported['iconv'] = $probe->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $this->_supported['intl'] = $probe->intl_loaded();

    // The transliterator id list is only fetched when intl is loaded and the
    // function exists; otherwise an empty list is stored.
    $canListTransliterators = $this->_supported['intl'] === true
        && \function_exists('transliterator_list_ids') === true;
    /** @noinspection PhpComposerExtensionStubsInspection */
    $this->_supported['intl__transliterator_list_ids'] = $canListTransliterators
        ? \transliterator_list_ids()
        : [];

    // http://php.net/manual/en/class.intlchar.php
    $this->_supported['intlChar'] = $probe->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->_supported['ctype'] = $probe->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->_supported['finfo'] = $probe->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->_supported['json'] = $probe->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->_supported['pcre_utf8'] = $probe->pcre_utf8_support();

    $this->_supported['symfony_polyfill_used'] = $probe->symfony_polyfill_used();
    if ($this->_supported['symfony_polyfill_used'] === true) {
        \mb_internal_encoding('UTF-8');
        $this->_supported['mbstring_internal_encoding'] = 'UTF-8';
    }
}
136
|
|
|
|
137
|
6 |
|
/**
 * URL-decodes a string (raw variant: "+" is not turned into a space) and
 * additionally decodes HTML entities, "%uXXXX" sequences and "\uXXXX"
 * escapes, repairing simple broken UTF-8 sequences on the way out.
 *
 * @param string $str          The input string.
 * @param bool   $multi_decode When true (default) decoding is repeated until
 *                             the string no longer changes; otherwise exactly
 *                             one decoding pass is applied.
 *
 * @return string
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // Fast path: without any of these markers there is nothing to decode.
    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // Rewrite non-standard "%uXXXX" escapes as hexadecimal HTML entities so
    // the entity decoder below can resolve them.
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        $str = (string)preg_replace($pattern, '&#x\\1;', rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    // One full decoding pass: to UTF-8 -> HTML entities -> percent-encoding
    // -> broken UTF-8 repair.
    $decodePass = function ($value) use ($flags) {
        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        return $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($value), $flags)));
    };

    if ($multi_decode === true) {
        do {
            $str_compare = $str;
            $str = $decodePass($str);
        } while ($str_compare !== $str);
    } else {
        // Previously this case returned the input without decoding anything.
        $str = $decodePass($str);
    }

    return $str;
}
167
|
|
|
|
168
|
6 |
|
/**
 * Replaces known broken UTF-8 byte sequences with their repaired form.
 *
 * The search/replace lists are derived from the 'utf8_fix' data file (array
 * keys are searched for, array values are substituted) and are kept in
 * function-static variables after the first non-empty call.
 *
 * @param string $str
 *
 * @return string
 */
private function fixSimpleUtf8($str)
{
    static $brokenSequences = null;
    static $repairedSequences = null;

    if ($str === '') {
        return '';
    }

    if ($brokenSequences === null) {
        // Load the mapping lazily, once per instance.
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $brokenSequences = array_keys($this->BROKEN_UTF8_FIX);
        $repairedSequences = array_values($this->BROKEN_UTF8_FIX);
    }

    return str_replace($brokenSequences, $repairedSequences, $str);
}
188
|
|
|
|
189
|
|
|
/**
 * Includes a PHP data file from the sibling "Data" directory and returns
 * whatever that file returns.
 *
 * @param string $file Base name of the data file, without the ".php" suffix.
 *
 * @return mixed
 */
private function getData($file)
{
    $path = __DIR__ . '/../Data/' . $file . '.php';

    return include $path;
}
197
|
|
|
|
198
|
|
|
/**
 * Decodes HTML entities (named and numeric) until the string stops changing.
 *
 * @param string   $str      The input string.
 * @param int|null $flags    Flags for html_entity_decode(); null selects
 *                           ENT_QUOTES | ENT_HTML5.
 * @param string   $encoding Target encoding, normalized unless it is already
 *                           'UTF-8' or 'CP850'.
 *
 * @return bool|false|string|string[]|null
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    // Shorter than four bytes (e.g. "&;" or "&x;") or no ampersand at all:
    // there cannot be an entity in here.
    if (!isset($str[3]) || strpos($str, '&') === false) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    $handledWithoutMbstring = $encoding === 'UTF-8'
        || $encoding === 'ISO-8859-1'
        || $encoding === 'WINDOWS-1252';
    if (!$handledWithoutMbstring && $this->_supported['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    do {
        $previous = $str;

        // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
        if ($this->_supported['mbstring'] === true) {
            $convMap = [0x80, 0xfffff, 0, 0xfffff, 0];
            if ($encoding === 'UTF-8') {
                $str = mb_decode_numericentity($str, $convMap);
            } else {
                $str = mb_decode_numericentity($str, $convMap, $encoding);
            }
        } else {
            $str = (string)preg_replace_callback(
                "/&#\d{2,6};/",
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $decoded = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');
                    // Entities that decode to a quote character are kept encoded here.
                    $isQuote = $decoded === '"' || $decoded === "'";

                    return $isQuote ? $matches[0] : $decoded;
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // Append the missing ";" to unterminated numeric entities
                // (decimal and hexadecimal) so they can be decoded as well.
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($previous !== $str);

    return $str;
}
268
|
|
|
|
269
|
|
|
/**
 * Maps an encoding name or alias to the canonical spelling used by this class.
 *
 * @param mixed  $encoding Encoding name; it is cast to string.
 * @param string $fallback Returned for an empty name and for the values
 *                         '0' and '1'.
 *
 * @return mixed|string The canonical name, the fallback, or the upper-cased
 *                      input when no alias matches.
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $resolved = [];

    $encoding = (string)$encoding;

    // '' and '0' are falsy strings; '1' is only a fallback for non
    // "strict_types" usage.
    if ($encoding === '' || $encoding === '0' || $encoding === '1') {
        return $fallback;
    }

    // Names that are answered directly, before the cache and data file.
    $direct = [
        'UTF-8'         => 'UTF-8',
        'UTF8'          => 'UTF-8',
        '8BIT'          => 'CP850',
        'BINARY'        => 'CP850',
        'HTML'          => 'HTML-ENTITIES',
        'HTML-ENTITIES' => 'HTML-ENTITIES',
    ];
    if (isset($direct[$encoding])) {
        return $direct[$encoding];
    }

    if (isset($resolved[$encoding])) {
        return $resolved[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    // A name listed in the 'encodings' data file is accepted as-is.
    if (in_array($encoding, $this->ENCODINGS, true)) {
        $resolved[$encoding] = $encoding;

        return $encoding;
    }

    $requested = $encoding;
    $encoding = strtoupper($encoding);
    // Alias lookup key: upper-cased name without punctuation.
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $encoding);

    $aliases = [
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5',
        'ISO88596'    => 'ISO-8859-6', // Greek
        'ISO88597'    => 'ISO-8859-7',
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    ];

    if (isset($aliases[$aliasKey])) {
        $encoding = $aliases[$aliasKey];
    }

    // Cache under the name exactly as it was requested.
    $resolved[$requested] = $encoding;

    return $encoding;
}
397
|
|
|
|
398
|
6 |
|
/**
 * Converts a string (or every element of an array) to UTF-8.
 *
 * Byte sequences that already look like valid 2-, 3- or 4-byte UTF-8 are
 * copied unchanged; any other byte >= 0x80 is converted individually via
 * toUtf8ConvertHelper(). Afterwards "\uXXXX" escape sequences (including
 * UTF-16 surrogate pairs) are replaced by the characters they denote.
 *
 * @param mixed $str String, array of strings, or a value castable to string.
 *
 * @return string|array
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        // Convert each element recursively; array_map keeps the keys.
        return array_map(
            function ($item) {
                return $this->toUtf8($item);
            },
            $str
        );
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $max = \strlen($str);
    $buf = '';

    for ($i = 0; $i < $max; ++$i) {
        $lead = $str[$i];

        if ($lead < "\xC0") {
            // 0x80-0xBF is a stray continuation byte and needs conversion;
            // everything below 0x80 is plain ASCII and is copied as-is.
            $buf .= (($lead & "\xC0") === "\x80")
                ? $this->toUtf8ConvertHelper($lead)
                : $lead;

            continue;
        }

        // Number of continuation bytes the lead byte announces
        // (0 = not a usable UTF-8 lead byte).
        if ($lead <= "\xDF") {
            $tailLength = 1;
        } elseif ($lead >= "\xE0" && $lead <= "\xEF") {
            $tailLength = 2;
        } elseif ($lead >= "\xF0" && $lead <= "\xF7") {
            $tailLength = 3;
        } else {
            $tailLength = 0;
        }

        $tail = '';
        $looksLikeUtf8 = $tailLength > 0;
        for ($offset = 1; $looksLikeUtf8 && $offset <= $tailLength; ++$offset) {
            // A missing byte at the end of the string counts as "\x00".
            $next = $i + $offset >= $max ? "\x00" : $str[$i + $offset];

            if ($next >= "\x80" && $next <= "\xBF") {
                $tail .= $next;
            } else {
                $looksLikeUtf8 = false;
            }
        }

        if ($looksLikeUtf8) {
            // Almost certainly UTF-8 already: copy the whole sequence.
            $buf .= $lead . $tail;
            $i += $tailLength;
        } else {
            // Not valid UTF-8: convert just the lead byte.
            $buf .= $this->toUtf8ConvertHelper($lead);
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $decoded = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                // Single "\uXXXX" escape.
                $codePoint = (int)hexdec($matches[3]);
            } else {
                // Surrogate pair: http://unicode.org/faq/utf_bom.html#utf16-4
                $codePoint = ((int)hexdec($matches[1]) << 10)
                    + (int)hexdec($matches[2])
                    + 0x10000
                    - (0xD800 << 10)
                    - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($codePoint < 0x80) {
                return (string)$this->chr($codePoint);
            }

            if ($codePoint < 0xA0) {
                /** @noinspection UnnecessaryCastingInspection */
                return (string)$this->chr(0xC0 | ($codePoint >> 6))
                    . (string)$this->chr(0x80 | ($codePoint & 0x3F));
            }

            return $this->decimalToChr($codePoint);
        },
        $buf
    );

    // preg_replace_callback() yields null on a PCRE error.
    return $decoded === null ? '' : $decoded;
}
513
|
|
|
|
514
|
|
|
/**
 * Converts one non-UTF-8 byte into a UTF-8 sequence.
 *
 * A byte listed in the 'win1252_to_utf8' data table is replaced by that
 * table's value; any other byte is expanded into a two-byte sequence
 * (lead byte 0xC0 | high bits, continuation byte 0x80 | low six bits).
 *
 * NOTE(review): presumably the 'ord' table maps a one-byte string to its
 * integer ordinal and the 'chr' table maps an integer to a one-byte
 * string - confirm against the data files.
 *
 * @param string $input A single byte.
 *
 * @return string
 */
private function toUtf8ConvertHelper($input)
{
    // Load the lookup tables on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    // found in Windows-1252 special cases
    if (isset($this->WIN1252_TO_UTF8[$ordC1])) {
        return '' . $this->WIN1252_TO_UTF8[$ordC1];
    }

    // ">> 6" is the integer form of "/ 64". The division produced a float
    // array offset, whose implicit truncation is deprecated since PHP 8.1.
    $cc1 = $this->CHR[$ordC1 >> 6] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
542
|
|
|
|
543
|
1 |
|
/**
 * Returns the character for a Unicode code point.
 *
 * Results are memoised in a function-static cache keyed by code point and
 * encoding. Code points up to 127 come straight from the 'chr' data table;
 * above that IntlChar is used when available, with a manual UTF-8 encoder
 * as fallback.
 *
 * @param int    $code_point Unicode code point.
 * @param string $encoding   Target encoding, normalized unless it is already
 *                           'UTF-8' or 'CP850'.
 *
 * @return string|null
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->_supported['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->_supported['intlChar'] === true) {
        // Fully qualified: an unqualified "IntlChar" would be looked up
        // inside this file's namespace and fail with "class not found".
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // two-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // three-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // four-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
            $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
637
|
|
|
|
638
|
|
|
/**
 * Encodes a string for the given target "encoding".
 *
 * Only the pseudo-encodings 'JSON', 'BASE64' and 'HTML-ENTITIES' are
 * converted. For every other target the input is returned unchanged; a
 * warning is raised when the target is not UTF-8, ISO-8859-1 or
 * WINDOWS-1252 and mbstring is unavailable.
 *
 * @param string $toEncoding Target encoding, normalized unless it is already
 *                           'UTF-8' or 'CP850'.
 * @param string $str        The input string.
 *
 * @return string
 *
 * @throws \InvalidArgumentException When JSON encoding of the input fails.
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            // Fully qualified: the unqualified name would be looked up inside
            // this file's namespace, where no such class is imported.
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->_supported['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', \E_USER_WARNING);
    }

    // No character-set conversion is performed for the remaining targets.
    return $str;
}
736
|
|
|
|
737
|
|
|
/**
 * Runs the value through filter() and JSON-encodes the result.
 *
 * @param mixed $value
 *
 * @return string|false The json_encode() result.
 *
 * @throws \RuntimeException When ext-json is reported as unavailable.
 */
private function jsonEncode($value)
{
    $filtered = $this->filter($value);

    if ($this->_supported['json'] !== false) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        return json_encode($filtered, 0, 512);
    }

    throw new \RuntimeException('ext-json: is not installed');
}
748
|
|
|
|
749
|
|
|
/**
 * Normalizes strings, recursing into arrays and objects.
 *
 * For a string: carriage returns are converted to "\n", non-ASCII text is
 * brought into the requested Unicode normalization form, and a leading
 * combining mark is prefixed with $leading_combining. Values of any other
 * scalar type are returned untouched.
 *
 * Objects are handles, so their properties are updated in place and the
 * caller's object is modified.
 *
 * @param mixed  $var                The value to filter.
 * @param int    $normalization_form A \Normalizer form constant.
 * @param string $leading_combining  Prefix placed before a leading
 *                                   combining mark.
 *
 * @return mixed
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            foreach ($var as $key => $value) {
                // Write back to the object itself. The filtered value used to
                // be stored in an unrelated, undefined array and was lost.
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // Non-empty marker: lets the isset($normalized[0]) test
                    // below pass for input that was already normalized.
                    $normalized = '-';
                } else {
                    $normalized = \Normalizer::normalize($var, $normalization_form);

                    if (isset($normalized[0])) {
                        $var = $normalized;
                    } else {
                        // Normalization produced nothing usable; fall back to encode().
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($normalized[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
804
|
|
|
|
805
|
|
|
/**
 * Converts Windows ("\r\n") and old Mac ("\r") line endings to "\n".
 *
 * @param string $str
 *
 * @return string
 */
private function normalizeLineEnding($str)
{
    // strtr() tries the longest key first, so "\r\n" collapses to one "\n".
    $lineEndings = [
        "\r\n" => "\n",
        "\r"   => "\n",
    ];

    return strtr($str, $lineEndings);
}
809
|
|
|
|
810
|
|
|
/**
 * Tells whether the string contains only bytes from the allowed set:
 * 0x09, 0x0A, 0x0D, 0x10, 0x13 and the printable range 0x20-0x7E.
 *
 * @param string $str
 *
 * @return bool True for the empty string and for strings without any byte
 *              outside that set.
 */
private function isAscii($str)
{
    return $str === '' || !preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);
}
818
|
|
|
|
819
|
|
|
/**
 * Converts characters of a string into numeric HTML entities.
 *
 * @param string $str            The input string.
 * @param bool   $keepAsciiChars When true, code points below 0x80 are left
 *                               as they are.
 * @param string $encoding       Input encoding, normalized unless it is
 *                               already 'UTF-8' or 'CP850'.
 *
 * @return string
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($this->_supported['mbstring'] !== true) {
        // Without mbstring: encode the string piece by piece.
        $encodeOne = function (string $chr) use ($keepAsciiChars, $encoding) {
            return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
        };

        return implode('', \array_map($encodeOne, $this->strSplit($str)));
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $convMap = [$keepAsciiChars === true ? 0x80 : 0x00, 0xfffff, 0, 0xfffff, 0];

    if ($encoding === 'UTF-8') {
        return mb_encode_numericentity($str, $convMap);
    }

    return mb_encode_numericentity($str, $convMap, $encoding);
}
860
|
|
|
|
861
|
|
|
|
862
|
|
|
/**
 * Encodes one character as a decimal numeric HTML entity.
 *
 * @param string $char           The character to encode.
 * @param bool   $keepAsciiChars When strictly true, characters accepted by
 *                               isAscii() are returned unchanged.
 * @param string $encoding       Encoding forwarded to ord().
 *
 * @return string '' for empty input, the untouched character when it is
 *                kept, otherwise "&#<code point>;".
 */
private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($char === '') {
        return '';
    }

    // isAscii() is only consulted when the caller asked to keep ASCII.
    $leaveUntouched = $keepAsciiChars === true && $this->isAscii($char) === true;

    return $leaveUntouched
        ? $char
        : '&#' . $this->ord($char, $encoding) . ';';
}
874
|
|
|
|
875
|
|
|
/**
 * Returns the code point of the first character of a string.
 *
 * Lookup order: a per-process static cache, the 'ord' data table loaded
 * through getData(), IntlChar::ord() when available, and finally a manual
 * decode of up to four leading bytes.
 *
 * @param string $chr      The character (cast to string).
 * @param string $encoding Encoding of $chr; anything other than 'UTF-8' or
 *                         'CP850' is passed through normalize_encoding().
 *
 * @return int The code point, or 0 when no byte could be read.
 */
private function ord($chr, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // The key is built from the character as passed in, before any
    // re-encoding below.
    $cacheKey = $chr . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    // check again, if it's still not UTF-8
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    // Lazily load the lookup table on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        return $CHAR_CACHE[$cacheKey] = $this->ORD[$chr];
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->_supported['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $intlCode = \IntlChar::ord($chr);
        if ($intlCode) {
            return $CHAR_CACHE[$cacheKey] = $intlCode;
        }
    }

    //
    // fallback via vanilla php
    //

    // unpack('C*') yields a 1-based array of unsigned byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $code = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $code = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $code = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte, or a truncated sequence: use the lead byte value
        $code = $lead;
    }

    return $CHAR_CACHE[$cacheKey] = $code;
}
941
|
|
|
|
942
|
|
|
/**
 * Splits a string into an array of characters, or of chunks of $length
 * characters.
 *
 * @param string|array $str    The string to split. An array is processed
 *                             element by element (recursively) and returned
 *                             with the same keys.
 * @param int          $length Number of characters per chunk.
 *
 * @return array List of characters / chunks; [] when $length <= 0 or the
 *               string is empty.
 */
private function strSplit($str, $length = 1)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length);
        }

        return $str;
    }

    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    // One array element per character.
    $ret = $this->strSplitString($str);

    if ($length > 1) {
        // The chunk is taken by value: array_map() hands values to its
        // callback, and a by-reference parameter triggers a "must be passed
        // by reference, value given" warning on PHP 8.
        return array_map(
            static function ($chunk) {
                return implode('', $chunk);
            },
            \array_chunk($ret, $length)
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
984
|
|
|
|
985
|
|
|
/**
 * Splits a string into its individual characters.
 *
 * Strategy, in order of preference:
 *  - 'pcre_utf8' supported: a single preg_match_all('/./us') call;
 *  - 'mbstring' supported: mb_substr() per character for strings of at most
 *    127 characters, the same regex for longer ones;
 *  - otherwise: a manual walk over the bytes that recognises 1- to 4-byte
 *    sequences by their lead-byte bit pattern.
 *
 * In the manual walk, a lead byte whose continuation bytes do not match
 * produces no output element.
 *
 * @param string $str The string to split.
 *
 * @return array List of characters.
 */
private function strSplitString($str)
{
    $hasMbstring = $this->_supported['mbstring'] === true;
    $hasPcreUtf8 = $this->_supported['pcre_utf8'] === true;

    // PCRE with UTF-8 support takes precedence over mbstring.
    if ($hasPcreUtf8) {
        $matches = [];
        preg_match_all('/./us', $str, $matches);

        return isset($matches[0]) ? $matches[0] : [];
    }

    if ($hasMbstring) {
        $charCount = \mb_strlen($str);

        if ($charCount > 127) {
            $matches = [];
            preg_match_all('/./us', $str, $matches);

            return isset($matches[0]) ? $matches[0] : [];
        }

        $chars = [];
        for ($index = 0; $index < $charCount; ++$index) {
            $chars[] = \mb_substr($str, $index, 1);
        }

        return $chars;
    }

    // Plain PHP fallback: classify each lead byte by its high bits.
    $chars = [];
    $byteCount = \strlen($str);
    $pos = 0;

    while ($pos < $byteCount) {
        $lead = $str[$pos];

        if (($lead & "\x80") === "\x00") {
            // 0xxxxxxx: single byte
            $chars[] = $lead;
        } elseif (isset($str[$pos + 1]) && ($lead & "\xE0") === "\xC0") {
            // 110xxxxx 10xxxxxx
            if (($str[$pos + 1] & "\xC0") === "\x80") {
                $chars[] = \substr($str, $pos, 2);
                ++$pos;
            }
        } elseif (isset($str[$pos + 2]) && ($lead & "\xF0") === "\xE0") {
            // 1110xxxx 10xxxxxx 10xxxxxx
            if (
                ($str[$pos + 1] & "\xC0") === "\x80"
                && ($str[$pos + 2] & "\xC0") === "\x80"
            ) {
                $chars[] = \substr($str, $pos, 3);
                $pos += 2;
            }
        } elseif (isset($str[$pos + 3]) && ($lead & "\xF8") === "\xF0") {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (
                ($str[$pos + 1] & "\xC0") === "\x80"
                && ($str[$pos + 2] & "\xC0") === "\x80"
                && ($str[$pos + 3] & "\xC0") === "\x80"
            ) {
                $chars[] = \substr($str, $pos, 4);
                $pos += 3;
            }
        }

        ++$pos;
    }

    return $chars;
}
1072
|
|
|
|
1073
|
|
|
/**
 * Turns a decimal code point into its character by building the numeric
 * entity "&#<int>;" and passing it to htmlEntityDecode().
 *
 * @param int $int Decimal code point.
 *
 * @return string Whatever htmlEntityDecode() returns for the entity.
 */
private function decimalToChr($int)
{
    $numericEntity = '&#' . $int . ';';
    $decodeFlags = \ENT_QUOTES | \ENT_HTML5;

    return $this->htmlEntityDecode($numericEntity, $decodeFlags);
}
1077
|
|
|
|
1078
|
|
|
/**
 * Drops bytes that are not part of a well-formed UTF-8 sequence and then
 * applies the optional normalisation steps selected by the flags.
 *
 * Every flag is compared with `=== true`; the steps run in this order:
 * replace_diamond_question_mark(), remove_invisible_characters(),
 * normalize_whitespace(), normalize_msword(), remove_bom().
 *
 * @param string $str                           The input string.
 * @param bool   $remove_bom                    Run remove_bom().
 * @param bool   $normalize_whitespace          Run normalize_whitespace().
 * @param bool   $normalize_msword              Run normalize_msword().
 * @param bool   $keep_non_breaking_space       Forwarded to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Run replace_diamond_question_mark()
 *                                              with an empty replacement.
 * @param bool   $remove_invisible_characters   Run remove_invisible_characters().
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-formed sequences; groups 2 and 3 match
    // a single stray byte. Replacing with '$1' keeps only group 1.
    $utf8Filter = '/
(
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
){1,100} # ...one or more times
)
| ( [\x80-\xBF] ) # invalid byte in range 10000000 - 10111111
| ( [\xC0-\xFF] ) # invalid byte in range 11000000 - 11111111
/x';
    $cleaned = (string)preg_replace($utf8Filter, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
1118
|
|
|
|
1119
|
6 |
|
/**
 * Replaces the Unicode replacement character (U+FFFD) in a string.
 *
 * When $processInvalidUtf8 is strictly true the string is first converted
 * UTF-8 -> UTF-8 through mb_convert_encoding(), with the mbstring
 * substitute character temporarily set to the replacement; the previous
 * substitute character is restored afterwards.
 *
 * @param string $str                The input string.
 * @param string $replacementChar    Replacement text; '' removes the character.
 * @param bool   $processInvalidUtf8 Whether to run the mbstring conversion.
 *
 * @return string The processed string ('' for empty input).
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        if ($this->_supported['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        // mb_substitute_character() is given 'none' for an empty replacement.
        $substitute = $replacementChar === '' ? 'none' : $replacementChar;

        $previousSubstitute = \mb_substitute_character();
        \mb_substitute_character($substitute);
        // the polyfill maybe return false, so cast to string
        $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        \mb_substitute_character($previousSubstitute);
    }

    $needles = [
        "\xEF\xBF\xBD",
        '�',
    ];
    $replacements = [
        $replacementChar,
        $replacementChar,
    ];

    return str_replace($needles, $replacements, $str);
}
1156
|
|
|
|
1157
|
6 |
|
/**
 * Removes control characters from a string, keeping horizontal tab
 * (dec 09), newline (dec 10) and carriage return (dec 13).
 *
 * The replacement is repeated until a pass changes nothing, so sequences
 * that only appear after an earlier removal are caught as well.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the URL-encoded forms
 *                            (%00-%08, %0B, %0C, %0E, %0F, %10-%1F).
 * @param string $replacement Text inserted in place of each match.
 *
 * @return string The filtered string.
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/', // url encoded 16-31
        ]
        : [];

    // raw bytes 00-08, 11, 12, 14-31, 127
    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

    do {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $replaced);
    } while ($replaced !== 0);

    return $str;
}
1177
|
|
|
|
1178
|
6 |
|
/**
 * Replaces every sequence listed in $this->WHITESPACE_TABLE with a plain
 * space and, unless told otherwise, removes the bidirectional control
 * characters listed in $this->BIDI_UNI_CODE_CONTROLS_TABLE.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    When strictly true, the table's
 *                                        'NO-BREAK SPACE' entry is left alone.
 * @param bool   $keepBidiUnicodeControls When strictly false (the default),
 *                                        bidi control characters are removed.
 *
 * @return string The normalised string ('' for empty input).
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    if ($str === '') {
        return '';
    }

    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    // The cache key uses the same strict test that decides whether the
    // NBSP entry is dropped. Keying on (int)$keepNonBreakingSpace let a
    // truthy non-boolean (e.g. 1) store the full table under key 1, which
    // later calls with `true` would then reuse.
    $keepNbsp = $keepNonBreakingSpace === true;
    $cacheKey = (int)$keepNbsp;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNbsp) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
1209
|
|
|
|
1210
|
|
|
/**
 * Replaces typographic quotes, dashes and the ellipsis (given as UTF-8
 * byte sequences) with plain ASCII equivalents.
 *
 * @param string $str The input string.
 *
 * @return string The string with the listed characters replaced
 *                ('' for empty input).
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 byte sequence => ASCII replacement
    $asciiFor = [
        "\xc2\xab"     => '"', // « (U+00AB)
        "\xc2\xbb"     => '"', // » (U+00BB)
        "\xe2\x80\x98" => "'", // ‘ (U+2018)
        "\xe2\x80\x99" => "'", // ’ (U+2019)
        "\xe2\x80\x9a" => "'", // ‚ (U+201A)
        "\xe2\x80\x9b" => "'", // ‛ (U+201B)
        "\xe2\x80\x9c" => '"', // “ (U+201C)
        "\xe2\x80\x9d" => '"', // ” (U+201D)
        "\xe2\x80\x9e" => '"', // „ (U+201E)
        "\xe2\x80\x9f" => '"', // ‟ (U+201F)
        "\xe2\x80\xb9" => "'", // ‹ (U+2039)
        "\xe2\x80\xba" => "'", // › (U+203A)
        "\xe2\x80\x93" => '-', // – (U+2013)
        "\xe2\x80\x94" => '-', // — (U+2014)
        "\xe2\x80\xa6" => '...', // … (U+2026)
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $str);
}
1254
|
|
|
|
1255
|
6 |
|
/**
 * Strips byte-order marks from the start of a string.
 *
 * Each entry of $this->BOM (marker => number of bytes to cut) is tested
 * once, in table order, against the current start of the string; every
 * matching marker is cut off.
 *
 * @param string $str The input string.
 *
 * @return string The string without the leading marker(s).
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    foreach ($this->BOM as $bomString => $bomByteLength) {
        $bomString = (string)$bomString;

        // An empty marker is never a BOM. Without this guard, PHP 8's
        // strpos() reports an empty needle at offset 0 and bytes would be
        // cut from every input.
        if ($bomString === '') {
            continue;
        }

        if (strpos($str, $bomString, 0) !== 0) {
            continue;
        }

        $remainder = \substr($str, (int)$bomByteLength);
        if ($remainder === false) {
            // Older PHP versions return false when nothing is left.
            return '';
        }

        $str = (string)$remainder;
    }

    return $str;
}
1276
|
|
|
|
1277
|
|
|
} |
1278
|
|
|
|
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, and the start of each new word is marked by a capital letter. Thus the name "database connection string" becomes "databaseConnectionString".