1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace devtoolboxuk\soteria\voku\Resources; |
4
|
|
|
|
5
|
|
|
class Utf8 extends Resources |
|
|
|
|
6
|
|
|
{ |
7
|
|
|
|
8
|
|
|
private $system; |
9
|
|
|
private $ENCODINGS; |
10
|
|
|
private $SUPPORT = []; |
11
|
|
|
private $BROKEN_UTF8_FIX; |
12
|
|
|
private $ORD; |
13
|
|
|
private $CHR; |
14
|
|
|
private $WIN1252_TO_UTF8; |
15
|
|
|
private $BOM = [ |
16
|
|
|
"\xef\xbb\xbf" => 3, // UTF-8 BOM |
17
|
|
|
'' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
18
|
|
|
"\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
19
|
|
|
' þÿ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
20
|
|
|
"\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
21
|
|
|
'ÿþ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
22
|
|
|
"\xfe\xff" => 2, // UTF-16 (BE) BOM |
23
|
|
|
'þÿ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
24
|
|
|
"\xff\xfe" => 2, // UTF-16 (LE) BOM |
25
|
|
|
'ÿþ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
26
|
|
|
]; |
27
|
|
|
|
28
|
|
|
private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
29
|
|
|
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
30
|
|
|
8234 => "\xE2\x80\xAA", |
31
|
|
|
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
32
|
|
|
8235 => "\xE2\x80\xAB", |
33
|
|
|
// POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
34
|
|
|
8236 => "\xE2\x80\xAC", |
35
|
|
|
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
36
|
|
|
8237 => "\xE2\x80\xAD", |
37
|
|
|
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
38
|
|
|
8238 => "\xE2\x80\xAE", |
39
|
|
|
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
40
|
|
|
8294 => "\xE2\x81\xA6", |
41
|
|
|
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
42
|
|
|
8295 => "\xE2\x81\xA7", |
43
|
|
|
// FIRST STRONG ISOLATE // (use -> dir = "auto") |
44
|
|
|
8296 => "\xE2\x81\xA8", |
45
|
|
|
// POP DIRECTIONAL ISOLATE |
46
|
|
|
8297 => "\xE2\x81\xA9", |
47
|
|
|
]; |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* @var array |
51
|
|
|
*/ |
52
|
|
|
private $WHITESPACE_TABLE = [ |
53
|
|
|
'SPACE' => "\x20", |
54
|
|
|
'NO-BREAK SPACE' => "\xc2\xa0", |
55
|
|
|
'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
56
|
|
|
'EN QUAD' => "\xe2\x80\x80", |
57
|
|
|
'EM QUAD' => "\xe2\x80\x81", |
58
|
|
|
'EN SPACE' => "\xe2\x80\x82", |
59
|
|
|
'EM SPACE' => "\xe2\x80\x83", |
60
|
|
|
'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
61
|
|
|
'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
62
|
|
|
'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
63
|
|
|
'FIGURE SPACE' => "\xe2\x80\x87", |
64
|
|
|
'PUNCTUATION SPACE' => "\xe2\x80\x88", |
65
|
|
|
'THIN SPACE' => "\xe2\x80\x89", |
66
|
|
|
'HAIR SPACE' => "\xe2\x80\x8a", |
67
|
|
|
'LINE SEPARATOR' => "\xe2\x80\xa8", |
68
|
|
|
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
69
|
|
|
'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
70
|
|
|
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
71
|
|
|
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
72
|
|
|
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
73
|
|
|
]; |
74
|
|
|
|
75
|
6 |
|
/**
 * Creates the helper and records which PHP extensions are available.
 *
 * Instantiates the System probe and immediately runs checkForSupport(),
 * so $this->SUPPORT is populated before any other method is used.
 */
public function __construct()
{
    $this->system = new System();
    $this->checkForSupport();
}
80
|
|
|
|
81
|
6 |
|
/**
 * Probes the runtime once and caches the result in $this->SUPPORT.
 *
 * Every flag is obtained from the System helper. When mbstring (or the
 * symfony polyfill) is reported as available, the mbstring internal encoding
 * is switched to UTF-8. Repeated calls on the same instance do nothing.
 *
 * @return void
 */
private function checkForSupport()
{
    // Already probed for this instance: nothing to do.
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }
    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    $probe = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $this->SUPPORT['mbstring'] = $probe->mbstring_loaded();
    $this->SUPPORT['mbstring_func_overload'] = $probe->mbstring_overloaded();
    if ($this->SUPPORT['mbstring'] === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $probe->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $this->SUPPORT['intl'] = $probe->intl_loaded();
    $this->SUPPORT['intl__transliterator_list_ids'] = [];
    if ($this->SUPPORT['intl'] === true) {
        if (\function_exists('transliterator_list_ids') === true) {
            /** @noinspection PhpComposerExtensionStubsInspection */
            $this->SUPPORT['intl__transliterator_list_ids'] = \transliterator_list_ids();
        }
    }

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $probe->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $probe->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $probe->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $probe->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $probe->pcre_utf8_support();

    $this->SUPPORT['symfony_polyfill_used'] = $probe->symfony_polyfill_used();
    if ($this->SUPPORT['symfony_polyfill_used'] === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
135
|
|
|
|
136
|
6 |
|
/**
 * Decodes percent-encoded sequences, "%uXXXX" escapes and HTML entities in a
 * string, then repairs known broken-UTF-8 byte sequences.
 *
 * @param string $str          The string to decode.
 * @param bool   $multi_decode When true, keep decoding until the string no
 *                             longer changes (handles nested encodings);
 *                             otherwise perform exactly one decoding pass.
 *
 * @return string The decoded string.
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // Fast path: none of the characters that can introduce an encoded sequence is present.
    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // Rewrite non-standard "%uXXXX" escapes as hexadecimal HTML entities so the
    // entity decoder below can resolve them.
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        $str = (string)preg_replace($pattern, '&#x\\1;', rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    // Always decode at least once; previously a false $multi_decode skipped
    // decoding altogether and returned the input still encoded.
    do {
        $str_compare = $str;

        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        $str = $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags)));
    } while ($multi_decode === true && $str_compare !== $str);

    return $str;
}
166
|
|
|
|
167
|
6 |
|
/**
 * Replaces known broken UTF-8 sequences with their repaired counterparts.
 *
 * The search/replace pairs come from the "utf8_fix" data file; they are split
 * into keys and values once and then reused for every later call.
 *
 * @param string $str
 *
 * @return string
 */
private function fixSimpleUtf8($str)
{
    // [0 => search strings, 1 => replacement strings], built on first use.
    static $replacementTable = null;

    if ($str === '') {
        return '';
    }

    if ($replacementTable === null) {
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $replacementTable = [
            array_keys($this->BROKEN_UTF8_FIX),
            array_values($this->BROKEN_UTF8_FIX),
        ];
    }

    list($search, $replace) = $replacementTable;

    return str_replace($search, $replace, $str);
}
187
|
|
|
|
188
|
2 |
|
/**
 * Loads a data table from the sibling "Data" directory.
 *
 * @param string $file File name without the ".php" extension.
 *
 * @return mixed Whatever the included file returns.
 */
private function getData($file)
{
    $path = __DIR__ . '/../Data/' . $file . '.php';

    return include $path;
}
193
|
|
|
|
194
|
6 |
|
/**
 * Decodes HTML entities (named and numeric) until the string stops changing.
 *
 * @param string   $str      The string to decode.
 * @param int|null $flags    html_entity_decode() flags; defaults to ENT_QUOTES | ENT_HTML5.
 * @param string   $encoding Target encoding; anything other than "UTF-8"/"CP850"
 *                           is normalised first.
 *
 * @return string
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    if (
        !isset($str[3]) // examples: &; || &x;
        ||
        strpos($str, '&') === false // no "&"
    ) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // Conversion map: [start code, end code, offset, mask]. It must contain a
    // multiple of four integers; PHP 8 throws a ValueError otherwise.
    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $convmap = [0x80, 0xfffff, 0, 0xfffff];

    do {
        $str_compare = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            if ($encoding === 'UTF-8') {
                $str = mb_decode_numericentity($str, $convmap);
            } else {
                $str = mb_decode_numericentity($str, $convmap, $encoding);
            }
        } else {
            $str = (string)preg_replace_callback(
                "/&#\d{2,6};/",
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $returnTmp = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');
                    // Keep quote characters encoded; everything else is decoded.
                    if ($returnTmp !== '"' && $returnTmp !== "'") {
                        return $returnTmp;
                    }

                    return $matches[0];
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // decode also numeric & UTF16 two byte entities
                // (append the missing ";" so html_entity_decode() recognises them)
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($str_compare !== $str);

    return $str;
}
258
|
|
|
|
259
|
|
|
/**
 * Maps an encoding label onto the canonical spelling used by this class.
 *
 * Results for labels that need a lookup are memoised in a static cache.
 *
 * @param mixed  $encoding Encoding label; it is cast to string.
 * @param string $fallback Returned for empty input and for the labels "1" / "0".
 *
 * @return string The canonical name, or the upper-cased input when no alias is known.
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $STATIC_NORMALIZE_ENCODING_CACHE = [];

    $encoding = (string)$encoding;

    if (!$encoding) {
        return $fallback;
    }

    if (in_array($encoding, ['UTF-8', 'UTF8'], true)) {
        return 'UTF-8';
    }

    if (in_array($encoding, ['8BIT', 'BINARY'], true)) {
        return 'CP850';
    }

    if (in_array($encoding, ['HTML', 'HTML-ENTITIES'], true)) {
        return 'HTML-ENTITIES';
    }

    // only a fallback, for non "strict_types" usage ...
    if (in_array($encoding, ['1', '0'], true)) {
        return $fallback;
    }

    if (isset($STATIC_NORMALIZE_ENCODING_CACHE[$encoding])) {
        return $STATIC_NORMALIZE_ENCODING_CACHE[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    // Already a known canonical name: remember it and hand it back untouched.
    if (in_array($encoding, $this->ENCODINGS, true)) {
        return $STATIC_NORMALIZE_ENCODING_CACHE[$encoding] = $encoding;
    }

    $originalLabel = $encoding;
    $encoding = strtoupper($encoding);
    // Lookup key: the upper-cased label without punctuation such as "-" or "_".
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $encoding);

    // ISO-8859 family and its "LATINn" aliases.
    $aliases = [
        'ISO8859'   => 'ISO-8859-1',
        'ISO88591'  => 'ISO-8859-1',
        'ISO'       => 'ISO-8859-1',
        'LATIN'     => 'ISO-8859-1',
        'LATIN1'    => 'ISO-8859-1',
        'ISO88592'  => 'ISO-8859-2',
        'LATIN2'    => 'ISO-8859-2',
        'ISO88593'  => 'ISO-8859-3',
        'LATIN3'    => 'ISO-8859-3',
        'ISO88594'  => 'ISO-8859-4',
        'LATIN4'    => 'ISO-8859-4',
        'ISO88595'  => 'ISO-8859-5',
        'ISO88596'  => 'ISO-8859-6',
        'ISO88597'  => 'ISO-8859-7',
        'ISO88598'  => 'ISO-8859-8',
        'ISO88599'  => 'ISO-8859-9',
        'LATIN5'    => 'ISO-8859-9',
        'ISO885911' => 'ISO-8859-11',
        'TIS620'    => 'ISO-8859-11',
        'ISO885910' => 'ISO-8859-10',
        'LATIN6'    => 'ISO-8859-10',
        'ISO885913' => 'ISO-8859-13',
        'LATIN7'    => 'ISO-8859-13',
        'ISO885914' => 'ISO-8859-14',
        'LATIN8'    => 'ISO-8859-14',
        'ISO885915' => 'ISO-8859-15',
        'LATIN9'    => 'ISO-8859-15',
        'ISO885916' => 'ISO-8859-16',
        'LATIN10'   => 'ISO-8859-16',
    ];

    // Windows code pages 1250-1258, each reachable as CPnnnn, WINnnnn and WINDOWSnnnn.
    for ($codePage = 1250; $codePage <= 1258; ++$codePage) {
        $canonical = 'WINDOWS-' . $codePage;
        $aliases['CP' . $codePage] = $canonical;
        $aliases['WIN' . $codePage] = $canonical;
        $aliases['WINDOWS' . $codePage] = $canonical;
    }

    // Unicode transformation formats and binary labels.
    $aliases['UTF16'] = 'UTF-16';
    $aliases['UTF32'] = 'UTF-32';
    $aliases['UTF8'] = 'UTF-8';
    $aliases['UTF'] = 'UTF-8';
    $aliases['UTF7'] = 'UTF-7';
    $aliases['8BIT'] = 'CP850';
    $aliases['BINARY'] = 'CP850';

    if (isset($aliases[$aliasKey])) {
        $encoding = $aliases[$aliasKey];
    }

    return $STATIC_NORMALIZE_ENCODING_CACHE[$originalLabel] = $encoding;
}
382
|
|
|
|
383
|
6 |
|
/**
 * Converts a string (or every element of an array) to UTF-8.
 *
 * Byte sequences that already look like well-formed 2-, 3- or 4-byte UTF-8 are
 * copied unchanged; any other byte >= 0x80 is converted through
 * toUtf8ConvertHelper(). Afterwards "\uXXXX" escape sequences, including
 * surrogate pairs, are replaced by the characters they denote.
 *
 * @param mixed $str A string, or an array that is converted recursively.
 *
 * @return string|array
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        foreach ($str as $index => $item) {
            $str[$index] = $this->toUtf8($item);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $length = \strlen($str);
    $out = '';

    for ($pos = 0; $pos < $length; ++$pos) {
        $lead = $str[$pos];

        if ($lead < "\xC0") {
            // ASCII is copied as-is; a stray continuation byte (10xxxxxx) needs conversion.
            $out .= (($lead & "\xC0") === "\x80") ? $this->toUtf8ConvertHelper($lead) : $lead;

            continue;
        }

        // Number of continuation bytes the lead byte announces (0 = not a valid lead byte).
        if ($lead <= "\xDF") {
            $tail = 1;
        } elseif ($lead <= "\xEF") {
            $tail = 2;
        } elseif ($lead <= "\xF7") {
            $tail = 3;
        } else {
            $tail = 0;
        }

        // The sequence is accepted only when every announced continuation byte
        // exists and lies in 0x80-0xBF.
        $sequence = $lead;
        $wellFormed = $tail > 0 && $pos + $tail < $length;
        for ($offset = 1; $wellFormed && $offset <= $tail; ++$offset) {
            $next = $str[$pos + $offset];
            $wellFormed = $next >= "\x80" && $next <= "\xBF";
            $sequence .= $next;
        }

        if ($wellFormed) {
            // yeah, almost sure it's UTF8 already
            $out .= $sequence;
            $pos += $tail;
        } else {
            // not valid UTF8 - convert only the lead byte
            $out .= $this->toUtf8ConvertHelper($lead);
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $decoded = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $hit
         *
         * @return string
         */
        function (array $hit) {
            if (isset($hit[3])) {
                // single "\uXXXX" escape
                $codePoint = (int)hexdec($hit[3]);
            } else {
                // surrogate pair, see http://unicode.org/faq/utf_bom.html#utf16-4
                $high = (int)hexdec($hit[1]);
                $low = (int)hexdec($hit[2]);
                $codePoint = ($high << 10) + $low + 0x10000 - (0xD800 << 10) - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($codePoint < 0x80) {
                return (string)$this->chr($codePoint);
            }

            if ($codePoint < 0xA0) {
                // NOTE(review): chr() is given the two UTF-8 byte values here; since
                // chr() encodes values above 127 as code points, this may not yield
                // the raw two-byte sequence - confirm the intended output.
                /** @noinspection UnnecessaryCastingInspection */
                return (string)$this->chr(0xC0 | ($codePoint >> 6)) . (string)$this->chr(0x80 | ($codePoint & 0x3F));
            }

            return $this->decimalToChr($codePoint);
        },
        $out
    );

    // preg_replace_callback() yields null on failure.
    if ($decoded === null) {
        return '';
    }

    return $decoded;
}
498
|
|
|
|
499
|
|
|
/**
 * Converts a single non-UTF-8 byte into a UTF-8 sequence.
 *
 * Bytes listed in the "win1252_to_utf8" table use the mapped value; every
 * other byte is expanded into the generic two-byte form 110xxxxx 10xxxxxx.
 *
 * @param string $input A single byte.
 *
 * @return string
 */
private function toUtf8ConvertHelper($input)
{
    // Lazily load the lookup tables on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    // found in Windows-1252 special cases
    if (isset($this->WIN1252_TO_UTF8[$ordC1])) {
        return '' . $this->WIN1252_TO_UTF8[$ordC1];
    }

    // Cast explicitly: using the float result of the division as an array key
    // is an implicit float-to-int conversion, which PHP 8.1+ reports as deprecated.
    $cc1 = $this->CHR[(int)($ordC1 / 64)] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
527
|
|
|
|
528
|
1 |
|
/**
 * Returns the character for a code point, passed through encode() when a
 * non-UTF-8 encoding is requested.
 *
 * Results are memoised per code point / encoding pair.
 *
 * @param int    $code_point The code point.
 * @param string $encoding   Target encoding; anything other than "UTF-8"/"CP850"
 *                           is normalised first.
 *
 * @return string|null The character. The IntlChar branch returns whatever
 *                     \IntlChar::chr() yields, which may be null for an invalid code point.
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    // init
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        // Fully qualified: an unqualified "IntlChar" would be resolved inside
        // this file's namespace and fail with "class not found".
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // two-byte sequence: 110xxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
            $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
622
|
|
|
|
623
|
|
|
/**
 * Encodes a string for one of the special targets "JSON", "BASE64" or
 * "HTML-ENTITIES".
 *
 * For every other target encoding the string is returned unchanged: this
 * method performs no character-set conversion. Without mbstring a warning is
 * raised for targets other than UTF-8, ISO-8859-1 and WINDOWS-1252.
 *
 * @param string $toEncoding Target encoding; anything other than "UTF-8"/"CP850"
 *                           is normalised first.
 * @param string $str        The string to encode.
 *
 * @throws \InvalidArgumentException When the string cannot be JSON-encoded.
 *
 * @return string
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            // Fully qualified: an unqualified name would be resolved inside this
            // file's namespace and fail with "class not found" instead of throwing.
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', \E_USER_WARNING);
    }

    // No mb_convert_encoding()/iconv() conversion is performed here;
    // the input is passed through as-is.
    return $str;
}
721
|
|
|
|
722
|
|
|
/**
 * JSON-encodes a value after running it through filter().
 *
 * @param mixed $value
 *
 * @throws \RuntimeException When the JSON extension is reported as unavailable.
 *
 * @return string|false The JSON string, or false when json_encode() fails.
 */
private function jsonEncode($value)
{
    $filtered = $this->filter($value);

    $jsonAvailable = $this->SUPPORT['json'] !== false;
    if (!$jsonAvailable) {
        throw new \RuntimeException('ext-json: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    return json_encode($filtered, 0, 512);
}
733
|
|
|
|
734
|
|
|
/**
 * Normalises strings, recursing into arrays and objects.
 *
 * For strings: line endings are unified to "\n", non-ASCII text is brought
 * into the requested Unicode normalization form, and a leading combining mark
 * is prefixed with $leading_combining. Values of any other type are returned
 * untouched.
 *
 * @param mixed  $var                The value to filter. Objects are modified in place.
 * @param int    $normalization_form One of the \Normalizer::* form constants.
 * @param string $leading_combining  Prefix inserted before a leading combining mark.
 *
 * @return mixed The filtered value.
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            // Write each filtered value back onto the object. Previously the result
            // was stored in an unrelated local variable and silently discarded.
            foreach ($var as $key => $value) {
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // Non-empty marker so the leading-combining check below still runs.
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // Normalization produced nothing usable; fall back to encode().
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
789
|
|
|
|
790
|
|
|
/**
 * Converts Windows ("\r\n") and bare carriage-return ("\r") line endings to "\n".
 *
 * @param string $str
 *
 * @return string
 */
private function normalizeLineEnding($str)
{
    // "\r\n" is listed first so a pair collapses into a single "\n".
    $foreignLineEndings = ["\r\n", "\r"];
    $normalized = str_replace($foreignLineEndings, "\n", $str);

    return $normalized;
}
794
|
|
|
|
795
|
|
|
/**
 * Tells whether a string contains only the bytes accepted as "ASCII" here:
 * 0x20-0x7E plus 0x09, 0x0A, 0x0D, 0x10 and 0x13.
 *
 * NOTE(review): 0x10 and 0x13 look like decimal 10/13 written as hex by
 * mistake - confirm before changing, since callers see the current behaviour.
 *
 * @param string $str
 *
 * @return bool True for the empty string and for strings without any other byte.
 */
private function isAscii($str)
{
    if ($str === '') {
        return true;
    }

    $foundForeignByte = preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);

    return !$foundForeignByte;
}
803
|
|
|
|
804
|
|
|
/**
 * Converts the characters of a string into numeric HTML entities.
 *
 * @param string $str            The string to encode.
 * @param bool   $keepAsciiChars When true, ASCII characters are left as they are.
 * @param string $encoding       Encoding of $str; anything other than
 *                               "UTF-8"/"CP850" is normalised first.
 *
 * @return string
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if ($this->SUPPORT['mbstring'] === true) {
        // With $keepAsciiChars only code points from 0x80 upwards are converted.
        $startCode = $keepAsciiChars === true ? 0x80 : 0x00;

        // Conversion map: [start code, end code, offset, mask]. It must contain a
        // multiple of four integers; PHP 8 throws a ValueError otherwise.
        $convmap = [$startCode, 0xfffff, 0, 0xfffff];

        if ($encoding === 'UTF-8') {
            return mb_encode_numericentity($str, $convmap);
        }

        return mb_encode_numericentity($str, $convmap, $encoding);
    }

    // Fallback without mbstring: encode character by character.
    return implode(
        '',
        \array_map(
            function (string $chr) use ($keepAsciiChars, $encoding) {
                return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
            },
            $this->strSplit($str)
        )
    );
}
845
|
|
|
|
846
|
|
|
/**
 * Converts one character into its decimal HTML entity ("&#NNN;").
 *
 * @param string $char           A single character.
 * @param bool   $keepAsciiChars When true, an ASCII character is returned unchanged.
 * @param string $encoding       Encoding handed on to ord().
 *
 * @return string
 */
private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($char === '') {
        return '';
    }

    // isAscii() is only consulted when ASCII characters are to be kept.
    $leaveUntouched = $keepAsciiChars === true && $this->isAscii($char) === true;

    return $leaveUntouched
        ? $char
        : '&#' . $this->ord($char, $encoding) . ';';
}
858
|
|
|
|
859
|
|
|
/**
 * Determine the numeric code of a character.
 *
 * Lookup order: per-request static cache, the table loaded via getData('ord'),
 * IntlChar::ord() (when available), and finally a manual decode of the leading
 * UTF-8 byte sequence.
 *
 * @param string $chr      The character; cast to string on entry.
 * @param string $encoding Encoding of $chr. Anything other than 'UTF-8' / 'CP850' is first
 *                         passed through normalize_encoding() with 'UTF-8' as fallback.
 *
 * @return int The character code; 0 when no byte could be unpacked.
 */
private function ord($chr, $encoding = 'UTF-8')
{
    // Results are memoised for the lifetime of the process, keyed by character + encoding.
    static $resolved = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    $memoKey = $chr . $encoding;
    if (isset($resolved[$memoKey]) === true) {
        return $resolved[$memoKey];
    }

    // check again, if it's still not UTF-8
    // NOTE(review): encode() is defined outside this view; presumably it converts $chr
    // so that the lookups below operate on UTF-8 bytes -- confirm.
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    // Lazily load the lookup table on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        $resolved[$memoKey] = $this->ORD[$chr];

        return $resolved[$memoKey];
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $intlCode = \IntlChar::ord($chr);
        // A falsy result (null or 0) falls through to the manual decoder below.
        if ($intlCode) {
            $resolved[$memoKey] = $intlCode;

            return $resolved[$memoKey];
        }
    }

    //
    // fallback via vanilla php
    //

    // Only the first four bytes matter; unpack('C*') yields a 1-based list of byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $value = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $value = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $value = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte (or nothing to unpack)
        $value = $lead;
    }

    $resolved[$memoKey] = $value;

    return $value;
}
925
|
|
|
|
926
|
|
|
/**
 * Split a string into an array of characters (or chunks of characters).
 *
 * @param string|array $str                The input. Arrays are processed element by element
 *                                         (recursively) and returned with the same keys.
 * @param int          $length             Number of characters per chunk; values <= 0 yield [].
 * @param bool         $cleanUtf8          When strictly true, the string is passed through clean() first.
 * @param bool         $tryToUseMbFunction When strictly true and mbstring is available, the
 *                                         mbstring / PCRE based path is used.
 *
 * @return array List of characters, or of $length-character chunks when $length > 1.
 */
private function strSplit($str, $length = 1, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length, $cleanUtf8, $tryToUseMbFunction);
        }

        return $str;
    }

    // init
    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    if ($cleanUtf8 === true) {
        $str = $this->clean($str);
    }

    if ($tryToUseMbFunction === true && $this->SUPPORT['mbstring'] === true) {
        $iMax = \mb_strlen($str);
        if ($iMax <= 127) {
            // Short strings: one mb_substr() call per character.
            $ret = [];
            for ($i = 0; $i < $iMax; ++$i) {
                $ret[] = \mb_substr($str, $i, 1);
            }
        } else {
            // Longer strings: a single UTF-8 aware regex pass.
            $retArray = [];
            preg_match_all('/./us', $str, $retArray);
            $ret = isset($retArray[0]) ? $retArray[0] : [];
        }
    } elseif ($this->SUPPORT['pcre_utf8'] === true) {
        $retArray = [];
        preg_match_all('/./us', $str, $retArray);
        $ret = isset($retArray[0]) ? $retArray[0] : [];
    } else {
        // fallback: walk the bytes and group them by the UTF-8 lead-byte pattern;
        // a lead byte whose continuation bytes do not match is skipped.

        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            } elseif (
                isset($str[$i + 1])
                &&
                ($str[$i] & "\xE0") === "\xC0"
            ) {
                // 110xxxxx 10xxxxxx
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (
                isset($str[$i + 2])
                &&
                ($str[$i] & "\xF0") === "\xE0"
            ) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (
                isset($str[$i + 3])
                &&
                ($str[$i] & "\xF8") === "\xF0"
            ) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        $ret = \array_chunk($ret, $length);

        // The callback only reads its argument, so it takes it by value: array_map() hands
        // over plain values, and a by-reference parameter is reported as a warning on PHP 8.
        return array_map(
            static function ($item) {
                return implode('', $item);
            },
            $ret
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
1039
|
|
|
|
1040
|
|
|
/**
 * Strip bytes that do not form a UTF-8 byte sequence and optionally apply further clean-ups.
 *
 * The steps run in a fixed order: byte filtering, replacement-character removal,
 * invisible-character removal, whitespace normalisation, MS-Word punctuation
 * normalisation and finally BOM removal. Each optional step only runs when its
 * flag is strictly true.
 *
 * @param string $str                           The input string.
 * @param bool   $remove_bom                    Run remove_bom() on the result.
 * @param bool   $normalize_whitespace          Run normalize_whitespace() on the result.
 * @param bool   $normalize_msword              Run normalize_msword() on the result.
 * @param bool   $keep_non_breaking_space       Forwarded to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Run replace_diamond_question_mark() with '' as replacement.
 * @param bool   $remove_invisible_characters   Run remove_invisible_characters() (enabled by default).
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Extended-mode (/x) pattern: group 1 captures a run of up to 100 well-formed-looking
    // sequences; the other two alternatives match a single stray byte each.
    $regx = '/
(
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
){1,100} # ...one or more times
)
| ( [\x80-\xBF] ) # invalid byte in range 10000000 - 10111111
| ( [\xC0-\xFF] ) # invalid byte in range 11000000 - 11111111
/x';
    // Replacing every match with '$1' keeps the captured runs and drops the stray bytes
    // (for those alternatives group 1 is empty).
    // NOTE(review): preg_replace() returns null on failure, which the cast turns into '' -- confirm that is acceptable.
    $str = (string)preg_replace($regx, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $str = $this->replace_diamond_question_mark($str, '');
    }

    if ($remove_invisible_characters === true) {
        $str = $this->remove_invisible_characters($str);
    }

    if ($normalize_whitespace === true) {
        $str = $this->normalize_whitespace($str, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $str = $this->normalize_msword($str);
    }

    if ($remove_bom === true) {
        $str = $this->remove_bom($str);
    }

    return $str;
}
1080
|
|
|
|
1081
|
6 |
|
/**
 * Replace the Unicode replacement character (U+FFFD) in a string.
 *
 * @param string $str                The input string.
 * @param string $replacementChar    What to put in place of each U+FFFD; '' removes it.
 * @param bool   $processInvalidUtf8 When strictly true, the string is first re-converted
 *                                   UTF-8 -> UTF-8 with $replacementChar installed as the
 *                                   mbstring substitute character.
 *
 * @return string
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        // mb_substitute_character() is given the keyword 'none' instead of an empty string.
        $substitute = $replacementChar === '' ? 'none' : $replacementChar;

        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        // Temporarily swap the global substitute character and restore it afterwards.
        $previousSubstitute = \mb_substitute_character();
        \mb_substitute_character($substitute);
        // the polyfill maybe return false, so cast to string
        $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        \mb_substitute_character($previousSubstitute);
    }

    $needles = [
        "\xEF\xBF\xBD",
        '�',
    ];
    $replacements = [
        $replacementChar,
        $replacementChar,
    ];

    return str_replace($needles, $replacements, $str);
}
1118
|
|
|
|
1119
|
6 |
|
/**
 * Remove control characters from a string, keeping newline (dec 10),
 * carriage return (dec 13) and horizontal tab (dec 09).
 *
 * The patterns are applied repeatedly until a pass changes nothing, so sequences that
 * only become visible after a removal (e.g. "%%0000") are caught as well.
 * A $replacement that itself matches one of the patterns would therefore never terminate.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the url-encoded forms (%00-%08, %0B, %0C, %0E-%1F).
 * @param string $replacement Text inserted in place of each match.
 *
 * @return string
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/', // url encoded 16-31
        ]
        : [];

    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    do {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $replaced);
    } while ($replaced !== 0);

    return $str;
}
1139
|
|
|
|
1140
|
6 |
|
/**
 * Replace the characters listed in WHITESPACE_TABLE with a plain space and,
 * unless told otherwise, strip the bidirectional control characters.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    When strictly true, the 'NO-BREAK SPACE' entry is left untouched.
 * @param bool   $keepBidiUnicodeControls When strictly false, the characters of
 *                                        BIDI_UNI_CODE_CONTROLS_TABLE are removed.
 *
 * @return string
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    if ($str === '') {
        return '';
    }

    // Search lists are built once and reused across calls.
    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    // Derive the cache slot from the same strict test that decides how the table is built.
    // Keying by a loose (int) cast let a truthy non-bool (e.g. 1) fill slot 1 with the full
    // table, after which calls passing true no longer preserved the no-break space.
    $keepNbsp = $keepNonBreakingSpace === true;
    $cacheKey = $keepNbsp ? 1 : 0;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNbsp) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
1171
|
|
|
|
1172
|
|
|
/**
 * Replace typographic quotes, dashes and the ellipsis (as produced by word processors)
 * with their plain ASCII counterparts.
 *
 * @param string $str The input string (UTF-8).
 *
 * @return string
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // [UTF-8 byte sequence, ASCII replacement]
    $pairs = [
        ["\xc2\xab", '"'], // « (U+00AB)
        ["\xc2\xbb", '"'], // » (U+00BB)
        ["\xe2\x80\x98", "'"], // ‘ (U+2018)
        ["\xe2\x80\x99", "'"], // ’ (U+2019)
        ["\xe2\x80\x9a", "'"], // ‚ (U+201A)
        ["\xe2\x80\x9b", "'"], // ‛ (U+201B)
        ["\xe2\x80\x9c", '"'], // “ (U+201C)
        ["\xe2\x80\x9d", '"'], // ” (U+201D)
        ["\xe2\x80\x9e", '"'], // „ (U+201E)
        ["\xe2\x80\x9f", '"'], // ‟ (U+201F)
        ["\xe2\x80\xb9", "'"], // ‹ (U+2039)
        ["\xe2\x80\xba", "'"], // › (U+203A)
        ["\xe2\x80\x93", '-'], // – (U+2013)
        ["\xe2\x80\x94", '-'], // — (U+2014)
        ["\xe2\x80\xa6", '...'], // … (U+2026)
    ];

    $search = [];
    $replace = [];
    foreach ($pairs as $pair) {
        $search[] = $pair[0];
        $replace[] = $pair[1];
    }

    return str_replace($search, $replace, $str);
}
1216
|
|
|
|
1217
|
6 |
|
/**
 * Strip byte-order marks from the beginning of a string.
 *
 * Every entry of the BOM table (marker => number of bytes to drop) is tried in table
 * order; a marker found at the start of the (already shortened) string is removed.
 *
 * @param string $str The input string.
 *
 * @return string The string without leading BOM(s).
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    foreach ($this->BOM as $bomString => $bomByteLength) {
        $bomString = (string)$bomString;

        // An empty marker is a prefix of every string (and strpos() with an empty needle
        // returns 0 on PHP 8), which would strip bytes from input that has no BOM at all.
        if ($bomString === '') {
            continue;
        }

        // Compare only the leading bytes instead of searching the whole string.
        if (\strncmp($str, $bomString, \strlen($bomString)) !== 0) {
            continue;
        }

        $strTmp = \substr($str, (int)$bomByteLength);
        if ($strTmp === false) {
            // PHP < 8 returns false when nothing is left after the marker.
            return '';
        }

        $str = (string)$strTmp;
    }

    return $str;
}
1238
|
|
|
|
1239
|
|
|
// private function str_detect_encoding($str) |
1240
|
|
|
// { |
1241
|
|
|
// // init |
1242
|
|
|
// $str = (string)$str; |
1243
|
|
|
// |
1244
|
|
|
// // |
1245
|
|
|
// // 1.) check binary strings (010001001...) like UTF-16 / UTF-32 / PDF / Images / ... |
1246
|
|
|
// // |
1247
|
|
|
// |
1248
|
|
|
// if ($this->is_binary($str, true) === true) { |
1249
|
|
|
// $isUtf16 = $this->is_utf16($str, false); |
1250
|
|
|
// if ($isUtf16 === 1) { |
1251
|
|
|
// return 'UTF-16LE'; |
1252
|
|
|
// } |
1253
|
|
|
// if ($isUtf16 === 2) { |
1254
|
|
|
// return 'UTF-16BE'; |
1255
|
|
|
// } |
1256
|
|
|
// |
1257
|
|
|
// $isUtf32 = $this->is_utf32($str, false); |
1258
|
|
|
// if ($isUtf32 === 1) { |
1259
|
|
|
// return 'UTF-32LE'; |
1260
|
|
|
// } |
1261
|
|
|
// if ($isUtf32 === 2) { |
1262
|
|
|
// return 'UTF-32BE'; |
1263
|
|
|
// } |
1264
|
|
|
// |
1265
|
|
|
// // is binary but not "UTF-16" or "UTF-32" |
1266
|
|
|
// return false; |
1267
|
|
|
// } |
1268
|
|
|
// |
1269
|
|
|
// // |
1270
|
|
|
// // 2.) simple check for ASCII chars |
1271
|
|
|
// // |
1272
|
|
|
// |
1273
|
|
|
// if ($this->isAscii($str) === true) { |
1274
|
|
|
// return 'ASCII'; |
1275
|
|
|
// } |
1276
|
|
|
// |
1277
|
|
|
// // |
1278
|
|
|
// // 3.) simple check for UTF-8 chars |
1279
|
|
|
// // |
1280
|
|
|
// |
1281
|
|
|
// if ($this->isUtf8($str) === true) { |
1282
|
|
|
// return 'UTF-8'; |
1283
|
|
|
// } |
1284
|
|
|
// |
1285
|
|
|
// // |
1286
|
|
|
// // 4.) check via "mb_detect_encoding()" |
1287
|
|
|
// // |
1288
|
|
|
// // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "mb_detect_encoding()" |
1289
|
|
|
// |
1290
|
|
|
// $detectOrder = [ |
1291
|
|
|
// 'ISO-8859-1', |
1292
|
|
|
// 'ISO-8859-2', |
1293
|
|
|
// 'ISO-8859-3', |
1294
|
|
|
// 'ISO-8859-4', |
1295
|
|
|
// 'ISO-8859-5', |
1296
|
|
|
// 'ISO-8859-6', |
1297
|
|
|
// 'ISO-8859-7', |
1298
|
|
|
// 'ISO-8859-8', |
1299
|
|
|
// 'ISO-8859-9', |
1300
|
|
|
// 'ISO-8859-10', |
1301
|
|
|
// 'ISO-8859-13', |
1302
|
|
|
// 'ISO-8859-14', |
1303
|
|
|
// 'ISO-8859-15', |
1304
|
|
|
// 'ISO-8859-16', |
1305
|
|
|
// 'WINDOWS-1251', |
1306
|
|
|
// 'WINDOWS-1252', |
1307
|
|
|
// 'WINDOWS-1254', |
1308
|
|
|
// 'CP932', |
1309
|
|
|
// 'CP936', |
1310
|
|
|
// 'CP950', |
1311
|
|
|
// 'CP866', |
1312
|
|
|
// 'CP850', |
1313
|
|
|
// 'CP51932', |
1314
|
|
|
// 'CP50220', |
1315
|
|
|
// 'CP50221', |
1316
|
|
|
// 'CP50222', |
1317
|
|
|
// 'ISO-2022-JP', |
1318
|
|
|
// 'ISO-2022-KR', |
1319
|
|
|
// 'JIS', |
1320
|
|
|
// 'JIS-ms', |
1321
|
|
|
// 'EUC-CN', |
1322
|
|
|
// 'EUC-JP', |
1323
|
|
|
// ]; |
1324
|
|
|
// |
1325
|
|
|
// if ($this->SUPPORT['mbstring'] === true) { |
1326
|
|
|
// // info: do not use the symfony polyfill here |
1327
|
|
|
// $encoding = \mb_detect_encoding($str, $detectOrder, true); |
1328
|
|
|
// if ($encoding) { |
1329
|
|
|
// return $encoding; |
1330
|
|
|
// } |
1331
|
|
|
// } |
1332
|
|
|
// |
1333
|
|
|
// // |
1334
|
|
|
// // 5.) check via "iconv()" |
1335
|
|
|
// // |
1336
|
|
|
// |
1337
|
|
|
// if ($this->ENCODINGS === null) { |
1338
|
|
|
// $this->ENCODINGS = $this->getData('encodings'); |
1339
|
|
|
// } |
1340
|
|
|
// |
1341
|
|
|
// foreach ($this->ENCODINGS as $encodingTmp) { |
1342
|
|
|
// // INFO: //IGNORE but still throw notice |
1343
|
|
|
// /** @noinspection PhpUsageOfSilenceOperatorInspection */ |
1344
|
|
|
// if ((string)@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $str) === $str) { |
1345
|
|
|
// return $encodingTmp; |
1346
|
|
|
// } |
1347
|
|
|
// } |
1348
|
|
|
// |
1349
|
|
|
// return false; |
1350
|
|
|
// } |
1351
|
|
|
|
1352
|
|
|
/**
 * Build the numeric HTML entity for a decimal code and decode it via htmlEntityDecode().
 *
 * @param int $int The decimal code.
 *
 * @return string Whatever htmlEntityDecode() returns for "&#<int>;".
 */
private function decimalToChr($int)
{
    $entity = '&#' . $int . ';';
    $flags = \ENT_QUOTES | \ENT_HTML5;

    return $this->htmlEntityDecode($entity, $flags);
}
1356
|
|
|
// |
1357
|
|
|
// private function is_utf16($str, $checkIfStringIsBinary = true) |
1358
|
|
|
// { |
1359
|
|
|
// |
1360
|
|
|
// // init |
1361
|
|
|
// $str = (string)$str; |
1362
|
|
|
// $strChars = []; |
1363
|
|
|
// |
1364
|
|
|
// if ( |
1365
|
|
|
// $checkIfStringIsBinary === true |
1366
|
|
|
// && |
1367
|
|
|
// $this->is_binary($str, true) === false |
1368
|
|
|
// ) { |
1369
|
|
|
// return false; |
1370
|
|
|
// } |
1371
|
|
|
// |
1372
|
|
|
// if ($this->SUPPORT['mbstring'] === false) { |
1373
|
|
|
// \trigger_error('UTF8::is_utf16() without mbstring may did not work correctly', \E_USER_WARNING); |
1374
|
|
|
// } |
1375
|
|
|
// |
1376
|
|
|
// $str = $this->remove_bom($str); |
1377
|
|
|
// |
1378
|
|
|
// |
1379
|
|
|
// $maybeUTF16LE = 0; |
1380
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE'); |
1381
|
|
|
// if ($test) { |
1382
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8'); |
1383
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE'); |
1384
|
|
|
// if ($test3 === $test) { |
1385
|
|
|
// if (\count($strChars) === 0) { |
1386
|
|
|
// $strChars = $this->count_chars($str, true, false); |
1387
|
|
|
// } |
1388
|
|
|
// $countChars = $this->count_chars($test3); |
1389
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
1390
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
1391
|
|
|
// ++$maybeUTF16LE; |
1392
|
|
|
// } |
1393
|
|
|
// unset($countChars[$test3char]); |
1394
|
|
|
// } |
1395
|
|
|
// } |
1396
|
|
|
// } |
1397
|
|
|
// |
1398
|
|
|
// $maybeUTF16BE = 0; |
1399
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE'); |
1400
|
|
|
// if ($test) { |
1401
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8'); |
1402
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE'); |
1403
|
|
|
// if ($test3 === $test) { |
1404
|
|
|
// if (\count($strChars) === 0) { |
1405
|
|
|
// $strChars = $this->count_chars($str, true, false); |
1406
|
|
|
// } |
1407
|
|
|
// $countChars = $this->count_chars($test3); |
1408
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
1409
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
1410
|
|
|
// ++$maybeUTF16BE; |
1411
|
|
|
// } |
1412
|
|
|
// unset($countChars[$test3char]); |
1413
|
|
|
// } |
1414
|
|
|
// |
1415
|
|
|
// } |
1416
|
|
|
// } |
1417
|
|
|
// |
1418
|
|
|
// if ($maybeUTF16BE !== $maybeUTF16LE) { |
1419
|
|
|
// if ($maybeUTF16LE > $maybeUTF16BE) { |
1420
|
|
|
// return 1; |
1421
|
|
|
// } |
1422
|
|
|
// |
1423
|
|
|
// return 2; |
1424
|
|
|
// } |
1425
|
|
|
// |
1426
|
|
|
// return false; |
1427
|
|
|
// } |
1428
|
|
|
|
1429
|
|
|
/**
 * Check if the string is UTF-32.
 *
 * For each byte order the input is decoded to UTF-8, re-encoded and decoded again;
 * when that round trip is stable the decoded characters are scored against the
 * characters of the raw input. The byte order with the higher score wins.
 *
 * @param mixed $str <p>The input string.</p>
 * @param bool  $checkIfStringIsBinary <p>When strictly true, non-binary input (per is_binary()) is rejected up front.</p>
 *
 * @return false|int
 *                   <strong>false</strong> if it's not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE
 */
private function is_utf32($str, $checkIfStringIsBinary = true)
{
    // init
    $str = (string)$str;

    if ($checkIfStringIsBinary === true && $this->is_binary($str, true) === false) {
        return false;
    }

    if ($this->SUPPORT['mbstring'] === false) {
        \trigger_error('UTF8::is_utf32() without mbstring may did not work correctly', \E_USER_WARNING);
    }

    $str = $this->remove_bom($str);

    // Character statistics of the raw input; computed lazily, at most once.
    $rawChars = [];
    $scores = [];

    foreach (['UTF-32LE', 'UTF-32BE'] as $byteOrder) {
        $scores[$byteOrder] = 0;

        $decoded = \mb_convert_encoding($str, 'UTF-8', $byteOrder);
        if (!$decoded) {
            continue;
        }

        $reEncoded = \mb_convert_encoding($decoded, $byteOrder, 'UTF-8');
        $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $byteOrder);
        if ($roundTrip !== $decoded) {
            continue;
        }

        if (\count($rawChars) === 0) {
            $rawChars = $this->count_chars($str, true, false);
        }

        // NOTE(review): count_chars() is built on array_count_values(), so $rawChars maps
        // character => occurrences and in_array() searches the occurrence counts rather
        // than the characters -- confirm this is the intended comparison.
        foreach ($this->count_chars($roundTrip) as $decodedChar => $ignoredCount) {
            if (\in_array($decodedChar, $rawChars, true) === true) {
                ++$scores[$byteOrder];
            }
        }
    }

    $littleEndian = $scores['UTF-32LE'];
    $bigEndian = $scores['UTF-32BE'];

    if ($bigEndian === $littleEndian) {
        return false;
    }

    return $littleEndian > $bigEndian ? 1 : 2;
}
1504
|
|
|
|
1505
|
|
|
/**
 * Heuristically decide whether the input looks like binary data.
 *
 * The input counts as binary when it consists only of the characters "0" and "1",
 * when get_file_type() recognises a binary signature, when more than a quarter of
 * its bytes are NUL bytes, or -- in strict mode -- when fileinfo reports the
 * encoding "binary".
 *
 * @param mixed $input  The value to inspect; cast to string.
 * @param bool  $strict When strictly true, fileinfo is consulted as a last step.
 *
 * @return bool
 *
 * @throws \RuntimeException In strict mode when the fileinfo extension is not available.
 */
private function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if ($input === '') {
        return false;
    }

    if (preg_match('~^[01]+$~', $input)) {
        return true;
    }

    $fileType = $this->get_file_type($input);
    if ($fileType['type'] === 'binary') {
        return true;
    }

    $byteCount = \strlen($input);
    $nullByteCount = \substr_count($input, "\x0", 0, $byteCount);
    if (($nullByteCount / $byteCount) > 0.25) {
        return true;
    }

    if ($strict !== true) {
        return false;
    }

    if ($this->SUPPORT['finfo'] === false) {
        throw new \RuntimeException('ext-fileinfo: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    $detectedEncoding = (new \finfo(\FILEINFO_MIME_ENCODING))->buffer($input);

    return $detectedEncoding === 'binary';
}
1541
|
|
|
|
1542
|
|
|
/**
 * Guess a file type from the first two bytes of a string.
 *
 * The prefix is matched byte-for-byte. Building a number from the concatenated decimal
 * byte values is ambiguous: the bytes 0x07 0xAD ("7" . "173") produce the same code as
 * "GI" ("71" . "73") and were therefore reported as a GIF.
 *
 * @param string $str      The data to inspect.
 * @param array  $fallback Result returned when the input is shorter than two bytes or the
 *                         signature is unknown.
 *
 * @return array Array with the keys 'ext', 'mime' and 'type'; 'type' is 'binary' for
 *               every recognised signature.
 */
private function get_file_type(
    $str,
    $fallback = [
        'ext' => null,
        'mime' => 'application/octet-stream',
        'type' => null,
    ]
) {
    if ($str === '') {
        return $fallback;
    }

    $magic = \substr($str, 0, 2);
    if ($magic === false || \strlen($magic) !== 2) {
        return $fallback;
    }

    // Two-byte signature => [extension, mime type].
    $signatures = [
        '%P' => ['pdf', 'application/pdf'], // 0x25 0x50
        'MZ' => ['exe', 'application/octet-stream'], // 0x4D 0x5A
        'MT' => ['midi', 'audio/x-midi'], // 0x4D 0x54
        'PK' => ['zip', 'application/zip'], // 0x50 0x4B
        'Ra' => ['rar', 'application/rar'], // 0x52 0x61
        "\xFF\xD8" => ['jpg', 'image/jpeg'],
        'GI' => ['gif', 'image/gif'], // 0x47 0x49
        'BM' => ['bmp', 'image/bmp'], // 0x42 0x4D
        "\x89P" => ['png', 'image/png'], // 0x89 0x50
    ];

    if (!isset($signatures[$magic])) {
        return $fallback;
    }

    return [
        'ext' => $signatures[$magic][0],
        'mime' => $signatures[$magic][1],
        'type' => 'binary',
    ];
}
1630
|
|
|
|
1631
|
|
|
/**
 * Count how often each character occurs in a string.
 *
 * @param string $str                The input string.
 * @param bool   $cleanUtf8          Forwarded to strSplit().
 * @param bool   $tryToUseMbFunction Forwarded to strSplit().
 *
 * @return array Map of character => number of occurrences.
 */
private function count_chars($str, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    $characters = $this->strSplit($str, 1, $cleanUtf8, $tryToUseMbFunction);

    return array_count_values($characters);
}
1635
|
|
|
|
1636
|
|
|
} |
1637
|
|
|
|
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, with the start of each new word marked by a capital letter. Thus the name "database connection string" becomes databaseConnectionString.