1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace devtoolboxuk\soteria\voku\Resources; |
4
|
|
|
|
5
|
|
|
class Utf8 extends Resources |
|
|
|
|
6
|
|
|
{ |
7
|
|
|
|
8
|
|
|
private $system; |
9
|
|
|
private $ENCODINGS; |
10
|
|
|
private $SUPPORT = []; |
11
|
|
|
private $BROKEN_UTF8_FIX; |
12
|
|
|
private $ORD; |
13
|
|
|
private $CHR; |
14
|
|
|
private $WIN1252_TO_UTF8; |
15
|
|
|
/**
 * Known byte-order marks mapped to their length in bytes.
 *
 * Each real BOM is followed by the byte sequence it turns into when the BOM
 * bytes are misread as WINDOWS-1252 and then re-encoded as UTF-8. Every key
 * must be exactly as long as the length stored next to it, so the
 * mis-decoded variants whose literal form was damaged are spelled out with
 * byte escapes (0xEF 0xBB 0xBF -> "ï»¿", 0x00 stays 0x00, 0xFE -> "þ",
 * 0xFF -> "ÿ").
 *
 * @var array<string, int>
 */
private $BOM = [
    "\xef\xbb\xbf"             => 3, // UTF-8 BOM
    "\xc3\xaf\xc2\xbb\xc2\xbf" => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more than one byte ...)
    "\x00\x00\xfe\xff"         => 4, // UTF-32 (BE) BOM
    "\x00\x00\xc3\xbe\xc3\xbf" => 6, // UTF-32 (BE) BOM as "WINDOWS-1252"
    "\xff\xfe\x00\x00"         => 4, // UTF-32 (LE) BOM
    "\xc3\xbf\xc3\xbe\x00\x00" => 6, // UTF-32 (LE) BOM as "WINDOWS-1252"
    "\xfe\xff"                 => 2, // UTF-16 (BE) BOM
    'þÿ'                       => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
    "\xff\xfe"                 => 2, // UTF-16 (LE) BOM
    'ÿþ'                       => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
];
27
|
|
|
|
28
|
|
|
private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
29
|
|
|
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
30
|
|
|
8234 => "\xE2\x80\xAA", |
31
|
|
|
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
32
|
|
|
8235 => "\xE2\x80\xAB", |
33
|
|
|
// POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
34
|
|
|
8236 => "\xE2\x80\xAC", |
35
|
|
|
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
36
|
|
|
8237 => "\xE2\x80\xAD", |
37
|
|
|
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
38
|
|
|
8238 => "\xE2\x80\xAE", |
39
|
|
|
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
40
|
|
|
8294 => "\xE2\x81\xA6", |
41
|
|
|
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
42
|
|
|
8295 => "\xE2\x81\xA7", |
43
|
|
|
// FIRST STRONG ISOLATE // (use -> dir = "auto") |
44
|
|
|
8296 => "\xE2\x81\xA8", |
45
|
|
|
// POP DIRECTIONAL ISOLATE |
46
|
|
|
8297 => "\xE2\x81\xA9", |
47
|
|
|
]; |
48
|
|
|
// private $WHITESPACE = [ |
49
|
|
|
// // NUL Byte |
50
|
|
|
// 0 => "\x0", |
51
|
|
|
// // Tab |
52
|
|
|
// 9 => "\x9", |
53
|
|
|
// // New Line |
54
|
|
|
// 10 => "\xa", |
55
|
|
|
// // Vertical Tab |
56
|
|
|
// 11 => "\xb", |
57
|
|
|
// // Carriage Return |
58
|
|
|
// 13 => "\xd", |
59
|
|
|
// // Ordinary Space |
60
|
|
|
// 32 => "\x20", |
61
|
|
|
// // NO-BREAK SPACE |
62
|
|
|
// 160 => "\xc2\xa0", |
63
|
|
|
// // OGHAM SPACE MARK |
64
|
|
|
// 5760 => "\xe1\x9a\x80", |
65
|
|
|
// // MONGOLIAN VOWEL SEPARATOR |
66
|
|
|
// 6158 => "\xe1\xa0\x8e", |
67
|
|
|
// // EN QUAD |
68
|
|
|
// 8192 => "\xe2\x80\x80", |
69
|
|
|
// // EM QUAD |
70
|
|
|
// 8193 => "\xe2\x80\x81", |
71
|
|
|
// // EN SPACE |
72
|
|
|
// 8194 => "\xe2\x80\x82", |
73
|
|
|
// // EM SPACE |
74
|
|
|
// 8195 => "\xe2\x80\x83", |
75
|
|
|
// // THREE-PER-EM SPACE |
76
|
|
|
// 8196 => "\xe2\x80\x84", |
77
|
|
|
// // FOUR-PER-EM SPACE |
78
|
|
|
// 8197 => "\xe2\x80\x85", |
79
|
|
|
// // SIX-PER-EM SPACE |
80
|
|
|
// 8198 => "\xe2\x80\x86", |
81
|
|
|
// // FIGURE SPACE |
82
|
|
|
// 8199 => "\xe2\x80\x87", |
83
|
|
|
// // PUNCTUATION SPACE |
84
|
|
|
// 8200 => "\xe2\x80\x88", |
85
|
|
|
// // THIN SPACE |
86
|
|
|
// 8201 => "\xe2\x80\x89", |
87
|
|
|
// //HAIR SPACE |
88
|
|
|
// 8202 => "\xe2\x80\x8a", |
89
|
|
|
// // LINE SEPARATOR |
90
|
|
|
// 8232 => "\xe2\x80\xa8", |
91
|
|
|
// // PARAGRAPH SEPARATOR |
92
|
|
|
// 8233 => "\xe2\x80\xa9", |
93
|
|
|
// // NARROW NO-BREAK SPACE |
94
|
|
|
// 8239 => "\xe2\x80\xaf", |
95
|
|
|
// // MEDIUM MATHEMATICAL SPACE |
96
|
|
|
// 8287 => "\xe2\x81\x9f", |
97
|
|
|
// // IDEOGRAPHIC SPACE |
98
|
|
|
// 12288 => "\xe3\x80\x80", |
99
|
|
|
// ]; |
100
|
|
|
/** |
101
|
|
|
* @var array |
102
|
|
|
*/ |
103
|
|
|
private $WHITESPACE_TABLE = [ |
104
|
|
|
'SPACE' => "\x20", |
105
|
|
|
'NO-BREAK SPACE' => "\xc2\xa0", |
106
|
|
|
'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
107
|
|
|
'EN QUAD' => "\xe2\x80\x80", |
108
|
|
|
'EM QUAD' => "\xe2\x80\x81", |
109
|
|
|
'EN SPACE' => "\xe2\x80\x82", |
110
|
|
|
'EM SPACE' => "\xe2\x80\x83", |
111
|
|
|
'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
112
|
|
|
'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
113
|
|
|
'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
114
|
|
|
'FIGURE SPACE' => "\xe2\x80\x87", |
115
|
|
|
'PUNCTUATION SPACE' => "\xe2\x80\x88", |
116
|
|
|
'THIN SPACE' => "\xe2\x80\x89", |
117
|
|
|
'HAIR SPACE' => "\xe2\x80\x8a", |
118
|
|
|
'LINE SEPARATOR' => "\xe2\x80\xa8", |
119
|
|
|
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
120
|
|
|
'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
121
|
|
|
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
122
|
|
|
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
123
|
|
|
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
124
|
|
|
]; |
125
|
|
|
|
126
|
6 |
|
/**
 * Creates the System probe and records which optional PHP extensions are
 * available by running checkForSupport().
 */
public function __construct()
{
    $this->system = new System();
    $this->checkForSupport();
}
131
|
|
|
|
132
|
6 |
|
/**
 * Fills $this->SUPPORT with the capabilities reported by $this->system.
 *
 * The probe runs at most once per instance: the
 * 'already_checked_via_portable_utf8' entry marks a completed run. When
 * mbstring (or the symfony polyfill) is in use, the mbstring internal
 * encoding is switched to UTF-8 as a side effect.
 *
 * @return void
 */
private function checkForSupport()
{
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }
    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    $system = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $this->SUPPORT['mbstring'] = $system->mbstring_loaded();
    $this->SUPPORT['mbstring_func_overload'] = $system->mbstring_overloaded();
    if ($this->SUPPORT['mbstring'] === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $system->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $this->SUPPORT['intl'] = $system->intl_loaded();

    // the id list stays empty unless intl is loaded and exposes the function
    $canListTransliterators = $this->SUPPORT['intl'] === true
        && \function_exists('transliterator_list_ids') === true;
    /** @noinspection PhpComposerExtensionStubsInspection */
    $this->SUPPORT['intl__transliterator_list_ids'] = $canListTransliterators
        ? \transliterator_list_ids()
        : [];

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $system->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $system->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $system->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $system->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $system->pcre_utf8_support();

    $this->SUPPORT['symfony_polyfill_used'] = $system->symfony_polyfill_used();
    if ($this->SUPPORT['symfony_polyfill_used'] === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
186
|
|
|
|
187
|
6 |
|
/**
 * Decodes URL-encoded input, HTML entities and "\uXXXX" / "%uXXXX" escapes
 * and repairs simple broken UTF-8 sequences.
 *
 * @param string $str          the raw input
 * @param bool   $multi_decode true: repeat the decoding until the string no
 *                             longer changes (unwraps nested encodings);
 *                             anything else: decode exactly once
 *
 * @return string
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // nothing that could be an entity, a percent-escape, a "+" or a "\u" escape
    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // non-standard "%uXXXX" escapes are rewritten as hex entities so the
    // entity decoder below can resolve them
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        // \rawurldecode() is PHP's built-in function, not this method
        $str = (string)preg_replace($pattern, '&#x\\1;', \rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    if ($multi_decode !== true) {
        // a single decoding pass; previously this path returned the input
        // without decoding anything
        return $this->fixSimpleUtf8(
            \rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags))
        );
    }

    do {
        $str_compare = $str;

        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        $str = $this->fixSimpleUtf8(
            \rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags))
        );
    } while ($str_compare !== $str);

    return $str;
}
217
|
|
|
|
218
|
6 |
|
/**
 * Replaces every key of the 'utf8_fix' data table found in $str with the
 * value stored for it.
 *
 * The table is loaded on first use and its keys/values are kept in static
 * locals, so later calls go straight to str_replace().
 *
 * @param string $str
 *
 * @return string
 */
private function fixSimpleUtf8($str)
{
    static $search = null;
    static $replace = null;

    if ($str === '') {
        return '';
    }

    if ($search === null) {
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $search = array_keys($this->BROKEN_UTF8_FIX);
        $replace = array_values($this->BROKEN_UTF8_FIX);
    }

    return str_replace($search, $replace, $str);
}
238
|
|
|
|
239
|
2 |
|
/**
 * Includes "<this directory>/../Data/<$file>.php" and hands back whatever
 * that file returns.
 *
 * @param string $file file name without directory and without ".php"
 *
 * @return mixed
 */
private function getData($file)
{
    $path = __DIR__ . '/../Data/' . $file . '.php';

    return include $path;
}
244
|
|
|
|
245
|
6 |
|
/**
 * Decodes numeric and named HTML entities, repeating until the string is
 * stable so nested encodings are unwrapped as well.
 *
 * @param string   $str
 * @param int|null $flags    html_entity_decode() flags; null means
 *                           ENT_QUOTES | ENT_HTML5
 * @param string   $encoding target encoding, normalized via
 *                           normalize_encoding() unless it is already
 *                           'UTF-8' or 'CP850'
 *
 * @return string
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    if (
        !isset($str[3]) // shorter than four bytes; examples: &; || &x;
        ||
        strpos($str, '&') === false // no "&"
    ) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    // A convmap entry is exactly four integers (start, end, offset, mask).
    // A trailing fifth element makes PHP 8 throw a ValueError.
    $convmap = [0x80, 0xfffff, 0, 0xfffff];

    do {
        $str_compare = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            if ($encoding === 'UTF-8') {
                $str = mb_decode_numericentity($str, $convmap);
            } else {
                $str = mb_decode_numericentity($str, $convmap, $encoding);
            }
        } else {
            // NOTE(review): this branch still calls mb_convert_encoding(),
            // presumably supplied by a polyfill when ext-mbstring is
            // missing - confirm.
            $str = (string)preg_replace_callback(
                "/&#\d{2,6};/",
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $returnTmp = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');
                    // quotes stay encoded here; html_entity_decode() below
                    // handles them according to $flags
                    if ($returnTmp !== '"' && $returnTmp !== "'") {
                        return $returnTmp;
                    }

                    return $matches[0];
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // decode also numeric & UTF16 two byte entities:
                // append the missing ";" to unterminated numeric entities
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($str_compare !== $str);

    return $str;
}
309
|
|
|
|
310
|
|
|
/**
 * Maps an encoding name in any common spelling to its canonical form.
 *
 * Resolution order: empty/"0"/"1" input -> $fallback; a few fixed aliases;
 * the per-request cache; an exact match in the 'encodings' data table;
 * finally the upper-cased name with everything but letters, digits and
 * whitespace stripped, looked up in the alias table below. Names that match
 * nothing are returned upper-cased.
 *
 * @param mixed  $encoding
 * @param string $fallback returned when $encoding carries no usable name
 *
 * @return string
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $resolved = [];

    $encoding = (string)$encoding;

    // '' and '0' are the only strings PHP treats as false
    if ($encoding === '' || $encoding === '0') {
        return $fallback;
    }

    if (in_array($encoding, ['UTF-8', 'UTF8'], true)) {
        return 'UTF-8';
    }

    if (in_array($encoding, ['8BIT', 'BINARY'], true)) {
        return 'CP850';
    }

    if (in_array($encoding, ['HTML', 'HTML-ENTITIES'], true)) {
        return 'HTML-ENTITIES';
    }

    // only a fallback, for non "strict_types" usage ...
    if (in_array($encoding, ['1', '0'], true)) {
        return $fallback;
    }

    if (isset($resolved[$encoding])) {
        return $resolved[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    if (in_array($encoding, $this->ENCODINGS, true)) {
        $resolved[$encoding] = $encoding;

        return $encoding;
    }

    $requested = $encoding;
    $encoding = strtoupper($encoding);
    $lookupKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $encoding);

    $aliases = [
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5', // Cyrillic
        'ISO88596'    => 'ISO-8859-6', // Arabic
        'ISO88597'    => 'ISO-8859-7', // Greek
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    ];

    // every alias target is a non-empty string, so isset() is sufficient
    if (isset($aliases[$lookupKey])) {
        $encoding = $aliases[$lookupKey];
    }

    // cached under the spelling the caller used
    $resolved[$requested] = $encoding;

    return $encoding;
}
433
|
|
|
|
434
|
6 |
|
/**
 * Converts a string (or every element of an array, recursively) to UTF-8.
 *
 * Byte sequences that already have the shape of a 2-, 3- or 4-byte UTF-8
 * character are copied unchanged; every other byte >= 0x80 is converted on
 * its own through toUtf8ConvertHelper(). Afterwards "\uXXXX" escapes,
 * including surrogate pairs, are replaced by the characters they denote.
 *
 * @param mixed $str string, array of strings, or anything castable to string
 *
 * @return string|array '' when the escape replacement fails
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->toUtf8($value);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $length = \strlen($str);
    $out = '';

    for ($pos = 0; $pos < $length; ++$pos) {
        $lead = $str[$pos];

        if ($lead < "\xC0") {
            // 0x80-0xBF outside a sequence needs conversion, ASCII is copied
            $out .= ($lead & "\xC0") === "\x80"
                ? $this->toUtf8ConvertHelper($lead)
                : $lead;

            continue;
        }

        // how many continuation bytes the lead byte announces
        if ($lead <= "\xDF") {
            $tail = 1; // looks like 2 bytes UTF8
        } elseif ($lead <= "\xEF") {
            $tail = 2; // looks like 3 bytes UTF8
        } elseif ($lead <= "\xF7") {
            $tail = 3; // looks like 4 bytes UTF8
        } else {
            $tail = 0; // 0xF8-0xFF: doesn't look like UTF8, but should be converted
        }

        // all announced bytes must exist and lie in 0x80-0xBF;
        // a byte past the end of the string counts as "\x00"
        $sequence = $lead;
        $wellFormed = $tail > 0;
        for ($offset = 1; $wellFormed && $offset <= $tail; ++$offset) {
            $next = $pos + $offset >= $length ? "\x00" : $str[$pos + $offset];
            $wellFormed = $next >= "\x80" && $next <= "\xBF";
            $sequence .= $next;
        }

        if ($wellFormed) {
            // yeah, almost sure it's UTF8 already
            $out .= $sequence;
            $pos += $tail;
        } else {
            // not valid UTF8 - convert only the lead byte, re-scan the rest
            $out .= $this->toUtf8ConvertHelper($lead);
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $decoded = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                // single "\uXXXX" escape
                $cp = (int)hexdec($matches[3]);
            } else {
                // surrogate pair, http://unicode.org/faq/utf_bom.html#utf16-4
                $high = (int)hexdec($matches[1]) - 0xD800;
                $low = (int)hexdec($matches[2]) - 0xDC00;
                $cp = 0x10000 + ($high << 10) + $low;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($cp < 0x80) {
                return (string)$this->chr($cp);
            }

            if ($cp < 0xA0) {
                // NOTE(review): chr() treats its argument as a code point, so
                // this concatenates two encoded characters rather than two
                // raw bytes - confirm that is intended.
                /** @noinspection UnnecessaryCastingInspection */
                return (string)$this->chr(0xC0 | ($cp >> 6)) . (string)$this->chr(0x80 | ($cp & 0x3F));
            }

            return $this->decimalToChr($cp);
        },
        $out
    );

    // preg_replace_callback() yields null on failure
    return $decoded === null ? '' : $decoded;
}
549
|
|
|
|
550
|
|
|
/**
 * Converts one single byte into a UTF-8 sequence.
 *
 * Bytes listed in the 'win1252_to_utf8' data table get the sequence stored
 * there; every other byte is expanded into the generic two-byte form
 * (110xxxxx 10xxxxxx).
 *
 * @param string $input a single byte - assumed to be a key of the 'ord'
 *                      data table; TODO confirm against Data/ord.php
 *
 * @return string
 */
private function toUtf8ConvertHelper($input)
{
    // the lookup tables are loaded on first use
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    if (isset($this->WIN1252_TO_UTF8[$ordC1])) { // found in Windows-1252 special cases
        return $this->WIN1252_TO_UTF8[$ordC1];
    }

    // Explicit truncation: a float array key is an implicit float-to-int
    // conversion, which is deprecated since PHP 8.1.
    $cc1 = $this->CHR[(int)($ordC1 / 64)] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
578
|
|
|
|
579
|
1 |
|
/**
 * Returns the character for a Unicode code point.
 *
 * Code points up to 127 come from the 'chr' data table; higher ones are
 * produced by IntlChar when available, otherwise by assembling the UTF-8
 * bytes manually. Results are cached per code point and encoding.
 *
 * @param int    $code_point
 * @param string $encoding   target encoding; anything but 'UTF-8' is passed
 *                           through encode()
 *
 * @return string|null whatever \IntlChar::chr() returns on that path
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    // init
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        // The leading backslash is required: an unqualified class name in
        // this file resolves inside the file's namespace, where no IntlChar
        // class exists.
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        // reachable for non-integer input such as 127.5, which passes the
        // "<= 127" test above only after the integer cast
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // two bytes: 110xxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
            $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
673
|
|
|
|
674
|
|
|
/**
 * Encodes a string into one of the pseudo-encodings this class supports.
 *
 * Only 'JSON', 'BASE64' and 'HTML-ENTITIES' actually transform the input.
 * For every other target the string is returned unchanged: no charset
 * conversion is performed here. A warning is raised when the target is not
 * UTF-8 / ISO-8859-1 / WINDOWS-1252 and mbstring is unavailable.
 *
 * @param string $toEncoding target encoding, in any spelling
 *                           normalize_encoding() understands
 * @param string $str
 *
 * @throws \InvalidArgumentException when JSON encoding of $str fails
 *
 * @return string
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            // fully qualified: an unqualified name would be looked up in
            // this file's namespace and end in a class-not-found error
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', E_USER_WARNING);
    }

    return $str;
}
772
|
|
|
|
773
|
|
|
/**
 * Runs the value through filter() and JSON-encodes the result.
 *
 * @param mixed $value
 *
 * @throws \RuntimeException when the json extension is not available
 *
 * @return string|false json_encode() result
 */
private function jsonEncode($value)
{
    // the value is filtered before the extension check
    $filtered = $this->filter($value);

    if ($this->SUPPORT['json'] === false) {
        throw new \RuntimeException('ext-json: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    return json_encode($filtered, 0, 512);
}
784
|
|
|
|
785
|
|
|
/**
 * Normalizes strings - and, recursively, the elements of arrays and the
 * iterable properties of objects - to the given Unicode normalization form.
 *
 * Strings also get their line endings unified to "\n". A string that starts
 * with a combining mark is prefixed with $leading_combining so that
 * concatenation stays NFC-safe. Values of any other type are returned as is.
 *
 * Objects are modified in place: each property reached by foreach is
 * overwritten with its filtered value.
 *
 * @param mixed  $var
 * @param int    $normalization_form one of the \Normalizer form constants
 * @param string $leading_combining  prefix for strings that start with a
 *                                   combining mark
 *
 * @return mixed
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            foreach ($var as $key => $value) {
                // write back onto the object itself; the result used to be
                // stored in an unrelated local array and was lost
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // any non-empty marker: keeps the isset($n[0]) test
                    // below true for input that needed no normalization
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // normalization failed or produced nothing
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
840
|
|
|
|
841
|
|
|
/**
 * Turns "\r\n" and lone "\r" line endings into "\n".
 *
 * @param string $str
 *
 * @return string
 */
private function normalizeLineEnding($str)
{
    // "\r\n" comes first so that a pair becomes one "\n", not two
    $foreignLineEndings = ["\r\n", "\r"];

    return str_replace($foreignLineEndings, "\n", $str);
}
845
|
|
|
|
846
|
|
|
/**
 * Tells whether the string consists only of the bytes 0x20-0x7E plus the
 * control bytes 0x09, 0x0A, 0x0D, 0x10 and 0x13.
 *
 * NOTE(review): 0x10 and 0x13 look like decimal 10/13 written as hex by
 * mistake (0x0A and 0x0D are already listed) - confirm before changing.
 *
 * @param string $str
 *
 * @return bool true for the empty string
 */
private function isAscii($str)
{
    if ($str === '') {
        return true;
    }

    $foundOtherByte = preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);

    return !$foundOtherByte;
}
854
|
|
|
|
855
|
|
|
/**
 * Converts the characters of a string into numeric HTML entities.
 *
 * @param string $str
 * @param bool   $keepAsciiChars true: only code points from 0x80 upwards
 *                               are converted; otherwise every character is
 * @param string $encoding       encoding of $str, normalized via
 *                               normalize_encoding() unless it is already
 *                               'UTF-8' or 'CP850'
 *
 * @return string
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if ($this->SUPPORT['mbstring'] === true) {
        $startCode = $keepAsciiChars === true ? 0x80 : 0x00;

        // A convmap entry is exactly four integers (start, end, offset,
        // mask). A trailing fifth element makes PHP 8 throw a ValueError.
        $convmap = [$startCode, 0xfffff, 0, 0xfffff];

        if ($encoding === 'UTF-8') {
            return mb_encode_numericentity($str, $convmap);
        }

        return mb_encode_numericentity($str, $convmap, $encoding);
    }

    // without mbstring: encode character by character
    return implode(
        '',
        \array_map(
            function (string $chr) use ($keepAsciiChars, $encoding) {
                return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
            },
            $this->strSplit($str)
        )
    );
}
896
|
|
|
|
897
|
|
|
/**
 * Encode a single character as a decimal numeric HTML entity ("&#NNN;").
 *
 * @param string $char           The character to encode.
 * @param bool   $keepAsciiChars When true, a character accepted by isAscii()
 *                               is returned unchanged.
 * @param string $encoding       Encoding handed on to ord().
 *
 * @return string The entity, the unchanged character, or '' for empty input.
 */
private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($char === '') {
        return '';
    }

    // isAscii() is only consulted when ASCII pass-through was requested.
    $passThrough = $keepAsciiChars === true && $this->isAscii($char) === true;

    return $passThrough
        ? $char
        : '&#' . $this->ord($char, $encoding) . ';';
}
909
|
|
|
|
910
|
|
|
/**
 * Return the integer code point of a character.
 *
 * Lookup order: per-request static cache, the "ord" data table, IntlChar
 * (when available), and finally a manual decode of the leading UTF-8 bytes.
 *
 * @param string $chr      The character (cast to string).
 * @param string $encoding Encoding name; anything other than 'UTF-8' /
 *                         'CP850' is passed through normalize_encoding().
 *
 * @return int The code point, or 0 when no byte could be read.
 */
private function ord($chr, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    $chr = (string)$chr;

    $needsNormalizing = $encoding !== 'UTF-8' && $encoding !== 'CP850';
    if ($needsNormalizing) {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // The cache key is built from the character as passed in, before any
    // re-encoding below.
    $memoKey = $chr . $encoding;
    if (isset($CHAR_CACHE[$memoKey]) === true) {
        return $CHAR_CACHE[$memoKey];
    }

    // Still not UTF-8 after normalizing: let encode() convert the character.
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    // Lazily load the lookup table.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        return $CHAR_CACHE[$memoKey] = $this->ORD[$chr];
    }

    // Fallback via "IntlChar"; a falsy result (null / 0) falls through.
    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $viaIntl = \IntlChar::ord($chr);
        if ($viaIntl) {
            return $CHAR_CACHE[$memoKey] = $viaIntl;
        }
    }

    // Fallback via vanilla PHP: decode up to four leading bytes by hand.
    // unpack('C*') yields a 1-based list of byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte (or nothing readable, in which case $lead is 0)
        $codePoint = $lead;
    }

    return $CHAR_CACHE[$memoKey] = $codePoint;
}
976
|
|
|
|
977
|
|
|
/**
 * Split a string into an array of characters, or of chunks of $length characters.
 *
 * @param string|array $str                The input; arrays are processed
 *                                         element by element (recursively).
 * @param int          $length             Characters per chunk; values <= 0
 *                                         yield an empty array.
 * @param bool         $cleanUtf8          Run clean() on the string first.
 * @param bool         $tryToUseMbFunction Prefer mbstring when it is available.
 *
 * @return array List of characters / chunks ([] for empty input).
 */
private function strSplit($str, $length = 1, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length, $cleanUtf8, $tryToUseMbFunction);
        }

        return $str;
    }

    // init
    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    if ($cleanUtf8 === true) {
        $str = $this->clean($str);
    }

    if ($tryToUseMbFunction === true && $this->SUPPORT['mbstring'] === true) {
        $iMax = \mb_strlen($str);
        if ($iMax <= 127) {
            // Short strings: pick the characters one by one.
            $ret = [];
            for ($i = 0; $i < $iMax; ++$i) {
                $ret[] = \mb_substr($str, $i, 1);
            }
        } else {
            // Longer strings: a single UTF-8 aware regex pass.
            $retArray = [];
            preg_match_all('/./us', $str, $retArray);
            $ret = isset($retArray[0]) ? $retArray[0] : [];
        }
    } elseif ($this->SUPPORT['pcre_utf8'] === true) {
        $retArray = [];
        preg_match_all('/./us', $str, $retArray);
        $ret = isset($retArray[0]) ? $retArray[0] : [];
    } else {
        // Fallback: walk the bytes and group them into UTF-8 sequences by
        // inspecting the lead byte; bytes that do not form a well-formed
        // sequence are not added to the result.
        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            } elseif (
                isset($str[$i + 1])
                &&
                ($str[$i] & "\xE0") === "\xC0"
            ) {
                // 110xxxxx 10xxxxxx
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (
                isset($str[$i + 2])
                &&
                ($str[$i] & "\xF0") === "\xE0"
            ) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (
                isset($str[$i + 3])
                &&
                ($str[$i] & "\xF8") === "\xF0"
            ) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        $ret = \array_chunk($ret, $length);

        // The callback takes its argument by value: array_map() passes
        // values, and nothing here modifies the chunk.
        return array_map(
            static function ($item) {
                return implode('', $item);
            },
            $ret
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
1090
|
|
|
|
1091
|
|
|
/**
 * Strip byte sequences that are not shaped like UTF-8 and optionally apply
 * further normalization steps.
 *
 * @param string $str                           The input string.
 * @param bool   $remove_bom                    Run remove_bom().
 * @param bool   $normalize_whitespace          Run normalize_whitespace().
 * @param bool   $normalize_msword              Run normalize_msword().
 * @param bool   $keep_non_breaking_space       Passed to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Run replace_diamond_question_mark()
 *                                              with an empty replacement.
 * @param bool   $remove_invisible_characters   Run remove_invisible_characters().
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-shaped sequences; stray bytes match the
    // other alternatives and are dropped because only "$1" is kept.
    $utf8RunsPattern = '/
        (
        (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
        | [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
        | [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2
        | [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100} # ...one or more times
        )
        | ( [\x80-\xBF] ) # invalid byte in range 10000000 - 10111111
        | ( [\xC0-\xFF] ) # invalid byte in range 11000000 - 11111111
        /x';
    $cleaned = (string)preg_replace($utf8RunsPattern, '$1', $str);

    // The optional steps run in this fixed order.
    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
1131
|
|
|
|
1132
|
6 |
|
/**
 * Replace the Unicode replacement character (U+FFFD) and, optionally,
 * invalid UTF-8 byte sequences.
 *
 * @param string     $str                The input string.
 * @param string|int $replacementChar    Replacement for U+FFFD; '' removes it.
 *                                       For invalid byte sequences only the
 *                                       code point returned by ord() for this
 *                                       value is used as substitute.
 * @param bool       $processInvalidUtf8 Also convert invalid byte sequences
 *                                       via mb_convert_encoding().
 *
 * @return string
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        // mb_substitute_character() only accepts "none" / "long" / "entity"
        // or an integer code point, so a literal character has to be
        // converted first. Keywords and integers are passed through as-is.
        if ($replacementChar === '') {
            $substitute = 'none';
        } elseif (
            \is_string($replacementChar)
            &&
            !\in_array(\strtolower($replacementChar), ['none', 'long', 'entity'], true)
        ) {
            $substitute = $this->ord($replacementChar);
        } else {
            $substitute = $replacementChar;
        }

        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        $save = \mb_substitute_character();
        \mb_substitute_character($substitute);
        try {
            // the polyfill maybe return false, so cast to string
            $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        } finally {
            // Always restore the process-wide substitute setting.
            \mb_substitute_character($save);
        }
    }

    return str_replace(
        [
            "\xEF\xBF\xBD",
            '�',
        ],
        [
            $replacementChar,
            $replacementChar,
        ],
        $str
    );
}
1169
|
|
|
|
1170
|
6 |
|
/**
 * Remove control characters, keeping newline (dec 10), carriage return
 * (dec 13) and horizontal tab (dec 09).
 *
 * The replacement is repeated until nothing matches any more, so sequences
 * that only appear after a removal (e.g. "%%0000") are caught as well.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the URL-encoded forms (%00-%08,
 *                            %0B, %0C, %0E-%1F).
 * @param string $replacement Text inserted in place of each match.
 *
 * @return string
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/', // url encoded 16-31
        ]
        : [];

    // raw bytes 00-08, 11, 12, 14-31, 127
    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

    $replaced = 0;
    do {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $replaced);
    } while ($replaced !== 0);

    return $str;
}
1190
|
|
|
|
1191
|
6 |
|
/**
 * Replace the characters listed in the whitespace table with a plain space
 * and, unless told otherwise, remove bidirectional control characters.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    Leave the 'NO-BREAK SPACE' entry alone.
 * @param bool   $keepBidiUnicodeControls Leave bidi control characters alone.
 *
 * @return string
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    // Both lookup lists are built once and reused by later calls.
    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    if ($str === '') {
        return '';
    }

    $slot = (int)$keepNonBreakingSpace;

    if (!isset($WHITESPACE_CACHE[$slot])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNonBreakingSpace === true) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$slot] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$slot], ' ', $str);
}
1222
|
|
|
|
1223
|
|
|
/**
 * Replace typographic quotes, dashes and the ellipsis (as commonly produced
 * by word processors) with their plain ASCII counterparts.
 *
 * @param string $str The input string (UTF-8).
 *
 * @return string
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 byte sequence => ASCII replacement
    $asciiFor = [
        "\xc2\xab"     => '"', // « (U+00AB)
        "\xc2\xbb"     => '"', // » (U+00BB)
        "\xe2\x80\x98" => "'", // ‘ (U+2018)
        "\xe2\x80\x99" => "'", // ’ (U+2019)
        "\xe2\x80\x9a" => "'", // ‚ (U+201A)
        "\xe2\x80\x9b" => "'", // ‛ (U+201B)
        "\xe2\x80\x9c" => '"', // “ (U+201C)
        "\xe2\x80\x9d" => '"', // ” (U+201D)
        "\xe2\x80\x9e" => '"', // „ (U+201E)
        "\xe2\x80\x9f" => '"', // ‟ (U+201F)
        "\xe2\x80\xb9" => "'", // ‹ (U+2039)
        "\xe2\x80\xba" => "'", // › (U+203A)
        "\xe2\x80\x93" => '-', // – (U+2013)
        "\xe2\x80\x94" => '-', // — (U+2014)
        "\xe2\x80\xa6" => '...', // … (U+2026)
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $str);
}
1267
|
|
|
|
1268
|
6 |
|
/**
 * Strip byte-order marks from the start of a string.
 *
 * Every entry of the BOM table is tried in table order, so more than one
 * leading marker can be removed in a single call.
 *
 * @param string $str The input string.
 *
 * @return string The string without leading BOM(s); '' if nothing is left.
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    $remaining = \strlen($str);

    foreach ($this->BOM as $marker => $markerBytes) {
        // Only a marker at offset 0 counts.
        if (strpos($str, $marker, 0) !== 0) {
            continue;
        }

        $stripped = \substr($str, $markerBytes, $remaining);
        if ($stripped === false) {
            return '';
        }

        $remaining -= (int)$markerBytes;
        $str = (string)$stripped;
    }

    return $str;
}
1289
|
|
|
|
1290
|
|
|
// private function str_detect_encoding($str) |
1291
|
|
|
// { |
1292
|
|
|
// // init |
1293
|
|
|
// $str = (string)$str; |
1294
|
|
|
// |
1295
|
|
|
// // |
1296
|
|
|
// // 1.) check binary strings (010001001...) like UTF-16 / UTF-32 / PDF / Images / ... |
1297
|
|
|
// // |
1298
|
|
|
// |
1299
|
|
|
// if ($this->is_binary($str, true) === true) { |
1300
|
|
|
// $isUtf16 = $this->is_utf16($str, false); |
1301
|
|
|
// if ($isUtf16 === 1) { |
1302
|
|
|
// return 'UTF-16LE'; |
1303
|
|
|
// } |
1304
|
|
|
// if ($isUtf16 === 2) { |
1305
|
|
|
// return 'UTF-16BE'; |
1306
|
|
|
// } |
1307
|
|
|
// |
1308
|
|
|
// $isUtf32 = $this->is_utf32($str, false); |
1309
|
|
|
// if ($isUtf32 === 1) { |
1310
|
|
|
// return 'UTF-32LE'; |
1311
|
|
|
// } |
1312
|
|
|
// if ($isUtf32 === 2) { |
1313
|
|
|
// return 'UTF-32BE'; |
1314
|
|
|
// } |
1315
|
|
|
// |
1316
|
|
|
// // is binary but not "UTF-16" or "UTF-32" |
1317
|
|
|
// return false; |
1318
|
|
|
// } |
1319
|
|
|
// |
1320
|
|
|
// // |
1321
|
|
|
// // 2.) simple check for ASCII chars |
1322
|
|
|
// // |
1323
|
|
|
// |
1324
|
|
|
// if ($this->isAscii($str) === true) { |
1325
|
|
|
// return 'ASCII'; |
1326
|
|
|
// } |
1327
|
|
|
// |
1328
|
|
|
// // |
1329
|
|
|
// // 3.) simple check for UTF-8 chars |
1330
|
|
|
// // |
1331
|
|
|
// |
1332
|
|
|
// if ($this->isUtf8($str) === true) { |
1333
|
|
|
// return 'UTF-8'; |
1334
|
|
|
// } |
1335
|
|
|
// |
1336
|
|
|
// // |
1337
|
|
|
// // 4.) check via "mb_detect_encoding()" |
1338
|
|
|
// // |
1339
|
|
|
// // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "mb_detect_encoding()" |
1340
|
|
|
// |
1341
|
|
|
// $detectOrder = [ |
1342
|
|
|
// 'ISO-8859-1', |
1343
|
|
|
// 'ISO-8859-2', |
1344
|
|
|
// 'ISO-8859-3', |
1345
|
|
|
// 'ISO-8859-4', |
1346
|
|
|
// 'ISO-8859-5', |
1347
|
|
|
// 'ISO-8859-6', |
1348
|
|
|
// 'ISO-8859-7', |
1349
|
|
|
// 'ISO-8859-8', |
1350
|
|
|
// 'ISO-8859-9', |
1351
|
|
|
// 'ISO-8859-10', |
1352
|
|
|
// 'ISO-8859-13', |
1353
|
|
|
// 'ISO-8859-14', |
1354
|
|
|
// 'ISO-8859-15', |
1355
|
|
|
// 'ISO-8859-16', |
1356
|
|
|
// 'WINDOWS-1251', |
1357
|
|
|
// 'WINDOWS-1252', |
1358
|
|
|
// 'WINDOWS-1254', |
1359
|
|
|
// 'CP932', |
1360
|
|
|
// 'CP936', |
1361
|
|
|
// 'CP950', |
1362
|
|
|
// 'CP866', |
1363
|
|
|
// 'CP850', |
1364
|
|
|
// 'CP51932', |
1365
|
|
|
// 'CP50220', |
1366
|
|
|
// 'CP50221', |
1367
|
|
|
// 'CP50222', |
1368
|
|
|
// 'ISO-2022-JP', |
1369
|
|
|
// 'ISO-2022-KR', |
1370
|
|
|
// 'JIS', |
1371
|
|
|
// 'JIS-ms', |
1372
|
|
|
// 'EUC-CN', |
1373
|
|
|
// 'EUC-JP', |
1374
|
|
|
// ]; |
1375
|
|
|
// |
1376
|
|
|
// if ($this->SUPPORT['mbstring'] === true) { |
1377
|
|
|
// // info: do not use the symfony polyfill here |
1378
|
|
|
// $encoding = \mb_detect_encoding($str, $detectOrder, true); |
1379
|
|
|
// if ($encoding) { |
1380
|
|
|
// return $encoding; |
1381
|
|
|
// } |
1382
|
|
|
// } |
1383
|
|
|
// |
1384
|
|
|
// // |
1385
|
|
|
// // 5.) check via "iconv()" |
1386
|
|
|
// // |
1387
|
|
|
// |
1388
|
|
|
// if ($this->ENCODINGS === null) { |
1389
|
|
|
// $this->ENCODINGS = $this->getData('encodings'); |
1390
|
|
|
// } |
1391
|
|
|
// |
1392
|
|
|
// foreach ($this->ENCODINGS as $encodingTmp) { |
1393
|
|
|
// // INFO: //IGNORE but still throw notice |
1394
|
|
|
// /** @noinspection PhpUsageOfSilenceOperatorInspection */ |
1395
|
|
|
// if ((string)@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $str) === $str) { |
1396
|
|
|
// return $encodingTmp; |
1397
|
|
|
// } |
1398
|
|
|
// } |
1399
|
|
|
// |
1400
|
|
|
// return false; |
1401
|
|
|
// } |
1402
|
|
|
|
1403
|
|
|
/**
 * Turn a decimal code point into its character by decoding the matching
 * numeric HTML entity ("&#NNN;").
 *
 * @param int|string $int The decimal code point.
 *
 * @return mixed Whatever htmlEntityDecode() returns for the entity.
 */
private function decimalToChr($int)
{
    $entity = '&#' . $int . ';';
    $flags = \ENT_QUOTES | \ENT_HTML5;

    return $this->htmlEntityDecode($entity, $flags);
}
1407
|
|
|
// |
1408
|
|
|
// private function is_utf16($str, $checkIfStringIsBinary = true) |
1409
|
|
|
// { |
1410
|
|
|
// |
1411
|
|
|
// // init |
1412
|
|
|
// $str = (string)$str; |
1413
|
|
|
// $strChars = []; |
1414
|
|
|
// |
1415
|
|
|
// if ( |
1416
|
|
|
// $checkIfStringIsBinary === true |
1417
|
|
|
// && |
1418
|
|
|
// $this->is_binary($str, true) === false |
1419
|
|
|
// ) { |
1420
|
|
|
// return false; |
1421
|
|
|
// } |
1422
|
|
|
// |
1423
|
|
|
// if ($this->SUPPORT['mbstring'] === false) { |
1424
|
|
|
// \trigger_error('UTF8::is_utf16() without mbstring may did not work correctly', \E_USER_WARNING); |
1425
|
|
|
// } |
1426
|
|
|
// |
1427
|
|
|
// $str = $this->remove_bom($str); |
1428
|
|
|
// |
1429
|
|
|
// |
1430
|
|
|
// $maybeUTF16LE = 0; |
1431
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE'); |
1432
|
|
|
// if ($test) { |
1433
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8'); |
1434
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE'); |
1435
|
|
|
// if ($test3 === $test) { |
1436
|
|
|
// if (\count($strChars) === 0) { |
1437
|
|
|
// $strChars = $this->count_chars($str, true, false); |
1438
|
|
|
// } |
1439
|
|
|
// $countChars = $this->count_chars($test3); |
1440
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
1441
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
1442
|
|
|
// ++$maybeUTF16LE; |
1443
|
|
|
// } |
1444
|
|
|
// unset($countChars[$test3char]); |
1445
|
|
|
// } |
1446
|
|
|
// } |
1447
|
|
|
// } |
1448
|
|
|
// |
1449
|
|
|
// $maybeUTF16BE = 0; |
1450
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE'); |
1451
|
|
|
// if ($test) { |
1452
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8'); |
1453
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE'); |
1454
|
|
|
// if ($test3 === $test) { |
1455
|
|
|
// if (\count($strChars) === 0) { |
1456
|
|
|
// $strChars = $this->count_chars($str, true, false); |
1457
|
|
|
// } |
1458
|
|
|
// $countChars = $this->count_chars($test3); |
1459
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
1460
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
1461
|
|
|
// ++$maybeUTF16BE; |
1462
|
|
|
// } |
1463
|
|
|
// unset($countChars[$test3char]); |
1464
|
|
|
// } |
1465
|
|
|
// |
1466
|
|
|
// } |
1467
|
|
|
// } |
1468
|
|
|
// |
1469
|
|
|
// if ($maybeUTF16BE !== $maybeUTF16LE) { |
1470
|
|
|
// if ($maybeUTF16LE > $maybeUTF16BE) { |
1471
|
|
|
// return 1; |
1472
|
|
|
// } |
1473
|
|
|
// |
1474
|
|
|
// return 2; |
1475
|
|
|
// } |
1476
|
|
|
// |
1477
|
|
|
// return false; |
1478
|
|
|
// } |
1479
|
|
|
|
1480
|
|
|
/**
 * Check if the string is UTF-32.
 *
 * The string is decoded as UTF-32LE and as UTF-32BE; for each byte order
 * that survives a round trip, the distinct decoded characters that also
 * occur in the raw input are counted. The byte order with the higher count
 * wins; a tie means "not UTF-32".
 *
 * @param mixed $str <p>The input string.</p>
 * @param bool  $checkIfStringIsBinary <p>Return false right away when is_binary() rejects the input.</p>
 *
 * @return false|int
 *                   <strong>false</strong> if it's not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE
 */
private function is_utf32($str, $checkIfStringIsBinary = true)
{
    // init
    $str = (string)$str;

    if ($checkIfStringIsBinary === true && $this->is_binary($str, true) === false) {
        return false;
    }

    if ($this->SUPPORT['mbstring'] === false) {
        \trigger_error('UTF8::is_utf32() without mbstring may did not work correctly', \E_USER_WARNING);
    }

    $str = $this->remove_bom($str);

    // Character => count map of the raw input, built lazily on first use and
    // shared by both byte-order checks.
    $strChars = null;

    $score = function ($byteOrder) use ($str, &$strChars) {
        $decoded = \mb_convert_encoding($str, 'UTF-8', $byteOrder);
        if (!$decoded) {
            return 0;
        }

        // Only trust the decoding if it survives a round trip.
        $reencoded = \mb_convert_encoding($decoded, $byteOrder, 'UTF-8');
        $roundTrip = \mb_convert_encoding($reencoded, 'UTF-8', $byteOrder);
        if ($roundTrip !== $decoded) {
            return 0;
        }

        if ($strChars === null) {
            $strChars = $this->count_chars($str, true, false);
        }

        $hits = 0;
        foreach ($this->count_chars($roundTrip) as $char => $unusedCount) {
            // count_chars() is keyed by character, so test the key;
            // in_array() would compare the character with the counts.
            if (isset($strChars[$char])) {
                ++$hits;
            }
        }

        return $hits;
    };

    $maybeUTF32LE = $score('UTF-32LE');
    $maybeUTF32BE = $score('UTF-32BE');

    if ($maybeUTF32BE === $maybeUTF32LE) {
        return false;
    }

    return $maybeUTF32LE > $maybeUTF32BE ? 1 : 2;
}
1555
|
|
|
|
1556
|
|
|
/**
 * Heuristically decide whether the input is binary data.
 *
 * Checks, in order: a string made only of "0"/"1" characters, a known
 * binary file signature (get_file_type()), more than 25% NUL bytes and,
 * in strict mode, the MIME encoding reported by ext-fileinfo.
 *
 * @param mixed $input  The data to inspect (cast to string).
 * @param bool  $strict Also consult ext-fileinfo.
 *
 * @throws \RuntimeException In strict mode when ext-fileinfo is missing and
 *                           none of the earlier checks matched.
 *
 * @return bool
 */
private function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if ($input === '') {
        return false;
    }

    // textual bit string such as "010001001"
    if (preg_match('~^[01]+$~', $input)) {
        return true;
    }

    $fileType = $this->get_file_type($input);
    if ($fileType['type'] === 'binary') {
        return true;
    }

    // share of NUL bytes in the whole input
    $byteCount = \strlen($input);
    $nulCount = \substr_count($input, "\x0", 0, $byteCount);
    if (($nulCount / $byteCount) > 0.25) {
        return true;
    }

    if ($strict !== true) {
        return false;
    }

    if ($this->SUPPORT['finfo'] === false) {
        throw new \RuntimeException('ext-fileinfo: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    $mimeEncoding = (new \finfo(\FILEINFO_MIME_ENCODING))->buffer($input);

    return $mimeEncoding === 'binary';
}
1592
|
|
|
|
1593
|
|
|
/**
 * Guess a file type from the first two bytes of the given data.
 *
 * The decimal values of the two bytes are concatenated (e.g. "%P" => 37 and
 * 80 => 3780) and looked up in a table of known signatures.
 *
 * @param string $str      The data to inspect.
 * @param array  $fallback Returned when the data is shorter than two bytes
 *                         or the signature is unknown.
 *
 * @return array Array with the keys 'ext', 'mime' and 'type'.
 */
private function get_file_type(
    $str,
    $fallback = [
        'ext' => null,
        'mime' => 'application/octet-stream',
        'type' => null,
    ]
) {
    if ($str === '') {
        return $fallback;
    }

    $head = \substr($str, 0, 2);
    if ($head === false || \strlen($head) !== 2) {
        return $fallback;
    }

    $bytes = \unpack('C2chars', $head);
    if ($bytes === false) {
        return $fallback;
    }

    $signature = (int)($bytes['chars1'] . $bytes['chars2']);

    // signature => [extension, mime type]; every known entry is binary
    $known = [
        3780 => ['pdf', 'application/pdf'],
        7790 => ['exe', 'application/octet-stream'],
        7784 => ['midi', 'audio/x-midi'],
        8075 => ['zip', 'application/zip'],
        8297 => ['rar', 'application/rar'],
        255216 => ['jpg', 'image/jpeg'],
        7173 => ['gif', 'image/gif'],
        6677 => ['bmp', 'image/bmp'],
        13780 => ['png', 'image/png'],
    ];

    if (!isset($known[$signature])) {
        return $fallback;
    }

    return [
        'ext' => $known[$signature][0],
        'mime' => $known[$signature][1],
        'type' => 'binary',
    ];
}
1681
|
|
|
|
1682
|
|
|
/**
 * Count how often each character occurs in a string.
 *
 * @param string $str                The input string.
 * @param bool   $cleanUtf8          Passed to strSplit().
 * @param bool   $tryToUseMbFunction Passed to strSplit().
 *
 * @return array Map of character => number of occurrences.
 */
private function count_chars($str, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    $characters = $this->strSplit($str, 1, $cleanUtf8, $tryToUseMbFunction);

    return array_count_values($characters);
}
1686
|
|
|
|
1687
|
|
|
// private function to_iso8859($str) |
1688
|
|
|
// { |
1689
|
|
|
// if (is_array($str) === true) { |
1690
|
|
|
// |
1691
|
|
|
// foreach ($str as $key => $value) { |
1692
|
|
|
// $str[$k] = $this->to_iso8859($value); |
1693
|
|
|
// } |
1694
|
|
|
// |
1695
|
|
|
// return $str; |
1696
|
|
|
// } |
1697
|
|
|
// |
1698
|
|
|
// $str = (string)$str; |
1699
|
|
|
// if ($str === '') { |
1700
|
|
|
// return ''; |
1701
|
|
|
// } |
1702
|
|
|
// |
1703
|
|
|
// return $this->utf8_decode($str); |
1704
|
|
|
// } |
1705
|
|
|
|
1706
|
|
|
/** |
1707
|
|
|
* Checks whether the passed string contains only byte sequences that appear valid UTF-8 characters. |
1708
|
|
|
* |
1709
|
|
|
* @see http://hsivonen.iki.fi/php-utf8/ |
1710
|
|
|
* |
1711
|
|
|
* @param string|string[] $str <p>The string to be checked.</p> |
1712
|
|
|
* @param bool $strict <p>Check also if the string is not UTF-16 or UTF-32.</p> |
|
|
|
|
1713
|
|
|
* |
1714
|
|
|
* @return bool |
1715
|
|
|
*/ |
1716
|
|
|
private function isUtf8($str) |
|
|
|
|
1717
|
|
|
{ |
1718
|
|
|
if (\is_array($str) === true) { |
1719
|
|
|
foreach ($str as $v) { |
1720
|
|
|
if ($this->isUtf8($v) === false) { |
1721
|
|
|
return false; |
1722
|
|
|
} |
1723
|
|
|
} |
1724
|
|
|
|
1725
|
|
|
return true; |
1726
|
|
|
} |
1727
|
|
|
|
1728
|
|
|
if ($str === '') { |
1729
|
|
|
return true; |
1730
|
|
|
} |
1731
|
|
|
|
1732
|
|
|
if ($this->system->pcre_utf8_support() !== true) { |
|
|
|
|
1733
|
|
|
|
1734
|
|
|
// If even just the first character can be matched, when the /u |
1735
|
|
|
// modifier is used, then it's valid UTF-8. If the UTF-8 is somehow |
1736
|
|
|
// invalid, nothing at all will match, even if the string contains |
1737
|
|
|
// some valid sequences |
1738
|
|
|
return preg_match('/^.{1}/us', $str, $ar) === 1; |
|
|
|
|
1739
|
|
|
} |
1740
|
|
|
|
1741
|
|
|
$mState = 0; // cached expected number of octets after the current octet |
1742
|
|
|
// until the beginning of the next UTF8 character sequence |
1743
|
|
|
$mUcs4 = 0; // cached Unicode character |
1744
|
|
|
$mBytes = 1; // cached expected number of octets in the current sequence |
1745
|
|
|
|
1746
|
|
|
if ($this->ORD === null) { |
1747
|
|
|
$this->ORD = $this->getData('ord'); |
1748
|
|
|
} |
1749
|
|
|
|
1750
|
|
|
$len = \strlen((string)$str); |
1751
|
|
|
/** @noinspection ForeachInvariantsInspection */ |
1752
|
|
|
for ($i = 0; $i < $len; ++$i) { |
1753
|
|
|
$in = $this->ORD[$str[$i]]; |
|
|
|
|
1754
|
|
|
if ($mState === 0) { |
1755
|
|
|
// When mState is zero we expect either a US-ASCII character or a |
1756
|
|
|
// multi-octet sequence. |
1757
|
|
|
if ((0x80 & $in) === 0) { |
1758
|
|
|
// US-ASCII, pass straight through. |
1759
|
|
|
$mBytes = 1; |
1760
|
|
|
} elseif ((0xE0 & $in) === 0xC0) { |
1761
|
|
|
// First octet of 2 octet sequence. |
1762
|
|
|
$mUcs4 = $in; |
1763
|
|
|
$mUcs4 = ($mUcs4 & 0x1F) << 6; |
1764
|
|
|
$mState = 1; |
1765
|
|
|
$mBytes = 2; |
1766
|
|
|
} elseif ((0xF0 & $in) === 0xE0) { |
1767
|
|
|
// First octet of 3 octet sequence. |
1768
|
|
|
$mUcs4 = $in; |
1769
|
|
|
$mUcs4 = ($mUcs4 & 0x0F) << 12; |
1770
|
|
|
$mState = 2; |
1771
|
|
|
$mBytes = 3; |
1772
|
|
|
} elseif ((0xF8 & $in) === 0xF0) { |
1773
|
|
|
// First octet of 4 octet sequence. |
1774
|
|
|
$mUcs4 = $in; |
1775
|
|
|
$mUcs4 = ($mUcs4 & 0x07) << 18; |
1776
|
|
|
$mState = 3; |
1777
|
|
|
$mBytes = 4; |
1778
|
|
|
} elseif ((0xFC & $in) === 0xF8) { |
1779
|
|
|
/* First octet of 5 octet sequence. |
1780
|
|
|
* |
1781
|
|
|
* This is illegal because the encoded codepoint must be either |
1782
|
|
|
* (a) not the shortest form or |
1783
|
|
|
* (b) outside the Unicode range of 0-0x10FFFF. |
1784
|
|
|
* Rather than trying to resynchronize, we will carry on until the end |
1785
|
|
|
* of the sequence and let the later error handling code catch it. |
1786
|
|
|
*/ |
1787
|
|
|
$mUcs4 = $in; |
1788
|
|
|
$mUcs4 = ($mUcs4 & 0x03) << 24; |
1789
|
|
|
$mState = 4; |
1790
|
|
|
$mBytes = 5; |
1791
|
|
|
} elseif ((0xFE & $in) === 0xFC) { |
1792
|
|
|
// First octet of 6 octet sequence, see comments for 5 octet sequence. |
1793
|
|
|
$mUcs4 = $in; |
1794
|
|
|
$mUcs4 = ($mUcs4 & 1) << 30; |
1795
|
|
|
$mState = 5; |
1796
|
|
|
$mBytes = 6; |
1797
|
|
|
} else { |
|
|
|
|
1798
|
|
|
// Current octet is neither in the US-ASCII range nor a legal first |
1799
|
|
|
// octet of a multi-octet sequence. |
1800
|
|
|
return false; |
1801
|
|
|
} |
1802
|
|
|
} elseif ((0xC0 & $in) === 0x80) { |
|
|
|
|
1803
|
|
|
|
1804
|
|
|
// When mState is non-zero, we expect a continuation of the multi-octet |
1805
|
|
|
// sequence |
1806
|
|
|
|
1807
|
|
|
// Legal continuation. |
1808
|
|
|
$shift = ($mState - 1) * 6; |
1809
|
|
|
$tmp = $in; |
1810
|
|
|
$tmp = ($tmp & 0x0000003F) << $shift; |
1811
|
|
|
$mUcs4 |= $tmp; |
1812
|
|
|
// Prefix: End of the multi-octet sequence. mUcs4 now contains the final |
1813
|
|
|
// Unicode code point to be output. |
1814
|
|
|
if (--$mState === 0) { |
1815
|
|
|
// Check for illegal sequences and code points. |
1816
|
|
|
// |
1817
|
|
|
// From Unicode 3.1, non-shortest form is illegal |
1818
|
|
|
if ( |
1819
|
|
|
($mBytes === 2 && $mUcs4 < 0x0080) |
1820
|
|
|
|| |
1821
|
|
|
($mBytes === 3 && $mUcs4 < 0x0800) |
1822
|
|
|
|| |
1823
|
|
|
($mBytes === 4 && $mUcs4 < 0x10000) |
1824
|
|
|
|| |
1825
|
|
|
($mBytes > 4) |
1826
|
|
|
|| |
1827
|
|
|
// From Unicode 3.2, surrogate characters are illegal. |
1828
|
|
|
(($mUcs4 & 0xFFFFF800) === 0xD800) |
1829
|
|
|
|| |
1830
|
|
|
// Code points outside the Unicode range are illegal. |
1831
|
|
|
($mUcs4 > 0x10FFFF) |
1832
|
|
|
) { |
1833
|
|
|
return false; |
1834
|
|
|
} |
1835
|
|
|
// initialize UTF8 cache |
1836
|
|
|
$mState = 0; |
1837
|
|
|
$mUcs4 = 0; |
1838
|
|
|
$mBytes = 1; |
1839
|
|
|
} |
1840
|
|
|
} else { |
|
|
|
|
1841
|
|
|
// ((0xC0 & (*in) != 0x80) && (mState != 0)) |
1842
|
|
|
// Incomplete multi-octet sequence. |
1843
|
|
|
return false; |
1844
|
|
|
} |
1845
|
|
|
} |
1846
|
|
|
|
1847
|
|
|
return true; |
1848
|
|
|
} |
1849
|
|
|
|
1850
|
|
|
/**
 * Decodes an UTF-8 string to ISO-8859-1.
 *
 * The string is rewritten in place, one output byte per input sequence:
 *  - a two-byte sequence (lead byte 0xC0-0xDF) becomes the single byte of its
 *    code point when that code point is below 256, otherwise "?";
 *  - a three-byte (lead 0xE0-0xEF) or four-byte (lead 0xF0-0xFF) sequence
 *    always becomes "?";
 *  - every other byte is copied unchanged.
 *
 * Continuation bytes are not validated; only their low six bits are used.
 *
 * @param string $str           <p>The input string.</p>
 * @param bool   $keepUtf8Chars <p>When true, the original string is returned untouched
 *                              if decoding did not reduce its length as reported by
 *                              stringLength().</p>
 *
 * @return string
 */
private function utf8_decode($str, $keepUtf8Chars = false)
{
    if ($str === '') {
        return '';
    }

    // Keep the untouched input; it is what gets returned when $keepUtf8Chars applies.
    $str_backup = $str;
    $len = \strlen($str);

    // Lazily load the lookup tables.
    // NOTE(review): presumably 'ord' maps a single byte to its integer value and
    // 'chr' maps an integer back to a single byte - confirm against getData().
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    $noCharFound = '?';

    // $i walks the input bytes, $j is the write position of the (never longer) output.
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0, $j = 0; $i < $len; ++$i, ++$j) {
        // The high nibble of the current byte selects the sequence length.
        switch ($str[$i] & "\xF0") {
            case "\xC0":
            case "\xD0":
                if ($i + 1 >= $len) {
                    // Truncated two-byte sequence at the end of the input: there is no
                    // continuation byte to read, so emit the placeholder instead of
                    // reading past the end of the string.
                    $str[$j] = $noCharFound;

                    break;
                }

                // Combine the 5 payload bits of the lead byte with the 6 payload bits
                // of the continuation byte.
                $c = ($this->ORD[$str[$i] & "\x1F"] << 6) | $this->ORD[$str[$i + 1] & "\x3F"];
                ++$i;
                $str[$j] = $c < 256 ? $this->CHR[$c] : $noCharFound;

                break;

            /** @noinspection PhpMissingBreakStatementInspection */
            case "\xF0":
                // Four-byte sequence: skip one extra byte, then share the three-byte handling.
                ++$i;

            // no break

            case "\xE0":
                // Not representable in ISO-8859-1: emit the placeholder and skip the
                // remaining continuation bytes.
                $str[$j] = $noCharFound;
                $i += 2;

                break;

            default:
                $str[$j] = $str[$i];
        }
    }

    // Cut off the stale tail left behind by the in-place rewrite.
    $return = substr($str, 0, $j);
    if ($return === false) {
        $return = '';
    }

    if (
        $keepUtf8Chars === true
        &&
        $this->stringLength($return) >= (int)$this->stringLength($str_backup)
    ) {
        return $str_backup;
    }

    return $return;
}
1919
|
|
|
|
1920
|
|
|
/**
 * Returns the length of a string that is treated as UTF-8.
 *
 * The first available strategy wins, in this order: mbstring, iconv, intl,
 * plain strlen() for ASCII-only input, and finally a PCRE based count.
 * A strategy that reports a failure is skipped and the next one is tried.
 *
 * NOTE(review): the intl branch uses grapheme_strlen(), which counts grapheme
 * clusters; for strings with combining sequences that can presumably differ
 * from the counts of the other branches - confirm this is acceptable.
 *
 * @param string $str <p>The input string.</p>
 *
 * @return false|int <p>The length, 0 for an empty string, or false when the PCRE
 *                   fallback fails or matches nothing.</p>
 */
private function stringLength($str)
{
    if ($str === '') {
        return 0;
    }

    // $this->SUPPORT starts out as an empty array, so every flag is checked with
    // isset() first to avoid an undefined-index notice when it was never populated.
    if (isset($this->SUPPORT['mbstring']) && $this->SUPPORT['mbstring'] === true) {
        return mb_strlen($str, 'UTF-8');
    }

    if (isset($this->SUPPORT['iconv']) && $this->SUPPORT['iconv'] === true) {
        $returnTmp = \iconv_strlen($str, 'UTF-8');
        if ($returnTmp !== false) {
            return $returnTmp;
        }
    }

    if (isset($this->SUPPORT['intl']) && $this->SUPPORT['intl'] === true) {
        $returnTmp = \grapheme_strlen($str);
        // grapheme_strlen() may report failure as false or null; only an integer
        // is a usable result, anything else falls through to the next strategy.
        if (\is_int($returnTmp)) {
            return $returnTmp;
        }
    }

    if ($this->isAscii($str)) {
        return strlen($str);
    }

    //
    // fallback via vanilla php
    //

    // preg_match_all() returns the number of full matches (one per character here),
    // or false on failure, e.g. when the subject is not valid UTF-8.
    $returnTmp = \preg_match_all('/./us', $str);
    if ($returnTmp === false || $returnTmp === 0) {
        return false;
    }

    return $returnTmp;
}
1969
|
|
|
|
1970
|
|
|
|
1971
|
|
|
} |
1972
|
|
|
|
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, and the start of each new word is marked by a capital letter. Thus the name "database connection string" becomes `databaseConnectionString`.