|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace devtoolboxuk\soteria\voku\Resources; |
|
4
|
|
|
|
|
5
|
|
|
class Utf8 extends Resources |
|
|
|
|
|
|
6
|
|
|
{ |
|
7
|
|
|
|
|
8
|
|
|
private $system; |
|
9
|
|
|
private $ENCODINGS; |
|
10
|
|
|
private $SUPPORT = []; |
|
11
|
|
|
private $BROKEN_UTF8_FIX; |
|
12
|
|
|
private $ORD; |
|
13
|
|
|
private $CHR; |
|
14
|
|
|
private $WIN1252_TO_UTF8; |
|
15
|
|
|
private $BOM = [ |
|
16
|
|
|
"\xef\xbb\xbf" => 3, // UTF-8 BOM |
|
17
|
|
|
'' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
|
18
|
|
|
"\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
|
19
|
|
|
' þÿ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
|
20
|
|
|
"\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
|
21
|
|
|
'ÿþ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
|
22
|
|
|
"\xfe\xff" => 2, // UTF-16 (BE) BOM |
|
23
|
|
|
'þÿ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
|
24
|
|
|
"\xff\xfe" => 2, // UTF-16 (LE) BOM |
|
25
|
|
|
'ÿþ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
|
26
|
|
|
]; |
|
27
|
|
|
|
|
28
|
|
|
private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
|
29
|
|
|
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
|
30
|
|
|
8234 => "\xE2\x80\xAA", |
|
31
|
|
|
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
|
32
|
|
|
8235 => "\xE2\x80\xAB", |
|
33
|
|
|
// POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
|
34
|
|
|
8236 => "\xE2\x80\xAC", |
|
35
|
|
|
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
|
36
|
|
|
8237 => "\xE2\x80\xAD", |
|
37
|
|
|
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
|
38
|
|
|
8238 => "\xE2\x80\xAE", |
|
39
|
|
|
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
|
40
|
|
|
8294 => "\xE2\x81\xA6", |
|
41
|
|
|
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
|
42
|
|
|
8295 => "\xE2\x81\xA7", |
|
43
|
|
|
// FIRST STRONG ISOLATE // (use -> dir = "auto") |
|
44
|
|
|
8296 => "\xE2\x81\xA8", |
|
45
|
|
|
// POP DIRECTIONAL ISOLATE |
|
46
|
|
|
8297 => "\xE2\x81\xA9", |
|
47
|
|
|
]; |
|
48
|
|
|
// private $WHITESPACE = [ |
|
49
|
|
|
// // NUL Byte |
|
50
|
|
|
// 0 => "\x0", |
|
51
|
|
|
// // Tab |
|
52
|
|
|
// 9 => "\x9", |
|
53
|
|
|
// // New Line |
|
54
|
|
|
// 10 => "\xa", |
|
55
|
|
|
// // Vertical Tab |
|
56
|
|
|
// 11 => "\xb", |
|
57
|
|
|
// // Carriage Return |
|
58
|
|
|
// 13 => "\xd", |
|
59
|
|
|
// // Ordinary Space |
|
60
|
|
|
// 32 => "\x20", |
|
61
|
|
|
// // NO-BREAK SPACE |
|
62
|
|
|
// 160 => "\xc2\xa0", |
|
63
|
|
|
// // OGHAM SPACE MARK |
|
64
|
|
|
// 5760 => "\xe1\x9a\x80", |
|
65
|
|
|
// // MONGOLIAN VOWEL SEPARATOR |
|
66
|
|
|
// 6158 => "\xe1\xa0\x8e", |
|
67
|
|
|
// // EN QUAD |
|
68
|
|
|
// 8192 => "\xe2\x80\x80", |
|
69
|
|
|
// // EM QUAD |
|
70
|
|
|
// 8193 => "\xe2\x80\x81", |
|
71
|
|
|
// // EN SPACE |
|
72
|
|
|
// 8194 => "\xe2\x80\x82", |
|
73
|
|
|
// // EM SPACE |
|
74
|
|
|
// 8195 => "\xe2\x80\x83", |
|
75
|
|
|
// // THREE-PER-EM SPACE |
|
76
|
|
|
// 8196 => "\xe2\x80\x84", |
|
77
|
|
|
// // FOUR-PER-EM SPACE |
|
78
|
|
|
// 8197 => "\xe2\x80\x85", |
|
79
|
|
|
// // SIX-PER-EM SPACE |
|
80
|
|
|
// 8198 => "\xe2\x80\x86", |
|
81
|
|
|
// // FIGURE SPACE |
|
82
|
|
|
// 8199 => "\xe2\x80\x87", |
|
83
|
|
|
// // PUNCTUATION SPACE |
|
84
|
|
|
// 8200 => "\xe2\x80\x88", |
|
85
|
|
|
// // THIN SPACE |
|
86
|
|
|
// 8201 => "\xe2\x80\x89", |
|
87
|
|
|
// //HAIR SPACE |
|
88
|
|
|
// 8202 => "\xe2\x80\x8a", |
|
89
|
|
|
// // LINE SEPARATOR |
|
90
|
|
|
// 8232 => "\xe2\x80\xa8", |
|
91
|
|
|
// // PARAGRAPH SEPARATOR |
|
92
|
|
|
// 8233 => "\xe2\x80\xa9", |
|
93
|
|
|
// // NARROW NO-BREAK SPACE |
|
94
|
|
|
// 8239 => "\xe2\x80\xaf", |
|
95
|
|
|
// // MEDIUM MATHEMATICAL SPACE |
|
96
|
|
|
// 8287 => "\xe2\x81\x9f", |
|
97
|
|
|
// // IDEOGRAPHIC SPACE |
|
98
|
|
|
// 12288 => "\xe3\x80\x80", |
|
99
|
|
|
// ]; |
|
100
|
|
|
/** |
|
101
|
|
|
* @var array |
|
102
|
|
|
*/ |
|
103
|
|
|
private $WHITESPACE_TABLE = [ |
|
104
|
|
|
'SPACE' => "\x20", |
|
105
|
|
|
'NO-BREAK SPACE' => "\xc2\xa0", |
|
106
|
|
|
'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
|
107
|
|
|
'EN QUAD' => "\xe2\x80\x80", |
|
108
|
|
|
'EM QUAD' => "\xe2\x80\x81", |
|
109
|
|
|
'EN SPACE' => "\xe2\x80\x82", |
|
110
|
|
|
'EM SPACE' => "\xe2\x80\x83", |
|
111
|
|
|
'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
|
112
|
|
|
'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
|
113
|
|
|
'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
|
114
|
|
|
'FIGURE SPACE' => "\xe2\x80\x87", |
|
115
|
|
|
'PUNCTUATION SPACE' => "\xe2\x80\x88", |
|
116
|
|
|
'THIN SPACE' => "\xe2\x80\x89", |
|
117
|
|
|
'HAIR SPACE' => "\xe2\x80\x8a", |
|
118
|
|
|
'LINE SEPARATOR' => "\xe2\x80\xa8", |
|
119
|
|
|
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
|
120
|
|
|
'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
|
121
|
|
|
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
|
122
|
|
|
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
|
123
|
|
|
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
|
124
|
|
|
]; |
|
125
|
|
|
|
|
126
|
6 |
|
/**
 * Creates the helper and probes the runtime once for optional extensions.
 *
 * Instantiates the System helper and then fills $this->SUPPORT through
 * checkForSupport().
 */
public function __construct()
{
    $this->system = new System();
    $this->checkForSupport();
}
|
131
|
|
|
|
|
132
|
6 |
|
/**
 * Records which optional extensions / features are available.
 *
 * The results are stored in $this->SUPPORT. The probe runs only once per
 * instance: the 'already_checked_via_portable_utf8' key marks it as done.
 * When mbstring (or the symfony polyfill) is reported as available, the
 * mbstring internal encoding is switched to UTF-8 as a side effect.
 *
 * @return void
 */
private function checkForSupport()
{
    // Nothing to do when the probe already ran for this instance.
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }
    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    $system = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $this->SUPPORT['mbstring'] = $system->mbstring_loaded();
    $this->SUPPORT['mbstring_func_overload'] = $system->mbstring_overloaded();
    if ($this->SUPPORT['mbstring'] === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $system->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $this->SUPPORT['intl'] = $system->intl_loaded();

    // The transliterator id list stays empty unless intl really provides it.
    $transliteratorIds = [];
    if ($this->SUPPORT['intl'] === true && \function_exists('transliterator_list_ids') === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $transliteratorIds = \transliterator_list_ids();
    }
    $this->SUPPORT['intl__transliterator_list_ids'] = $transliteratorIds;

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $system->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $system->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $system->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $system->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $system->pcre_utf8_support();

    $this->SUPPORT['symfony_polyfill_used'] = $system->symfony_polyfill_used();
    if ($this->SUPPORT['symfony_polyfill_used'] === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
|
186
|
|
|
|
|
187
|
6 |
|
/**
 * URL-decodes a string and also resolves HTML entities and "%uXXXX" escapes.
 *
 * Strings without any of the markers "&", "%", "+" or "\u" are only passed
 * through fixSimpleUtf8(). Otherwise "%uXXXX" sequences are rewritten to
 * hexadecimal HTML entities, and the string is then run through
 * toUtf8() -> htmlEntityDecode() -> rawurldecode() -> fixSimpleUtf8().
 *
 * @param string $str          The input string.
 * @param bool   $multi_decode true: repeat the decode pass until the string
 *                             stops changing; anything else: decode exactly once.
 *
 * @return string The decoded string.
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // Fast path: nothing in the string can be an encoded sequence.
    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // Non-standard "%uXXXX" escapes become "&#xXXXX;" so the entity decoder handles them.
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        $str = (string)preg_replace($pattern, '&#x\\1;', rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    if ($multi_decode === true) {
        // Keep decoding until a pass no longer changes the string (nested encodings).
        do {
            $str_compare = $str;

            /**
             * @psalm-suppress PossiblyInvalidArgument
             */
            $str = $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags)));
        } while ($str_compare !== $str);
    } else {
        // Single pass: previously this case skipped decoding entirely.
        $str = $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags)));
    }

    return $str;
}
|
217
|
|
|
|
|
218
|
6 |
|
/**
 * Replaces known broken UTF-8 sequences with their corrected form.
 *
 * The search/replace lists come from the 'utf8_fix' data file. They are
 * loaded lazily and kept in static variables, so the table is split into
 * keys and values only once.
 *
 * @param string $str The input string.
 *
 * @return string The string with every table key replaced by its value.
 */
private function fixSimpleUtf8($str)
{
    static $searchList = null;
    static $replaceList = null;

    if ($str === '') {
        return '';
    }

    if ($searchList === null) {
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $fixTable = $this->BROKEN_UTF8_FIX;
        $searchList = array_keys($fixTable);
        $replaceList = array_values($fixTable);
    }

    return str_replace($searchList, $replaceList, $str);
}
|
238
|
|
|
|
|
239
|
2 |
|
/**
 * Loads a PHP data file from the sibling "Data" directory.
 *
 * @param string $file File name without the ".php" extension.
 *
 * @return mixed Whatever the included file returns.
 */
private function getData($file)
{
    $path = __DIR__ . '/../Data/' . $file . '.php';

    return include $path;
}
|
244
|
|
|
|
|
245
|
6 |
|
/**
 * Decodes HTML entities, repeating until the string no longer changes.
 *
 * Each pass decodes numeric entities first (via mbstring when available,
 * otherwise via a regex callback), appends a missing ";" to unterminated
 * numeric entities and finally calls html_entity_decode().
 *
 * @param string   $str      The input string.
 * @param int|null $flags    html_entity_decode() flags; null means ENT_QUOTES | ENT_HTML5.
 * @param string   $encoding Target encoding, normalized unless it is 'UTF-8' or 'CP850'.
 *
 * @return string The decoded string.
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    // Too short to hold an entity (examples: "&;" or "&x;") or no "&" at all.
    if (!isset($str[3]) || strpos($str, '&') === false) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    if (
        !in_array($encoding, ['UTF-8', 'ISO-8859-1', 'WINDOWS-1252'], true)
        && $this->SUPPORT['mbstring'] === false
    ) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $convmap = [0x80, 0xfffff, 0, 0xfffff, 0];

    do {
        $previous = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            $str = $encoding === 'UTF-8'
                ? mb_decode_numericentity($str, $convmap)
                : mb_decode_numericentity($str, $convmap, $encoding);
        } else {
            $str = (string)preg_replace_callback(
                "/&#\d{2,6};/",
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $decoded = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');

                    // Quote characters stay encoded at this stage.
                    if ($decoded === '"' || $decoded === "'") {
                        return $matches[0];
                    }

                    return $decoded;
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // decode also numeric & UTF16 two byte entities
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($previous !== $str);

    return $str;
}
|
309
|
|
|
|
|
310
|
|
|
/**
 * Maps an encoding name or alias to a canonical encoding name.
 *
 * Resolution order: fixed aliases, the static per-name cache, the list from
 * the 'encodings' data file (exact match), and finally an alias table that
 * is looked up with the upper-cased name stripped of everything except
 * letters, digits and whitespace. Names that match nothing are returned
 * upper-cased.
 *
 * @param mixed  $encoding The encoding name; it is cast to string.
 * @param string $fallback Returned for empty input and for the values '0' and '1'.
 *
 * @return string The normalized encoding name or $fallback.
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $resolved = [];

    $encoding = (string)$encoding;

    if (!$encoding) {
        return $fallback;
    }

    // Aliases with a fixed answer, checked before any cache or data-file access.
    $fixedAliases = [
        'UTF-8'         => 'UTF-8',
        'UTF8'          => 'UTF-8',
        '8BIT'          => 'CP850',
        'BINARY'        => 'CP850',
        'HTML'          => 'HTML-ENTITIES',
        'HTML-ENTITIES' => 'HTML-ENTITIES',
    ];
    if (isset($fixedAliases[$encoding])) {
        return $fixedAliases[$encoding];
    }

    // only a fallback, for non "strict_types" usage ...
    if (in_array($encoding, ['1', '0'], true)) {
        return $fallback;
    }

    if (isset($resolved[$encoding])) {
        return $resolved[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    if (in_array($encoding, $this->ENCODINGS, true)) {
        $resolved[$encoding] = $encoding;

        return $encoding;
    }

    $originalName = $encoding;
    $encoding = strtoupper($encoding);
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $encoding);

    $equivalences = [
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5', // Cyrillic
        'ISO88596'    => 'ISO-8859-6', // Arabic
        'ISO88597'    => 'ISO-8859-7', // Greek
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    ];

    // Every table value is a non-empty string, so isset() is sufficient here.
    if (isset($equivalences[$aliasKey])) {
        $encoding = $equivalences[$aliasKey];
    }

    // Cache under the name the caller used, not the upper-cased one.
    $resolved[$originalName] = $encoding;

    return $encoding;
}
|
433
|
|
|
|
|
434
|
6 |
|
/**
 * Converts a string (or every element of an array) to UTF-8.
 *
 * Bytes that already form a 2-, 3- or 4-byte UTF-8 sequence are copied as
 * they are; any other byte >= 0x80 is converted by toUtf8ConvertHelper().
 * Afterwards "\uXXXX" escape sequences, including surrogate pairs, are
 * replaced by the characters they describe.
 *
 * @param mixed $str A string, or an array that is converted recursively;
 *                   other values are cast to string.
 *
 * @return string|array The converted value; '' when the regex replacement fails.
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->toUtf8($value);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $max = \strlen($str);
    $buf = '';

    for ($i = 0; $i < $max; ++$i) {
        $c1 = $str[$i];

        if ($c1 >= "\xC0") { // should be converted to UTF8, if it's not UTF8 already
            if ($c1 <= "\xDF") { // looks like 2 bytes UTF8
                // A missing follow-up byte is treated as "\x00", which never counts as a continuation byte.
                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];

                if ($c2 >= "\x80" && $c2 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2;
                    ++$i;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } elseif ($c1 >= "\xE0" && $c1 <= "\xEF") { // looks like 3 bytes UTF8
                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $str[$i + 2];

                if ($c2 >= "\x80" && $c2 <= "\xBF" && $c3 >= "\x80" && $c3 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2 . $c3;
                    $i += 2;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } elseif ($c1 >= "\xF0" && $c1 <= "\xF7") { // looks like 4 bytes UTF8
                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $str[$i + 2];
                $c4 = $i + 3 >= $max ? "\x00" : $str[$i + 3];

                if ($c2 >= "\x80" && $c2 <= "\xBF" && $c3 >= "\x80" && $c3 <= "\xBF" && $c4 >= "\x80" && $c4 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2 . $c3 . $c4;
                    $i += 3;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } else { // doesn't look like UTF8, but should be converted
                $buf .= $this->toUtf8ConvertHelper($c1);
            }
        } elseif (($c1 & "\xC0") === "\x80") { // needs conversion
            $buf .= $this->toUtf8ConvertHelper($c1);
        } else { // it doesn't need conversion
            $buf .= $c1;
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $buf = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                // Single "\uXXXX" escape.
                $cp = (int)hexdec($matches[3]);
            } else {
                // Surrogate pair, see http://unicode.org/faq/utf_bom.html#utf16-4
                $cp = ((int)hexdec($matches[1]) << 10)
                    + (int)hexdec($matches[2])
                    + 0x10000
                    - (0xD800 << 10)
                    - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($cp < 0x80) {
                return (string)$this->chr($cp);
            }

            if ($cp < 0xA0) {
                // Build the two raw UTF-8 bytes with the native chr(). $this->chr()
                // UTF-8-encodes every value above 127, so using it for the
                // individual bytes double-encoded the character.
                return \chr(0xC0 | ($cp >> 6)) . \chr(0x80 | ($cp & 0x3F));
            }

            return $this->decimalToChr($cp);
        },
        $buf
    );

    // preg_replace_callback() returns null on a PCRE error.
    if ($buf === null) {
        return '';
    }

    return $buf;
}
|
549
|
|
|
|
|
550
|
|
|
/**
 * Converts a single non-UTF-8 byte into a UTF-8 sequence.
 *
 * Bytes listed in the 'win1252_to_utf8' table are replaced by the mapped
 * sequence; every other byte is expanded into a two-byte sequence built
 * with byte-wise string operators.
 *
 * @param string $input A single byte. NOTE(review): presumably always one of
 *                      the keys of the 'ord' table - confirm against the data file.
 *
 * @return string The UTF-8 representation of the byte.
 */
private function toUtf8ConvertHelper($input)
{
    // Lookup tables are loaded lazily from the data files.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    if (isset($this->WIN1252_TO_UTF8[$ordC1])) { // found in Windows-1252 special cases
        return (string)$this->WIN1252_TO_UTF8[$ordC1];
    }

    // Explicit integer cast: a float array key is truncated implicitly, which
    // is deprecated since PHP 8.1. The resulting index is unchanged.
    $cc1 = $this->CHR[(int)($ordC1 / 64)] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
|
578
|
|
|
|
|
579
|
1 |
|
/**
 * Returns the character for a Unicode code point.
 *
 * Code points up to 127 come from the 'chr' lookup table. Larger ones are
 * produced by IntlChar::chr() when that class is available, otherwise the
 * UTF-8 bytes are assembled from the lookup table. Results are cached in a
 * static array per code point and encoding.
 *
 * @param int    $code_point The code point.
 * @param string $encoding   Target encoding, normalized unless it is 'UTF-8' or 'CP850'.
 *
 * @return string|null The character. NOTE(review): IntlChar::chr() may return
 *                     null for an invalid code point - confirm callers cope.
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        // Fully qualified on purpose: an unqualified class name would be looked
        // up inside this file's namespace, where no IntlChar class exists.
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        // Reached only when the integer cast lowered the value (e.g. a float such as 127.5).
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // Two-byte sequence.
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // Three-byte sequence.
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // Four-byte sequence.
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
            $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
            $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
            $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
|
673
|
|
|
|
|
674
|
|
|
/**
 * Encodes a string for one of the special target "encodings".
 *
 * Handled targets: 'JSON' (jsonEncode()), 'BASE64' (base64_encode()) and
 * 'HTML-ENTITIES' (htmlEncode()). For every other target the string is
 * returned unchanged - this method performs no character-set conversion -
 * and a warning is raised when the target is not UTF-8 / ISO-8859-1 /
 * WINDOWS-1252 and mbstring is unavailable.
 *
 * @param string $toEncoding Target encoding, normalized unless it is 'UTF-8' or 'CP850'.
 * @param string $str        The input string.
 *
 * @throws \InvalidArgumentException When the string cannot be JSON-encoded.
 *
 * @return string The encoded string, or $str unchanged.
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            // Fully qualified: the unqualified name would resolve inside this
            // file's namespace and fail with "class not found".
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', \E_USER_WARNING);
    }

    // No source-encoding detection or conversion is implemented here.
    return $str;
}
|
772
|
|
|
|
|
773
|
|
|
/**
 * JSON-encodes a value after passing it through filter().
 *
 * @param mixed $value The value to encode.
 *
 * @throws \RuntimeException When the json extension is flagged as missing.
 *
 * @return string|false The JSON string, or false when json_encode() fails.
 */
private function jsonEncode($value)
{
    // The value is filtered before the extension check, as in the original order.
    $filtered = $this->filter($value);

    $jsonMissing = $this->SUPPORT['json'] === false;
    if ($jsonMissing) {
        throw new \RuntimeException('ext-json: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    return json_encode($filtered, 0, 512);
}
|
784
|
|
|
|
|
785
|
|
|
/**
 * Normalizes strings, recursing into arrays and objects.
 *
 * Strings get unified line endings when they contain "\r"; strings that
 * isAscii() rejects are brought into the requested Unicode normalization
 * form, and a string that starts with a combining mark is prefixed with
 * $leading_combining. Array elements and the accessible properties of
 * objects are processed recursively. Other types are returned as they are.
 *
 * @param mixed  $var                The value to filter.
 * @param int    $normalization_form One of the \Normalizer form constants.
 * @param string $leading_combining  Prefix placed before a leading combining mark.
 *
 * @return mixed The filtered value; objects are modified in place and returned.
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            // Write the filtered values back to the object itself. The former
            // code stored them in an unrelated, undefined array, so objects
            // were never actually filtered.
            foreach (get_object_vars($var) as $key => $value) {
                $filtered = $this->filter($value, $normalization_form, $leading_combining);

                // Assign only on change, so untouched properties are never rewritten.
                if ($filtered !== $value) {
                    $var->{$key} = $filtered;
                }
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // Non-empty marker: lets the leading-combining check below run.
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // Normalization produced nothing usable; keep the string via encode().
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
|
840
|
|
|
|
|
841
|
|
|
/**
 * Convert Windows ("\r\n") and old Mac ("\r") line endings to "\n".
 *
 * @param string $str The input text.
 *
 * @return string The text with "\n" line endings only.
 */
private function normalizeLineEnding($str)
{
    // "\r\n" is listed first so a CRLF pair collapses into a single "\n".
    $lineBreaks = ["\r\n", "\r"];

    return str_replace($lineBreaks, "\n", $str);
}
|
845
|
|
|
|
|
846
|
|
|
/**
 * Check whether a string consists only of the allowed 7-bit characters.
 *
 * Allowed are the printable range 0x20-0x7E plus the bytes 0x09, 0x0A,
 * 0x0D, 0x10 and 0x13. The empty string counts as ASCII.
 *
 * NOTE(review): 0x10 and 0x13 look like decimal 10/13 written as hex;
 * the pattern is kept exactly as it was.
 *
 * @param string $str The string to test.
 *
 * @return bool True when no disallowed byte is found.
 */
private function isAscii($str)
{
    if ($str === '') {
        return true;
    }

    $foundOther = preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);

    return !$foundOther;
}
|
854
|
|
|
|
|
855
|
|
|
/**
 * Encode the characters of a string as numeric HTML entities ("&#123;").
 *
 * @param string $str            The input string.
 * @param bool   $keepAsciiChars When true, characters below 0x80 are left as-is.
 * @param string $encoding       Encoding of $str; anything other than
 *                               'UTF-8' / 'CP850' is passed through
 *                               normalize_encoding() first.
 *
 * @return string The entity-encoded string ('' for empty input).
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if ($this->SUPPORT['mbstring'] === true) {
        $startCode = $keepAsciiChars === true ? 0x80 : 0x00;

        // A convmap is a list of 4-tuples [start, end, offset, mask].
        // The former 5th element was ignored by PHP 7 and makes PHP 8 throw
        // a ValueError, so it is dropped. The encoding is always passed
        // explicitly so the result does not depend on mb_internal_encoding().
        return mb_encode_numericentity(
            $str,
            [$startCode, 0xfffff, 0, 0xfffff],
            $encoding
        );
    }

    // Fallback without mbstring: encode character by character.
    return implode(
        '',
        \array_map(
            function (string $chr) use ($keepAsciiChars, $encoding) {
                return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
            },
            $this->strSplit($str)
        )
    );
}
|
896
|
|
|
|
|
897
|
|
|
/**
 * Encode a single character as a numeric HTML entity.
 *
 * @param string $char           The character to encode.
 * @param bool   $keepAsciiChars When true, a character accepted by isAscii()
 *                               is returned unchanged.
 * @param string $encoding       Encoding handed to ord().
 *
 * @return string '' for empty input, the character itself, or "&#<code>;".
 */
private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($char === '') {
        return '';
    }

    // isAscii() is only consulted when the caller asked to keep ASCII.
    $keepVerbatim = $keepAsciiChars === true && $this->isAscii($char) === true;

    return $keepVerbatim
        ? $char
        : '&#' . $this->ord($char, $encoding) . ';';
}
|
909
|
|
|
|
|
910
|
|
|
/**
 * Return the code point of the first character of a string.
 *
 * Lookup order: per-call-site static cache, the "ord" data table,
 * \IntlChar::ord() (when SUPPORT['intlChar'] is true), and finally manual
 * decoding of up to four UTF-8 bytes.
 *
 * @param string $chr      The character.
 * @param string $encoding Encoding of $chr; anything other than
 *                         'UTF-8' / 'CP850' is passed through
 *                         normalize_encoding() first.
 *
 * @return int The code point (0 when no byte could be unpacked).
 */
private function ord($chr, $encoding = 'UTF-8')
{
    static $memo = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // The key is built from the character as passed in, before any re-encoding.
    $memoKey = $chr . $encoding;
    if (isset($memo[$memoKey]) === true) {
        return $memo[$memoKey];
    }

    // check again, if it's still not UTF-8
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    // Lazily load the lookup table.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        return $memo[$memoKey] = $this->ORD[$chr];
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $intlCode = \IntlChar::ord($chr);
        if ($intlCode) {
            return $memo[$memoKey] = $intlCode;
        }
    }

    //
    // fallback via vanilla php
    //

    // unpack('C*') yields a 1-based array of byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        $codePoint = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        $codePoint = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        $codePoint = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte (or truncated sequence): the lead byte itself
        $codePoint = $lead;
    }

    return $memo[$memoKey] = $codePoint;
}
|
976
|
|
|
|
|
977
|
|
|
/**
 * Split a string into an array of characters (or chunks of characters).
 *
 * @param string|string[] $str                The input; arrays are processed
 *                                            element by element (recursively).
 * @param int             $length             Characters per chunk; values <= 0
 *                                            yield an empty array.
 * @param bool            $cleanUtf8          Run clean() on the string first.
 * @param bool            $tryToUseMbFunction Prefer the mbstring code path when
 *                                            SUPPORT['mbstring'] is true.
 *
 * @return array List of characters/chunks, or an array of such lists when
 *               $str was an array.
 */
private function strSplit($str, $length = 1, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length, $cleanUtf8, $tryToUseMbFunction);
        }

        return $str;
    }

    // init
    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    if ($cleanUtf8 === true) {
        $str = $this->clean($str);
    }

    if ($tryToUseMbFunction === true && $this->SUPPORT['mbstring'] === true) {
        $iMax = \mb_strlen($str);
        if ($iMax <= 127) {
            // Short strings: one mb_substr() call per character.
            $ret = [];
            for ($i = 0; $i < $iMax; ++$i) {
                $ret[] = \mb_substr($str, $i, 1);
            }
        } else {
            // Longer strings: a single regex pass.
            $retArray = [];
            preg_match_all('/./us', $str, $retArray);
            $ret = isset($retArray[0]) ? $retArray[0] : [];
        }
    } elseif ($this->SUPPORT['pcre_utf8'] === true) {
        $retArray = [];
        preg_match_all('/./us', $str, $retArray);
        $ret = isset($retArray[0]) ? $retArray[0] : [];
    } else {
        // fallback: decode UTF-8 sequences byte by byte; bytes that do not
        // form a complete sequence are skipped.
        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            } elseif (
                isset($str[$i + 1])
                &&
                ($str[$i] & "\xE0") === "\xC0"
            ) {
                // 110xxxxx 10xxxxxx
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (
                isset($str[$i + 2])
                &&
                ($str[$i] & "\xF0") === "\xE0"
            ) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (
                isset($str[$i + 3])
                &&
                ($str[$i] & "\xF8") === "\xF0"
            ) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        // array_map() hands each chunk over by value, so the callback must
        // not declare a by-reference parameter (PHP 8 warns about that).
        return array_map(
            static function ($item) {
                return implode('', $item);
            },
            \array_chunk($ret, $length)
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
|
1090
|
|
|
|
|
1091
|
|
|
/**
 * Strip bytes that are not part of a well-formed UTF-8 sequence and
 * optionally apply further clean-up steps.
 *
 * The optional steps run in this fixed order: replacement-character
 * removal, invisible-character removal, whitespace normalization,
 * MS Word character normalization, BOM removal.
 *
 * @param string $str                           The input string.
 * @param bool   $remove_bom                    Call remove_bom().
 * @param bool   $normalize_whitespace          Call normalize_whitespace().
 * @param bool   $normalize_msword              Call normalize_msword().
 * @param bool   $keep_non_breaking_space       Forwarded to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Call replace_diamond_question_mark() with ''.
 * @param bool   $remove_invisible_characters   Call remove_invisible_characters().
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-formed sequences; stray bytes match the
    // other alternatives and vanish because only "$1" is written back.
    $validUtf8Pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';
    $cleaned = (string)preg_replace($validUtf8Pattern, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
|
1131
|
|
|
|
|
1132
|
6 |
|
/**
 * Replace the Unicode replacement character (U+FFFD) in a string.
 *
 * With $processInvalidUtf8 enabled the string is first round-tripped
 * through mb_convert_encoding() (UTF-8 to UTF-8) with the mbstring
 * substitute character temporarily set to the replacement, so invalid
 * byte sequences are substituted as well.
 *
 * @param string $str                The input string.
 * @param string $replacementChar    The replacement ('' removes the character).
 * @param bool   $processInvalidUtf8 Also substitute invalid UTF-8 sequences.
 *
 * @return string The processed string ('' for empty input).
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        // mb_substitute_character() spells "drop the character" as 'none'.
        $substitute = $replacementChar === '' ? 'none' : $replacementChar;

        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        // Swap the global substitute character in and restore it afterwards.
        $previousSubstitute = \mb_substitute_character();
        \mb_substitute_character($substitute);
        // the polyfill maybe return false, so cast to string
        $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        \mb_substitute_character($previousSubstitute);
    }

    $needles = [
        "\xEF\xBF\xBD",
        '�',
    ];

    return str_replace($needles, $replacementChar, $str);
}
|
1169
|
|
|
|
|
1170
|
6 |
|
/**
 * Remove control characters from a string.
 *
 * Newline (dec 10), carriage return (dec 13) and horizontal tab (dec 09)
 * are kept. Replacement is repeated until a pass changes nothing, so
 * sequences that only appear after an earlier removal are caught too.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the url-encoded forms (%00-%08,
 *                            %0b, %0c, %0e, %0f, %10-%1f).
 * @param string $replacement Text inserted in place of each match.
 *
 * @return string The string without the matched characters.
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/',   // url encoded 16-31
        ]
        : [];

    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    do {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $replaced);
    } while ($replaced !== 0);

    return $str;
}
|
1190
|
|
|
|
|
1191
|
6 |
|
/**
 * Replace the characters listed in WHITESPACE_TABLE with a plain space and,
 * unless told otherwise, strip the bidirectional control characters listed
 * in BIDI_UNI_CODE_CONTROLS_TABLE.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    Leave the 'NO-BREAK SPACE' entry alone.
 * @param bool   $keepBidiUnicodeControls Do not strip the bidi control characters.
 *
 * @return string The normalized string ('' for empty input).
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    // Search lists are built once and reused across calls.
    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    if ($str === '') {
        return '';
    }

    $cacheKey = (int)$keepNonBreakingSpace;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNonBreakingSpace === true) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
|
1222
|
|
|
|
|
1223
|
|
|
/**
 * Replace typographic quotes, dashes and the ellipsis (as commonly produced
 * by MS Word) with their plain ASCII counterparts.
 *
 * @param string $str The input string (UTF-8).
 *
 * @return string The string with the characters replaced ('' for empty input).
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 byte sequence => ASCII replacement
    $replacements = [
        "\xc2\xab"     => '"',   // « (U+00AB)
        "\xc2\xbb"     => '"',   // » (U+00BB)
        "\xe2\x80\x98" => "'",   // ‘ (U+2018)
        "\xe2\x80\x99" => "'",   // ’ (U+2019)
        "\xe2\x80\x9a" => "'",   // ‚ (U+201A)
        "\xe2\x80\x9b" => "'",   // ‛ (U+201B)
        "\xe2\x80\x9c" => '"',   // “ (U+201C)
        "\xe2\x80\x9d" => '"',   // ” (U+201D)
        "\xe2\x80\x9e" => '"',   // „ (U+201E)
        "\xe2\x80\x9f" => '"',   // ‟ (U+201F)
        "\xe2\x80\xb9" => "'",   // ‹ (U+2039)
        "\xe2\x80\xba" => "'",   // › (U+203A)
        "\xe2\x80\x93" => '-',   // – (U+2013)
        "\xe2\x80\x94" => '-',   // — (U+2014)
        "\xe2\x80\xa6" => '...', // … (U+2026)
    ];

    return str_replace(array_keys($replacements), array_values($replacements), $str);
}
|
1267
|
|
|
|
|
1268
|
6 |
|
/**
 * Strip byte-order marks from the start of a string.
 *
 * Every marker in the BOM table is tried in table order; a marker that
 * matches the current start of the string is removed before the next one
 * is tested.
 *
 * @param string $str The input string.
 *
 * @return string The string without the leading BOM(s).
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    foreach (\array_keys($this->BOM) as $bomString) {
        $bomString = (string)$bomString;

        // Strip exactly the bytes that were matched instead of trusting the
        // length stored in the table, and skip an empty marker: an empty
        // needle "matches" at offset 0 on PHP 8 and warns on PHP 7.
        $bomLength = \strlen($bomString);
        if ($bomLength === 0) {
            continue;
        }

        // Prefix comparison only; there is no need to scan the whole string.
        if (\strncmp($str, $bomString, $bomLength) !== 0) {
            continue;
        }

        $strTmp = \substr($str, $bomLength);
        if ($strTmp === false) {
            // PHP < 8: nothing is left after the BOM.
            return '';
        }

        $str = (string)$strTmp;
    }

    return $str;
}
|
1289
|
|
|
|
|
1290
|
|
|
// private function str_detect_encoding($str) |
|
1291
|
|
|
// { |
|
1292
|
|
|
// // init |
|
1293
|
|
|
// $str = (string)$str; |
|
1294
|
|
|
// |
|
1295
|
|
|
// // |
|
1296
|
|
|
// // 1.) check binary strings (010001001...) like UTF-16 / UTF-32 / PDF / Images / ... |
|
1297
|
|
|
// // |
|
1298
|
|
|
// |
|
1299
|
|
|
// if ($this->is_binary($str, true) === true) { |
|
1300
|
|
|
// $isUtf16 = $this->is_utf16($str, false); |
|
1301
|
|
|
// if ($isUtf16 === 1) { |
|
1302
|
|
|
// return 'UTF-16LE'; |
|
1303
|
|
|
// } |
|
1304
|
|
|
// if ($isUtf16 === 2) { |
|
1305
|
|
|
// return 'UTF-16BE'; |
|
1306
|
|
|
// } |
|
1307
|
|
|
// |
|
1308
|
|
|
// $isUtf32 = $this->is_utf32($str, false); |
|
1309
|
|
|
// if ($isUtf32 === 1) { |
|
1310
|
|
|
// return 'UTF-32LE'; |
|
1311
|
|
|
// } |
|
1312
|
|
|
// if ($isUtf32 === 2) { |
|
1313
|
|
|
// return 'UTF-32BE'; |
|
1314
|
|
|
// } |
|
1315
|
|
|
// |
|
1316
|
|
|
// // is binary but not "UTF-16" or "UTF-32" |
|
1317
|
|
|
// return false; |
|
1318
|
|
|
// } |
|
1319
|
|
|
// |
|
1320
|
|
|
// // |
|
1321
|
|
|
// // 2.) simple check for ASCII chars |
|
1322
|
|
|
// // |
|
1323
|
|
|
// |
|
1324
|
|
|
// if ($this->isAscii($str) === true) { |
|
1325
|
|
|
// return 'ASCII'; |
|
1326
|
|
|
// } |
|
1327
|
|
|
// |
|
1328
|
|
|
// // |
|
1329
|
|
|
// // 3.) simple check for UTF-8 chars |
|
1330
|
|
|
// // |
|
1331
|
|
|
// |
|
1332
|
|
|
// if ($this->isUtf8($str) === true) { |
|
1333
|
|
|
// return 'UTF-8'; |
|
1334
|
|
|
// } |
|
1335
|
|
|
// |
|
1336
|
|
|
// // |
|
1337
|
|
|
// // 4.) check via "mb_detect_encoding()" |
|
1338
|
|
|
// // |
|
1339
|
|
|
// // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "mb_detect_encoding()" |
|
1340
|
|
|
// |
|
1341
|
|
|
// $detectOrder = [ |
|
1342
|
|
|
// 'ISO-8859-1', |
|
1343
|
|
|
// 'ISO-8859-2', |
|
1344
|
|
|
// 'ISO-8859-3', |
|
1345
|
|
|
// 'ISO-8859-4', |
|
1346
|
|
|
// 'ISO-8859-5', |
|
1347
|
|
|
// 'ISO-8859-6', |
|
1348
|
|
|
// 'ISO-8859-7', |
|
1349
|
|
|
// 'ISO-8859-8', |
|
1350
|
|
|
// 'ISO-8859-9', |
|
1351
|
|
|
// 'ISO-8859-10', |
|
1352
|
|
|
// 'ISO-8859-13', |
|
1353
|
|
|
// 'ISO-8859-14', |
|
1354
|
|
|
// 'ISO-8859-15', |
|
1355
|
|
|
// 'ISO-8859-16', |
|
1356
|
|
|
// 'WINDOWS-1251', |
|
1357
|
|
|
// 'WINDOWS-1252', |
|
1358
|
|
|
// 'WINDOWS-1254', |
|
1359
|
|
|
// 'CP932', |
|
1360
|
|
|
// 'CP936', |
|
1361
|
|
|
// 'CP950', |
|
1362
|
|
|
// 'CP866', |
|
1363
|
|
|
// 'CP850', |
|
1364
|
|
|
// 'CP51932', |
|
1365
|
|
|
// 'CP50220', |
|
1366
|
|
|
// 'CP50221', |
|
1367
|
|
|
// 'CP50222', |
|
1368
|
|
|
// 'ISO-2022-JP', |
|
1369
|
|
|
// 'ISO-2022-KR', |
|
1370
|
|
|
// 'JIS', |
|
1371
|
|
|
// 'JIS-ms', |
|
1372
|
|
|
// 'EUC-CN', |
|
1373
|
|
|
// 'EUC-JP', |
|
1374
|
|
|
// ]; |
|
1375
|
|
|
// |
|
1376
|
|
|
// if ($this->SUPPORT['mbstring'] === true) { |
|
1377
|
|
|
// // info: do not use the symfony polyfill here |
|
1378
|
|
|
// $encoding = \mb_detect_encoding($str, $detectOrder, true); |
|
1379
|
|
|
// if ($encoding) { |
|
1380
|
|
|
// return $encoding; |
|
1381
|
|
|
// } |
|
1382
|
|
|
// } |
|
1383
|
|
|
// |
|
1384
|
|
|
// // |
|
1385
|
|
|
// // 5.) check via "iconv()" |
|
1386
|
|
|
// // |
|
1387
|
|
|
// |
|
1388
|
|
|
// if ($this->ENCODINGS === null) { |
|
1389
|
|
|
// $this->ENCODINGS = $this->getData('encodings'); |
|
1390
|
|
|
// } |
|
1391
|
|
|
// |
|
1392
|
|
|
// foreach ($this->ENCODINGS as $encodingTmp) { |
|
1393
|
|
|
// // INFO: //IGNORE but still throw notice |
|
1394
|
|
|
// /** @noinspection PhpUsageOfSilenceOperatorInspection */ |
|
1395
|
|
|
// if ((string)@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $str) === $str) { |
|
1396
|
|
|
// return $encodingTmp; |
|
1397
|
|
|
// } |
|
1398
|
|
|
// } |
|
1399
|
|
|
// |
|
1400
|
|
|
// return false; |
|
1401
|
|
|
// } |
|
1402
|
|
|
|
|
1403
|
|
|
/**
 * Turn a decimal code point into its character by decoding the matching
 * numeric HTML entity.
 *
 * @param int|string $int The decimal code point.
 *
 * @return string Whatever htmlEntityDecode() returns for "&#<int>;".
 */
private function decimalToChr($int)
{
    $entity = '&#' . $int . ';';

    return $this->htmlEntityDecode($entity, \ENT_QUOTES | \ENT_HTML5);
}
|
1407
|
|
|
// |
|
1408
|
|
|
// private function is_utf16($str, $checkIfStringIsBinary = true) |
|
1409
|
|
|
// { |
|
1410
|
|
|
// |
|
1411
|
|
|
// // init |
|
1412
|
|
|
// $str = (string)$str; |
|
1413
|
|
|
// $strChars = []; |
|
1414
|
|
|
// |
|
1415
|
|
|
// if ( |
|
1416
|
|
|
// $checkIfStringIsBinary === true |
|
1417
|
|
|
// && |
|
1418
|
|
|
// $this->is_binary($str, true) === false |
|
1419
|
|
|
// ) { |
|
1420
|
|
|
// return false; |
|
1421
|
|
|
// } |
|
1422
|
|
|
// |
|
1423
|
|
|
// if ($this->SUPPORT['mbstring'] === false) { |
|
1424
|
|
|
// \trigger_error('UTF8::is_utf16() without mbstring may did not work correctly', \E_USER_WARNING); |
|
1425
|
|
|
// } |
|
1426
|
|
|
// |
|
1427
|
|
|
// $str = $this->remove_bom($str); |
|
1428
|
|
|
// |
|
1429
|
|
|
// |
|
1430
|
|
|
// $maybeUTF16LE = 0; |
|
1431
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE'); |
|
1432
|
|
|
// if ($test) { |
|
1433
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8'); |
|
1434
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE'); |
|
1435
|
|
|
// if ($test3 === $test) { |
|
1436
|
|
|
// if (\count($strChars) === 0) { |
|
1437
|
|
|
// $strChars = $this->count_chars($str, true, false); |
|
1438
|
|
|
// } |
|
1439
|
|
|
// $countChars = $this->count_chars($test3); |
|
1440
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
|
1441
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
|
1442
|
|
|
// ++$maybeUTF16LE; |
|
1443
|
|
|
// } |
|
1444
|
|
|
// unset($countChars[$test3char]); |
|
1445
|
|
|
// } |
|
1446
|
|
|
// } |
|
1447
|
|
|
// } |
|
1448
|
|
|
// |
|
1449
|
|
|
// $maybeUTF16BE = 0; |
|
1450
|
|
|
// $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE'); |
|
1451
|
|
|
// if ($test) { |
|
1452
|
|
|
// $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8'); |
|
1453
|
|
|
// $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE'); |
|
1454
|
|
|
// if ($test3 === $test) { |
|
1455
|
|
|
// if (\count($strChars) === 0) { |
|
1456
|
|
|
// $strChars = $this->count_chars($str, true, false); |
|
1457
|
|
|
// } |
|
1458
|
|
|
// $countChars = $this->count_chars($test3); |
|
1459
|
|
|
// foreach ($countChars as $test3char => $test3charEmpty) { |
|
1460
|
|
|
// if (\in_array($test3char, $strChars, true) === true) { |
|
1461
|
|
|
// ++$maybeUTF16BE; |
|
1462
|
|
|
// } |
|
1463
|
|
|
// unset($countChars[$test3char]); |
|
1464
|
|
|
// } |
|
1465
|
|
|
// |
|
1466
|
|
|
// } |
|
1467
|
|
|
// } |
|
1468
|
|
|
// |
|
1469
|
|
|
// if ($maybeUTF16BE !== $maybeUTF16LE) { |
|
1470
|
|
|
// if ($maybeUTF16LE > $maybeUTF16BE) { |
|
1471
|
|
|
// return 1; |
|
1472
|
|
|
// } |
|
1473
|
|
|
// |
|
1474
|
|
|
// return 2; |
|
1475
|
|
|
// } |
|
1476
|
|
|
// |
|
1477
|
|
|
// return false; |
|
1478
|
|
|
// } |
|
1479
|
|
|
|
|
1480
|
|
|
/**
 * Check if the string is UTF-32.
 *
 * The string is decoded once as UTF-32LE and once as UTF-32BE; each
 * decoding that survives a round trip is scored, and the byte order with
 * the higher score wins.
 *
 * @param mixed $str                   <p>The input string.</p>
 * @param bool  $checkIfStringIsBinary <p>Return false right away when
 *                                     is_binary() does not report binary data.</p>
 *
 * @return false|int
 *                   <strong>false</strong> if it is not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE
 */
private function is_utf32($str, $checkIfStringIsBinary = true)
{
    // init
    $str = (string)$str;
    $strChars = [];

    if ($checkIfStringIsBinary === true && $this->is_binary($str, true) === false) {
        return false;
    }

    if ($this->SUPPORT['mbstring'] === false) {
        \trigger_error('UTF8::is_utf32() without mbstring may did not work correctly', \E_USER_WARNING);
    }

    $str = $this->remove_bom($str);

    // Little endian is evaluated first, then big endian.
    $scores = ['UTF-32LE' => 0, 'UTF-32BE' => 0];

    foreach (\array_keys($scores) as $candidate) {
        $decoded = \mb_convert_encoding($str, 'UTF-8', $candidate);
        if (!$decoded) {
            continue;
        }

        // Round trip: the decoding must be stable to count at all.
        $reEncoded = \mb_convert_encoding($decoded, $candidate, 'UTF-8');
        $reDecoded = \mb_convert_encoding($reEncoded, 'UTF-8', $candidate);
        if ($reDecoded !== $decoded) {
            continue;
        }

        // Computed at most once and shared by both candidates.
        if (\count($strChars) === 0) {
            $strChars = $this->count_chars($str, true, false);
        }

        $countChars = $this->count_chars($reDecoded);
        foreach ($countChars as $decodedChar => $unusedCount) {
            // NOTE(review): this is a strict search of the character among
            // the *values* (occurrence counts) of $strChars - confirm that
            // a key lookup was not intended.
            if (\in_array($decodedChar, $strChars, true) === true) {
                ++$scores[$candidate];
            }
            unset($countChars[$decodedChar]);
        }
    }

    if ($scores['UTF-32BE'] === $scores['UTF-32LE']) {
        return false;
    }

    return $scores['UTF-32LE'] > $scores['UTF-32BE'] ? 1 : 2;
}
|
1555
|
|
|
|
|
1556
|
|
|
/**
 * Heuristically decide whether the input is binary data.
 *
 * Checks, in order: a string made only of "0"/"1" characters, a file
 * signature recognised by get_file_type(), more than 25% NUL bytes and,
 * in strict mode, the MIME encoding reported by fileinfo.
 *
 * @param mixed $input  The value to inspect (cast to string).
 * @param bool  $strict Additionally consult ext-fileinfo.
 *
 * @throws \RuntimeException In strict mode when SUPPORT['finfo'] is false.
 *
 * @return bool True when the input is considered binary.
 */
private function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if ($input === '') {
        return false;
    }

    // A string of binary digits, e.g. "0100100".
    if (preg_match('~^[01]+$~', $input)) {
        return true;
    }

    $fileType = $this->get_file_type($input);
    if ($fileType['type'] === 'binary') {
        return true;
    }

    // Share of NUL bytes in the whole input.
    $byteCount = \strlen($input);
    $nullCount = \substr_count($input, "\x0", 0, $byteCount);
    if (($nullCount / $byteCount) > 0.25) {
        return true;
    }

    if ($strict !== true) {
        return false;
    }

    if ($this->SUPPORT['finfo'] === false) {
        throw new \RuntimeException('ext-fileinfo: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    $finfo_encoding = (new \finfo(\FILEINFO_MIME_ENCODING))->buffer($input);

    return $finfo_encoding && $finfo_encoding === 'binary';
}
|
1592
|
|
|
|
|
1593
|
|
|
/**
 * Guess a file type from the first two bytes of a string.
 *
 * The two byte values are concatenated as decimal numbers (e.g. "%P" =>
 * 37 and 80 => 3780) and looked up in a table of known signatures.
 *
 * @param string $str      The data to inspect.
 * @param array  $fallback Returned when the input is empty, shorter than
 *                         two bytes, or the signature is unknown.
 *
 * @return array Array with the keys 'ext', 'mime' and 'type' for a known
 *               signature, otherwise $fallback.
 */
private function get_file_type(
    $str,
    $fallback = [
        'ext'  => null,
        'mime' => 'application/octet-stream',
        'type' => null,
    ]
) {
    if ($str === '') {
        return $fallback;
    }

    $head = \substr($str, 0, 2);
    if ($head === false || \strlen($head) !== 2) {
        return $fallback;
    }

    $bytes = \unpack('C2chars', $head);
    if ($bytes === false) {
        return $fallback;
    }

    $signature = (int)($bytes['chars1'] . $bytes['chars2']);

    // signature => [extension, mime type]; every known entry is binary
    $known = [
        3780   => ['pdf', 'application/pdf'],
        7790   => ['exe', 'application/octet-stream'],
        7784   => ['midi', 'audio/x-midi'],
        8075   => ['zip', 'application/zip'],
        8297   => ['rar', 'application/rar'],
        255216 => ['jpg', 'image/jpeg'],
        7173   => ['gif', 'image/gif'],
        6677   => ['bmp', 'image/bmp'],
        13780  => ['png', 'image/png'],
    ];

    if (!isset($known[$signature])) {
        return $fallback;
    }

    list($ext, $mime) = $known[$signature];

    return [
        'ext'  => $ext,
        'mime' => $mime,
        'type' => 'binary',
    ];
}
|
1681
|
|
|
|
|
1682
|
|
|
/**
 * Count how often each character occurs in a string.
 *
 * @param string $str                The input string.
 * @param bool   $cleanUtf8          Forwarded to strSplit().
 * @param bool   $tryToUseMbFunction Forwarded to strSplit().
 *
 * @return array Map of character => number of occurrences.
 */
private function count_chars($str, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    $characters = $this->strSplit($str, 1, $cleanUtf8, $tryToUseMbFunction);

    return array_count_values($characters);
}
|
1686
|
|
|
|
|
1687
|
|
|
// private function to_iso8859($str) |
|
1688
|
|
|
// { |
|
1689
|
|
|
// if (is_array($str) === true) { |
|
1690
|
|
|
// |
|
1691
|
|
|
// foreach ($str as $key => $value) { |
|
1692
|
|
|
//            $str[$key] = $this->to_iso8859($value);
|
1693
|
|
|
// } |
|
1694
|
|
|
// |
|
1695
|
|
|
// return $str; |
|
1696
|
|
|
// } |
|
1697
|
|
|
// |
|
1698
|
|
|
// $str = (string)$str; |
|
1699
|
|
|
// if ($str === '') { |
|
1700
|
|
|
// return ''; |
|
1701
|
|
|
// } |
|
1702
|
|
|
// |
|
1703
|
|
|
// return $this->utf8_decode($str); |
|
1704
|
|
|
// } |
|
1705
|
|
|
|
|
1706
|
|
|
/**
 * Checks whether the passed value contains only byte sequences that form valid UTF-8 characters.
 *
 * Arrays are checked element by element (recursively); the array is valid only if every
 * element is valid. An empty string is considered valid.
 *
 * @see http://hsivonen.iki.fi/php-utf8/
 *
 * @param string|string[] $str <p>The string (or array of strings) to be checked.</p>
 *
 * @return bool
 */
private function isUtf8($str)
{
    if (\is_array($str) === true) {
        foreach ($str as $v) {
            if ($this->isUtf8($v) === false) {
                return false;
            }
        }

        return true;
    }

    // Work on a real string so the byte-offset access ($str[$i]) below is well defined.
    $str = (string)$str;
    if ($str === '') {
        return true;
    }

    // NOTE(review): pcre_utf8_support() is defined outside this block; this assumes it
    // returns true when the /u modifier is usable. The original condition was inverted
    // (it ran the /u regex only when support was NOT reported).
    if ($this->system->pcre_utf8_support() === true) {
        // If even just the first character can be matched, when the /u
        // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
        // invalid, nothing at all will match, even if the string contains
        // some valid sequences
        return \preg_match('/^.{1}/us', $str) === 1;
    }

    //
    // fallback: byte-wise validation
    //

    $mState = 0; // expected number of continuation octets still to come
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // total number of octets in the current sequence

    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    $len = \strlen($str);
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; ++$i) {
        $in = $this->ORD[$str[$i]];

        if ($mState === 0) {
            // When mState is zero we expect either a US-ASCII character or the
            // first octet of a multi-octet sequence.
            if ((0x80 & $in) === 0) {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            } elseif ((0xE0 & $in) === 0xC0) {
                // First octet of 2 octet sequence.
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif ((0xF0 & $in) === 0xE0) {
                // First octet of 3 octet sequence.
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif ((0xF8 & $in) === 0xF0) {
                // First octet of 4 octet sequence.
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif ((0xFC & $in) === 0xF8) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif ((0xFE & $in) === 0xFC) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return false;
            }
        } elseif ((0xC0 & $in) === 0x80) {
            // mState is non-zero and this is a legal continuation octet:
            // merge its six payload bits into the code point.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            // End of the multi-octet sequence: mUcs4 now contains the final
            // Unicode code point.
            if (--$mState === 0) {
                // Check for illegal sequences and code points.
                if (
                    // From Unicode 3.1, non-shortest form is illegal
                    ($mBytes === 2 && $mUcs4 < 0x0080)
                    ||
                    ($mBytes === 3 && $mUcs4 < 0x0800)
                    ||
                    ($mBytes === 4 && $mUcs4 < 0x10000)
                    ||
                    ($mBytes > 4)
                    ||
                    // From Unicode 3.2, surrogate characters are illegal.
                    (($mUcs4 & 0xFFFFF800) === 0xD800)
                    ||
                    // Code points outside the Unicode range are illegal.
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }

                // reset the state for the next character
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
            }
        } else {
            // mState is non-zero but the octet is not a continuation octet:
            // incomplete multi-octet sequence.
            return false;
        }
    }

    // A non-zero state here means the string ended in the middle of a
    // multi-octet sequence (e.g. a lone "\xC3"), which is not valid UTF-8.
    return $mState === 0;
}
|
1849
|
|
|
|
|
1850
|
|
|
/**
 * Decodes an UTF-8 string to ISO-8859-1.
 *
 * Two-octet sequences whose code point is below 256 are replaced by the matching
 * single byte from the "chr" table; every other multi-octet sequence (and a two-octet
 * lead byte that is cut off at the end of the input) is replaced by "?".
 * All remaining bytes are copied unchanged.
 *
 * @param string $str           <p>The input string.</p>
 * @param bool   $keepUtf8Chars <p>When true, the original string is returned unchanged if the
 *                              decoded string's stringLength() is not smaller than the
 *                              original's.</p>
 *
 * @return string
 */
private function utf8_decode($str, $keepUtf8Chars = false)
{
    // Byte offsets are read and written below, so make sure we really have a string.
    $str = (string)$str;
    if ($str === '') {
        return '';
    }

    // save for later comparison
    $str_backup = $str;
    $len = \strlen($str);

    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    $noCharFound = '?';
    // $i reads the input, $j writes the (never longer) output in place.
    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0, $j = 0; $i < $len; ++$i, ++$j) {
        switch ($str[$i] & "\xF0") {
            case "\xC0":
            case "\xD0":
                if ($i + 1 >= $len) {
                    // Truncated input: the lead byte has no continuation byte to
                    // combine with, so do not read past the end of the string.
                    $str[$j] = $noCharFound;

                    break;
                }

                $c = ($this->ORD[$str[$i] & "\x1F"] << 6) | $this->ORD[$str[++$i] & "\x3F"];
                $str[$j] = $c < 256 ? $this->CHR[$c] : $noCharFound;

                break;

            /** @noinspection PhpMissingBreakStatementInspection */
            case "\xF0":
                // four-octet sequence: skip one extra byte, then share the handling below
                ++$i;

            // no break

            case "\xE0":
                // three-octet (or longer) sequences cannot be represented in ISO-8859-1
                $str[$j] = $noCharFound;
                $i += 2;

                break;

            default:
                $str[$j] = $str[$i];
        }
    }

    $return = substr($str, 0, $j);
    if ($return === false) {
        $return = '';
    }

    if (
        $keepUtf8Chars === true
        &&
        $this->stringLength($return) >= (int)$this->stringLength($str_backup)
    ) {
        return $str_backup;
    }

    return $return;
}
|
1919
|
|
|
|
|
1920
|
|
|
/**
 * Returns the length of a string that is treated as UTF-8.
 *
 * The first available strategy wins, in this order: mbstring (mb_strlen), iconv
 * (iconv_strlen), intl (grapheme_strlen), plain strlen() when isAscii() accepts the
 * string, and finally a PCRE based fallback that counts the matches of "/./us".
 *
 * NOTE(review): grapheme_strlen() is documented to count grapheme units, so the intl
 * path may report a different number than the other paths for combining sequences.
 *
 * @param string $str <p>The input string.</p>
 *
 * @return bool|int <p>The length; false when the PCRE fallback matches nothing.</p>
 */
private function stringLength($str)
{
    if ($str === '') {
        return 0;
    }

    // $SUPPORT defaults to an empty array, so guard every key before reading it.
    if (isset($this->SUPPORT['mbstring']) && $this->SUPPORT['mbstring'] === true) {
        return \mb_strlen($str, 'UTF-8');
    }

    if (isset($this->SUPPORT['iconv']) && $this->SUPPORT['iconv'] === true) {
        $returnTmp = \iconv_strlen($str, 'UTF-8');
        if ($returnTmp !== false) {
            return $returnTmp;
        }
    }

    if (isset($this->SUPPORT['intl']) && $this->SUPPORT['intl'] === true) {
        $returnTmp = \grapheme_strlen($str);
        // Only accept a real length; on failure (null / false) try the next strategy.
        if (\is_int($returnTmp)) {
            return $returnTmp;
        }
    }

    if ($this->isAscii($str)) {
        return \strlen($str);
    }

    //
    // fallback via vanilla php
    //

    \preg_match_all('/./us', $str, $parts);

    $returnTmp = isset($parts[0]) ? \count($parts[0]) : 0;
    if ($returnTmp === 0) {
        return false;
    }

    return $returnTmp;
}
|
1969
|
|
|
|
|
1970
|
|
|
|
|
1971
|
|
|
} |
|
1972
|
|
|
|
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, and the start of each new word is marked by a capital letter. Thus the name "database connection string" becomes
`databaseConnectionString`.