|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace devtoolboxuk\soteria\voku\Resources; |
|
4
|
|
|
|
|
5
|
|
|
class Utf8 extends Resources |
|
|
|
|
|
|
6
|
|
|
{ |
|
7
|
|
|
|
|
8
|
|
|
private $system; |
|
9
|
|
|
private $ENCODINGS; |
|
10
|
|
|
private $SUPPORT = []; |
|
11
|
|
|
private $BROKEN_UTF8_FIX; |
|
12
|
|
|
private $ORD; |
|
13
|
|
|
private $CHR; |
|
14
|
|
|
private $WIN1252_TO_UTF8; |
|
15
|
|
|
|
|
16
|
|
|
private $BOM = [ |
|
17
|
|
|
"\xef\xbb\xbf" => 3, // UTF-8 BOM |
|
18
|
|
|
'' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
|
19
|
|
|
"\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
|
20
|
|
|
' ΓΎΓΏ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
|
21
|
|
|
"\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
|
22
|
|
|
'ΓΏΓΎ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
|
23
|
|
|
"\xfe\xff" => 2, // UTF-16 (BE) BOM |
|
24
|
|
|
'ΓΎΓΏ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
|
25
|
|
|
"\xff\xfe" => 2, // UTF-16 (LE) BOM |
|
26
|
|
|
'ΓΏΓΎ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
|
27
|
|
|
]; |
|
28
|
|
|
|
|
29
|
|
|
private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
|
30
|
|
|
// LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
|
31
|
|
|
8234 => "\xE2\x80\xAA", |
|
32
|
|
|
// RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
|
33
|
|
|
8235 => "\xE2\x80\xAB", |
|
34
|
|
|
// POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
|
35
|
|
|
8236 => "\xE2\x80\xAC", |
|
36
|
|
|
// LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
|
37
|
|
|
8237 => "\xE2\x80\xAD", |
|
38
|
|
|
// RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
|
39
|
|
|
8238 => "\xE2\x80\xAE", |
|
40
|
|
|
// LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
|
41
|
|
|
8294 => "\xE2\x81\xA6", |
|
42
|
|
|
// RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
|
43
|
|
|
8295 => "\xE2\x81\xA7", |
|
44
|
|
|
// FIRST STRONG ISOLATE // (use -> dir = "auto") |
|
45
|
|
|
8296 => "\xE2\x81\xA8", |
|
46
|
|
|
// POP DIRECTIONAL ISOLATE |
|
47
|
|
|
8297 => "\xE2\x81\xA9", |
|
48
|
|
|
]; |
|
49
|
|
|
|
|
50
|
|
|
/** |
|
51
|
|
|
* @var array |
|
52
|
|
|
*/ |
|
53
|
|
|
private $WHITESPACE_TABLE = [ |
|
54
|
|
|
'SPACE' => "\x20", |
|
55
|
|
|
'NO-BREAK SPACE' => "\xc2\xa0", |
|
56
|
|
|
'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
|
57
|
|
|
'EN QUAD' => "\xe2\x80\x80", |
|
58
|
|
|
'EM QUAD' => "\xe2\x80\x81", |
|
59
|
|
|
'EN SPACE' => "\xe2\x80\x82", |
|
60
|
|
|
'EM SPACE' => "\xe2\x80\x83", |
|
61
|
|
|
'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
|
62
|
|
|
'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
|
63
|
|
|
'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
|
64
|
|
|
'FIGURE SPACE' => "\xe2\x80\x87", |
|
65
|
|
|
'PUNCTUATION SPACE' => "\xe2\x80\x88", |
|
66
|
|
|
'THIN SPACE' => "\xe2\x80\x89", |
|
67
|
|
|
'HAIR SPACE' => "\xe2\x80\x8a", |
|
68
|
|
|
'LINE SEPARATOR' => "\xe2\x80\xa8", |
|
69
|
|
|
'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
|
70
|
|
|
'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
|
71
|
|
|
'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
|
72
|
|
|
'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
|
73
|
|
|
'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
|
74
|
|
|
]; |
|
75
|
|
|
|
|
76
|
6 |
|
/**
 * Creates the system probe and records which optional PHP extensions are usable.
 *
 * Visibility is stated explicitly; a method without a modifier is public anyway,
 * so callers are unaffected.
 */
public function __construct()
{
    $this->system = new System();
    $this->checkForSupport();
}
|
81
|
|
|
|
|
82
|
6 |
|
/**
 * Populates $this->SUPPORT with one flag per optional extension/feature.
 *
 * The probe runs only once per instance: the 'already_checked_via_portable_utf8'
 * marker short-circuits every later call. When mbstring (or the symfony polyfill)
 * is reported as available, the mbstring internal encoding is switched to UTF-8.
 *
 * @return void
 */
private function checkForSupport()
{
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }
    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    $probe = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $hasMbstring = $probe->mbstring_loaded();
    $this->SUPPORT['mbstring'] = $hasMbstring;
    $this->SUPPORT['mbstring_func_overload'] = $probe->mbstring_overloaded();
    if ($hasMbstring === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $probe->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $hasIntl = $probe->intl_loaded();
    $this->SUPPORT['intl'] = $hasIntl;
    /** @noinspection PhpComposerExtensionStubsInspection */
    $this->SUPPORT['intl__transliterator_list_ids'] =
        ($hasIntl === true && \function_exists('transliterator_list_ids') === true)
            ? \transliterator_list_ids()
            : [];

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $probe->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $probe->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $probe->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $probe->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $probe->pcre_utf8_support();

    $usesPolyfill = $probe->symfony_polyfill_used();
    $this->SUPPORT['symfony_polyfill_used'] = $usesPolyfill;
    if ($usesPolyfill === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
|
136
|
|
|
|
|
137
|
6 |
|
/**
 * Decodes percent-escapes, HTML entities and "\uXXXX" / "%uXXXX" escapes in a string.
 *
 * Strings without any '&', '%', '+' or '\u' only get the known broken-UTF-8
 * sequences repaired. Otherwise each pass converts to UTF-8, decodes HTML
 * entities, applies PHP's rawurldecode() and repairs broken UTF-8.
 *
 * Fix: previously a $multi_decode value other than true skipped decoding
 * entirely; now exactly one pass is applied in that case.
 *
 * @param string $str          the encoded input
 * @param bool   $multi_decode true: repeat passes until the string stops changing;
 *                             otherwise: a single pass
 *
 * @return string
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // Rewrite non-standard "%uXXXX" escapes as hexadecimal HTML entities so the
    // entity decoder below can resolve them.
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        $str = (string)preg_replace($pattern, '&#x\\1;', rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    if ($multi_decode === true) {
        do {
            $str_compare = $str;

            /**
             * @psalm-suppress PossiblyInvalidArgument
             */
            $str = $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags)));
        } while ($str_compare !== $str);
    } else {
        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        $str = $this->fixSimpleUtf8(rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags)));
    }

    return $str;
}
|
167
|
|
|
|
|
168
|
6 |
|
/**
 * Replaces every key of the 'utf8_fix' data table found in the string with its value.
 *
 * The search and replacement lists are built once and kept in a function-level
 * static, so the data file is split into keys/values a single time per process.
 *
 * @param string $str
 *
 * @return string
 */
private function fixSimpleUtf8($str)
{
    static $SEARCH = null;
    static $REPLACE = null;

    if ($str === '') {
        return '';
    }

    if ($SEARCH === null) {
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $table = $this->BROKEN_UTF8_FIX;
        $SEARCH = array_keys($table);
        $REPLACE = array_values($table);
    }

    return str_replace($SEARCH, $REPLACE, $str);
}
|
188
|
|
|
|
|
189
|
|
|
/**
 * Includes a PHP data file from the sibling "Data" directory and hands back
 * whatever that file returns.
 *
 * @param string $file file name without the ".php" extension
 *
 * @return mixed
 */
private function getData($file)
{
    return include \sprintf('%s/../Data/%s.php', __DIR__, $file);
}
|
197
|
|
|
|
|
198
|
|
|
/**
 * Decodes HTML entities (named and numeric) until the string stops changing.
 *
 * Inputs shorter than four bytes or without '&' are returned untouched.
 *
 * Fix: the conversion map passed to mb_decode_numericentity() now has exactly
 * four integers (start, end, offset, mask). The former fifth element makes
 * PHP 8 throw a ValueError, because the map length must be a multiple of four.
 *
 * @param string   $str
 * @param int|null $flags    html_entity_decode() flags; null means ENT_QUOTES | ENT_HTML5
 * @param string   $encoding target encoding, normalised unless it is 'UTF-8' or 'CP850'
 *
 * @return string
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    if (
        !isset($str[3]) // examples: &; || &x;
        ||
        strpos($str, '&') === false // no "&"
    ) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // start code point, end code point, offset, mask
    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $convmap = [0x80, 0xfffff, 0, 0xfffff];

    do {
        $str_compare = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            if ($encoding === 'UTF-8') {
                $str = mb_decode_numericentity($str, $convmap);
            } else {
                $str = mb_decode_numericentity($str, $convmap, $encoding);
            }
        } else {
            $str = (string)preg_replace_callback(
                "/&#\d{2,6};/",
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $returnTmp = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');
                    // keep quote entities encoded at this stage
                    if ($returnTmp !== '"' && $returnTmp !== "'") {
                        return $returnTmp;
                    }

                    return $matches[0];
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // append the missing ';' to numeric entities so html_entity_decode() accepts them
                // (decode also numeric & UTF16 two byte entities)
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($str_compare !== $str);

    return $str;
}
|
268
|
|
|
|
|
269
|
|
|
/**
 * Maps a loosely written encoding name onto the canonical spelling used by this class.
 *
 * Resolution order: empty/'0'/'1' give the fallback; a few fixed aliases are
 * answered directly; then the per-process cache; then an exact match in the
 * 'encodings' data table; finally an alias table keyed by the upper-cased name
 * stripped of everything except letters, digits and whitespace. A name without
 * an alias is returned upper-cased.
 *
 * @param mixed  $encoding
 * @param string $fallback returned when no usable encoding name was given
 *
 * @return mixed|string
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $RESOLVED = [];

    $encoding = (string)$encoding;

    // '' and '0' are "no encoding"; '1' is only a fallback for non "strict_types" usage
    if (\in_array($encoding, ['', '0', '1'], true)) {
        return $fallback;
    }

    if (\in_array($encoding, ['UTF-8', 'UTF8'], true)) {
        return 'UTF-8';
    }

    if (\in_array($encoding, ['8BIT', 'BINARY'], true)) {
        return 'CP850';
    }

    if (\in_array($encoding, ['HTML', 'HTML-ENTITIES'], true)) {
        return 'HTML-ENTITIES';
    }

    if (isset($RESOLVED[$encoding])) {
        return $RESOLVED[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    if (in_array($encoding, $this->ENCODINGS, true)) {
        return $RESOLVED[$encoding] = $encoding;
    }

    $requested = $encoding;
    $upper = strtoupper($requested);
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $upper);

    $aliases = [
        'ISO8859'   => 'ISO-8859-1',
        'ISO88591'  => 'ISO-8859-1',
        'ISO'       => 'ISO-8859-1',
        'LATIN'     => 'ISO-8859-1',
        'LATIN1'    => 'ISO-8859-1', // Western European
        'ISO88592'  => 'ISO-8859-2',
        'LATIN2'    => 'ISO-8859-2', // Central European
        'ISO88593'  => 'ISO-8859-3',
        'LATIN3'    => 'ISO-8859-3', // Southern European
        'ISO88594'  => 'ISO-8859-4',
        'LATIN4'    => 'ISO-8859-4', // Northern European
        'ISO88595'  => 'ISO-8859-5',
        'ISO88596'  => 'ISO-8859-6',
        'ISO88597'  => 'ISO-8859-7',
        'ISO88598'  => 'ISO-8859-8',
        'ISO88599'  => 'ISO-8859-9',
        'LATIN5'    => 'ISO-8859-9', // Turkish
        'ISO885911' => 'ISO-8859-11',
        'TIS620'    => 'ISO-8859-11', // Thai
        'ISO885910' => 'ISO-8859-10',
        'LATIN6'    => 'ISO-8859-10', // Nordic
        'ISO885913' => 'ISO-8859-13',
        'LATIN7'    => 'ISO-8859-13', // Baltic
        'ISO885914' => 'ISO-8859-14',
        'LATIN8'    => 'ISO-8859-14', // Celtic
        'ISO885915' => 'ISO-8859-15',
        'LATIN9'    => 'ISO-8859-15', // Western European (with some extra chars, e.g. the euro sign)
        'ISO885916' => 'ISO-8859-16',
        'LATIN10'   => 'ISO-8859-16', // Southeast European
        'UTF16'     => 'UTF-16',
        'UTF32'     => 'UTF-32',
        'UTF8'      => 'UTF-8',
        'UTF'       => 'UTF-8',
        'UTF7'      => 'UTF-7',
        '8BIT'      => 'CP850',
        'BINARY'    => 'CP850',
    ];

    // CP125x / WIN125x / WINDOWS125x => WINDOWS-125x for the code pages 1250 to 1258
    for ($codePage = 1250; $codePage <= 1258; ++$codePage) {
        $canonical = 'WINDOWS-' . $codePage;
        $aliases['CP' . $codePage] = $canonical;
        $aliases['WIN' . $codePage] = $canonical;
        $aliases['WINDOWS' . $codePage] = $canonical;
    }

    $result = isset($aliases[$aliasKey]) ? $aliases[$aliasKey] : $upper;

    return $RESOLVED[$requested] = $result;
}
|
397
|
|
|
|
|
398
|
6 |
|
/**
 * Converts a string (or every element of an array, recursively) to UTF-8.
 *
 * Bytes that already form a structurally valid 2-, 3- or 4-byte UTF-8 sequence
 * are copied through; every other byte >= 0x80 is passed to
 * toUtf8ConvertHelper(). Afterwards "\uXXXX" escapes, including surrogate
 * pairs, are replaced by the characters they denote.
 *
 * @param mixed $str string-like value or array of such values
 *
 * @return string|array '' when the escape replacement fails
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        return \array_map(
            function ($element) {
                return $this->toUtf8($element);
            },
            $str
        );
    }

    $input = (string)$str;
    if ($input === '') {
        return $input;
    }

    $length = \strlen($input);
    $out = '';

    for ($pos = 0; $pos < $length; ++$pos) {
        $byte = $input[$pos];
        $code = \ord($byte);

        if ($code < 0x80) {
            // plain ASCII, it doesn't need conversion
            $out .= $byte;

            continue;
        }

        // number of continuation bytes the lead byte announces (0 = not a usable lead byte)
        if ($code >= 0xC0 && $code <= 0xDF) {
            $trailing = 1;
        } elseif ($code >= 0xE0 && $code <= 0xEF) {
            $trailing = 2;
        } elseif ($code >= 0xF0 && $code <= 0xF7) {
            $trailing = 3;
        } else {
            $trailing = 0;
        }

        // all announced continuation bytes must exist and lie in 0x80..0xBF
        $looksLikeUtf8 = $trailing > 0 && $pos + $trailing < $length;
        for ($offset = 1; $looksLikeUtf8 && $offset <= $trailing; ++$offset) {
            $next = \ord($input[$pos + $offset]);
            $looksLikeUtf8 = $next >= 0x80 && $next <= 0xBF;
        }

        if ($looksLikeUtf8) {
            // yeah, almost sure it's UTF8 already
            $out .= \substr($input, $pos, $trailing + 1);
            $pos += $trailing;
        } else {
            // not valid UTF8 - convert this single byte
            $out .= $this->toUtf8ConvertHelper($byte);
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $decoded = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                $codePoint = (int)hexdec($matches[3]);
            } else {
                // combine the surrogate pair, see http://unicode.org/faq/utf_bom.html#utf16-4
                $high = (int)hexdec($matches[1]);
                $low = (int)hexdec($matches[2]);
                $codePoint = ($high << 10) + $low + 0x10000 - (0xD800 << 10) - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($codePoint < 0x80) {
                return (string)$this->chr($codePoint);
            }

            if ($codePoint < 0xA0) {
                /** @noinspection UnnecessaryCastingInspection */
                return (string)$this->chr(0xC0 | ($codePoint >> 6)) . (string)$this->chr(0x80 | ($codePoint & 0x3F));
            }

            return $this->decimalToChr($codePoint);
        },
        $out
    );

    return $decoded === null ? '' : $decoded;
}
|
513
|
|
|
|
|
514
|
|
|
/**
 * Converts one non-UTF-8 byte into a UTF-8 sequence.
 *
 * The byte's ordinal is looked up in the 'ord' data table. When the
 * 'win1252_to_utf8' table has an entry for that ordinal, the entry is used;
 * otherwise a two-byte sequence is assembled with string bit operations.
 *
 * Fix: the lead-byte lookup used `$ordC1 / 64`, a float for most ordinals;
 * float array offsets with a fractional part are deprecated since PHP 8.1.
 * A right shift by six yields the same truncated index as an integer.
 *
 * @param string $input a single byte (NOTE(review): assumed from the caller in toUtf8() - confirm)
 *
 * @return string
 */
private function toUtf8ConvertHelper($input)
{
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    // found in Windows-1252 special cases
    if (isset($this->WIN1252_TO_UTF8[$ordC1])) {
        return '' . $this->WIN1252_TO_UTF8[$ordC1];
    }

    // upper two bits go into the lead byte, lower six bits into the continuation byte
    $cc1 = $this->CHR[$ordC1 >> 6] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
|
542
|
|
|
|
|
543
|
1 |
|
/**
 * Returns the character for a Unicode code point, optionally re-encoded.
 *
 * Results are memoised in a function-level static keyed by code point plus
 * encoding. Code points up to 127 come straight from the 'chr' data table;
 * larger ones use IntlChar when $this->SUPPORT['intlChar'] is true and are
 * otherwise assembled byte by byte from the same table.
 *
 * Fix: IntlChar is now referenced as \IntlChar. This file declares a namespace
 * and imports nothing, so the unqualified name resolved to a class inside the
 * namespace and failed with "class not found" on the IntlChar path.
 *
 * @param int|string $code_point
 * @param string     $encoding   normalised unless it is 'UTF-8' or 'CP850'
 *
 * @return mixed the character (whatever IntlChar::chr()/encode() hand back)
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        // still reachable: a non-numeric value can fail the "<= 127" test above and cast to 0 here
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // two-byte sequence
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // three-byte sequence
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // four-byte sequence
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
               $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
|
637
|
|
|
|
|
638
|
|
|
/**
 * Encodes a string into one of the pseudo-encodings handled here.
 *
 * 'JSON' goes through jsonEncode(), 'BASE64' through base64_encode() and
 * 'HTML-ENTITIES' through htmlEncode(). For every other target the string is
 * returned unchanged; a warning is raised when mbstring is flagged as missing
 * and the target is not UTF-8, ISO-8859-1 or WINDOWS-1252.
 *
 * Fixes: the exception class is referenced as \InvalidArgumentException (the
 * unqualified name resolved inside this file's namespace and could not be
 * found), and the unused local plus the dead commented-out code are removed.
 *
 * @param string $toEncoding normalised unless it is 'UTF-8' or 'CP850'
 * @param string $str
 *
 * @throws \InvalidArgumentException when the JSON encoding of $str fails
 *
 * @return mixed
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', E_USER_WARNING);
    }

    // No character-set conversion is performed for the remaining targets.
    return $str;
}
|
736
|
|
|
|
|
737
|
|
|
/**
 * Runs the value through filter() and JSON-encodes the result.
 *
 * @param mixed $value
 *
 * @throws \RuntimeException when $this->SUPPORT['json'] is false
 *
 * @return string|false json_encode() result
 */
private function jsonEncode($value)
{
    $filtered = $this->filter($value);

    if ($this->SUPPORT['json'] !== false) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        return json_encode($filtered, 0, 512);
    }

    throw new \RuntimeException('ext-json: is not installed');
}
|
748
|
|
|
|
|
749
|
|
|
/**
 * Normalises strings - also nested inside arrays and objects - to the given Unicode form.
 *
 * Per string: line endings are unified when a "\r" is present; non-ASCII text
 * that is not yet normalised is passed through \Normalizer::normalize(); and a
 * string starting with a combining mark (\p{Mn}) gets $leading_combining
 * prepended so concatenations stay NFC-safe. Other scalar types pass through.
 *
 * Fixes:
 * - object branch: results were written to an undefined local array, so object
 *   properties were never filtered; they are now written back to the object
 *   (which is therefore modified in place);
 * - removed `unset($v)` calls that targeted a variable that never existed;
 * - encode() takes two parameters, the stray third argument is dropped;
 * - the default for $leading_combining is spelled as the UTF-8 bytes of
 *   U+25CC DOTTED CIRCLE instead of a mis-encoded literal.
 *
 * @param mixed  $var
 * @param int    $normalization_form one of the \Normalizer form constants
 * @param string $leading_combining  prefix for strings that start with a combining mark
 *
 * @return mixed
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = "\xE2\x97\x8C")
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            foreach ($var as $key => $value) {
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // non-empty marker: "normalisation succeeded" for the check below
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // normalisation failed or produced nothing
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
|
804
|
|
|
|
|
805
|
|
|
/**
 * Converts Windows ("\r\n") and old Mac ("\r") line endings to "\n".
 *
 * @param string $str
 *
 * @return string
 */
private function normalizeLineEnding($str)
{
    // "\r\n" must come first so it collapses into a single "\n"
    $foreignLineEndings = ["\r\n", "\r"];

    return str_replace($foreignLineEndings, "\n", $str);
}
|
809
|
|
|
|
|
810
|
|
|
/**
 * Reports whether the string consists only of the bytes accepted by the
 * pattern below (printable ASCII plus a handful of control characters).
 *
 * @param string $str
 *
 * @return bool true for the empty string and whenever no other byte is found
 */
private function isAscii($str)
{
    return $str === ''
        || preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str) !== 1;
}
|
818
|
|
|
|
|
819
|
|
|
/**
 * Encode the characters of a string as decimal HTML numeric entities ("&#NNN;").
 *
 * With mbstring available the work is delegated to mb_encode_numericentity();
 * otherwise the string is split into characters and each one is encoded through
 * singleChrHtmlEncode().
 *
 * @param string $str            The input string.
 * @param bool   $keepAsciiChars When true, the mbstring path only encodes code points
 *                               >= 0x80; the fallback path keeps every character for
 *                               which isAscii() returns true.
 * @param string $encoding       Encoding of $str. Anything other than 'UTF-8' / 'CP850'
 *                               is first passed through normalize_encoding().
 *
 * @return string The entity-encoded string ('' for empty input).
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if ($this->SUPPORT['mbstring'] === true) {
        // A convmap is a list of 4-integer groups: [first code point, last code point, offset, mask].
        // The former 5-element map is rejected by PHP 8 ("must have a multiple of 4 elements").
        $convMap = [$keepAsciiChars === true ? 0x80 : 0x00, 0xfffff, 0, 0xfffff];

        // Always pass the encoding explicitly so the result does not depend on
        // the current mb_internal_encoding() setting.
        return mb_encode_numericentity($str, $convMap, $encoding);
    }

    // Fallback without mbstring: encode character by character.
    return implode(
        '',
        \array_map(
            function (string $chr) use ($keepAsciiChars, $encoding) {
                return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
            },
            $this->strSplit($str)
        )
    );
}
|
860
|
|
|
|
|
861
|
|
|
/**
 * Encode one character as a decimal HTML numeric entity.
 *
 * @param string $char           The character to encode.
 * @param bool   $keepAsciiChars When true, a character accepted by isAscii() is returned unchanged.
 * @param string $encoding       Forwarded to ord() to compute the numeric value.
 *
 * @return string '' for empty input, the untouched character, or "&#<ord>;".
 */
private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($char === '') {
        return '';
    }

    // isAscii() is only consulted when the caller asked to keep ASCII characters.
    $leaveUntouched = $keepAsciiChars === true && $this->isAscii($char) === true;

    return $leaveUntouched
        ? $char
        : '&#' . $this->ord($char, $encoding) . ';';
}
|
873
|
|
|
|
|
874
|
|
|
/**
 * Return the integer code of a character.
 *
 * Lookup order: a function-static memo, the table loaded via getData('ord'),
 * \IntlChar::ord() when $this->SUPPORT['intlChar'] is true, and finally a manual
 * decode of the first (up to four) bytes as a UTF-8 sequence.
 *
 * @param string $chr      The character (cast to string).
 * @param string $encoding Encoding name; values other than 'UTF-8' / 'CP850' go through
 *                         normalize_encoding(), and for any non-'UTF-8' result $chr is
 *                         passed to encode() before the lookup.
 *                         NOTE(review): encode() is not visible here - confirm the
 *                         direction of that conversion.
 *
 * @return int The code found by the first strategy that yields one; 0 when the
 *             manual decode gets no bytes.
 */
private function ord($chr, $encoding = 'UTF-8')
{
    // Memo shared by all calls, keyed by character + normalized encoding name.
    static $CHAR_CACHE = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    $memoKey = $chr . $encoding;
    if (isset($CHAR_CACHE[$memoKey]) === true) {
        return $CHAR_CACHE[$memoKey];
    }

    // check again, if it's still not UTF-8
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    // Lazily load the lookup table on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        $CHAR_CACHE[$memoKey] = $this->ORD[$chr];

        return $CHAR_CACHE[$memoKey];
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $intlCode = \IntlChar::ord($chr);
        // A falsy result (null or 0) is not trusted; fall through to the manual decode.
        if ($intlCode) {
            $CHAR_CACHE[$memoKey] = $intlCode;

            return $intlCode;
        }
    }

    //
    // fallback via vanilla php
    //

    // unpack('C*') yields a 1-based array of byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        /** @noinspection UnnecessaryCastingInspection */
        $codePoint = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte (or no bytes at all, in which case $lead is 0)
        $codePoint = $lead;
    }

    $CHAR_CACHE[$memoKey] = $codePoint;

    return $codePoint;
}
|
940
|
|
|
|
|
941
|
|
|
/**
 * Split a string into an array of characters, or of chunks of $length characters.
 *
 * Strategy depends on $this->SUPPORT: mbstring (mb_substr loop for up to 127
 * characters, a '/./us' regex beyond that), then PCRE with UTF-8 support, then a
 * manual byte walk that assembles 1- to 4-byte sequences and silently skips bytes
 * that do not form a complete sequence.
 *
 * @param string|array $str    String to split; an array is processed element by
 *                             element (recursively) and returned with the same keys.
 * @param int          $length Characters per chunk; values <= 0 yield [].
 *
 * @return array List of characters/chunks ([] for empty input).
 */
private function strSplit($str, $length = 1)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length);
        }

        return $str;
    }

    // init
    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    if ($this->SUPPORT['mbstring'] === true) {
        $iMax = \mb_strlen($str);
        if ($iMax <= 127) {
            $ret = [];
            for ($i = 0; $i < $iMax; ++$i) {
                $ret[] = \mb_substr($str, $i, 1);
            }
        } else {
            // For longer strings one regex pass replaces the per-character mb_substr() calls.
            $retArray = [];
            preg_match_all('/./us', $str, $retArray);
            $ret = isset($retArray[0]) ? $retArray[0] : [];
        }
    } elseif ($this->SUPPORT['pcre_utf8'] === true) {
        $retArray = [];
        preg_match_all('/./us', $str, $retArray);
        $ret = isset($retArray[0]) ? $retArray[0] : [];
    } else {
        // fallback: walk the bytes and group them by the UTF-8 lead-byte pattern

        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            } elseif (
                isset($str[$i + 1])
                &&
                ($str[$i] & "\xE0") === "\xC0"
            ) {
                // 110xxxxx 10xxxxxx
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (
                isset($str[$i + 2])
                &&
                ($str[$i] & "\xF0") === "\xE0"
            ) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (
                isset($str[$i + 3])
                &&
                ($str[$i] & "\xF8") === "\xF0"
            ) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        // array_map() hands values (not references) to its callback, so the parameter
        // must be by-value; a by-reference parameter triggers a warning on PHP 8.
        return array_map(
            static function ($item) {
                return implode('', $item);
            },
            \array_chunk($ret, $length)
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
|
1051
|
|
|
|
|
1052
|
|
|
/**
 * Turn a decimal number into a character by decoding the numeric entity "&#<int>;".
 *
 * @param int|string $int Decimal value placed inside the entity.
 *
 * @return string Whatever htmlEntityDecode() returns for that entity.
 */
private function decimalToChr($int)
{
    $numericEntity = '&#' . $int . ';';
    $decodeFlags = \ENT_QUOTES | \ENT_HTML5;

    return $this->htmlEntityDecode($numericEntity, $decodeFlags);
}
|
1056
|
|
|
|
|
1057
|
|
|
/**
 * Drop bytes that do not belong to a structurally well-formed UTF-8 sequence and
 * optionally run further clean-up passes.
 *
 * Pass order: invalid-byte removal, replace_diamond_question_mark(),
 * remove_invisible_characters(), normalize_whitespace(), normalize_msword(), remove_bom().
 *
 * @param string $str                           Input string.
 * @param bool   $remove_bom                    Strip byte-order marks via remove_bom().
 * @param bool   $normalize_whitespace          Run normalize_whitespace().
 * @param bool   $normalize_msword              Run normalize_msword().
 * @param bool   $keep_non_breaking_space       Forwarded to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Remove U+FFFD via replace_diamond_question_mark($str, '').
 * @param bool   $remove_invisible_characters   Run remove_invisible_characters().
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-formed sequences (lead byte + continuation bytes);
    // groups 2 and 3 match a single stray byte. Replacing with '$1' keeps the
    // well-formed runs and deletes the stray bytes.
    $wellFormedOrStray = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';
    $cleaned = (string)preg_replace($wellFormedOrStray, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
|
1097
|
|
|
|
|
1098
|
6 |
|
/**
 * Replace the Unicode replacement character U+FFFD ("diamond question mark") and,
 * optionally, invalid UTF-8 byte sequences with $replacementChar.
 *
 * @param string $str                The input string.
 * @param string $replacementChar    Replacement text; '' removes the characters.
 * @param bool   $processInvalidUtf8 When true the string is first round-tripped through
 *                                   mb_convert_encoding() so that invalid sequences are
 *                                   dropped ($replacementChar === '') or turned into
 *                                   U+FFFD, which is then replaced like any other U+FFFD.
 *
 * @return string The processed string ('' for empty input).
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        // mb_substitute_character() accepts only a code point or "none"/"long"/"entity".
        // Passing the raw replacement text is rejected (ValueError on PHP 8, ignored with
        // a warning before), so invalid bytes are substituted with U+FFFD instead and the
        // str_replace() below maps them to $replacementChar, whatever its length.
        $substitute = $replacementChar === '' ? 'none' : 0xFFFD;

        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        $save = \mb_substitute_character();
        \mb_substitute_character($substitute);
        try {
            // the polyfill maybe return false, so cast to string
            $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        } finally {
            // The substitute character is a process-wide setting: always restore it.
            \mb_substitute_character($save);
        }
    }

    return str_replace(
        [
            "\xEF\xBF\xBD",
            // NOTE(review): presumably the U+FFFD bytes rendered through a single-byte
            // code page (mojibake form) - confirm against the real file.
            'οΏ½',
        ],
        [
            $replacementChar,
            $replacementChar,
        ],
        $str
    );
}
|
1135
|
|
|
|
|
1136
|
6 |
|
/**
 * Remove control characters, keeping horizontal tab (dec 09), newline (dec 10)
 * and carriage return (dec 13).
 *
 * The replacement is repeated until a pass changes nothing, so sequences that only
 * appear after an earlier removal (e.g. "%0%000") are removed as well.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the url-encoded forms (%00-%08, %0B, %0C, %0E-%1F).
 * @param string $replacement Text inserted in place of each match.
 *
 * @return string The string without the matched characters.
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    // raw bytes 00-08, 11, 12, 14-31, 127
    $rawControlBytes = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/',   // url encoded 16-31
            $rawControlBytes,
        ]
        : [$rawControlBytes];

    $count = 0;
    do {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
}
|
1156
|
|
|
|
|
1157
|
6 |
|
/**
 * Replace every entry of $this->WHITESPACE_TABLE with a plain space and, unless told
 * otherwise, delete the bidirectional control characters listed in
 * $this->BIDI_UNI_CODE_CONTROLS_TABLE.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    When exactly true, the table entry keyed
 *                                        'NO-BREAK SPACE' is left untouched.
 * @param bool   $keepBidiUnicodeControls When exactly false, the bidi controls are removed.
 *
 * @return string The normalized string ('' for empty input).
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    if ($str === '') {
        return '';
    }

    // Prepared search lists are memoized in function-static variables.
    static $WHITESPACE_CACHE = [];

    // Derive the cache slot from the same strict test that selects the table variant.
    // A plain (int) cast would let a truthy non-bool argument (e.g. 1) store the full
    // table in slot 1 and poison later calls that pass true.
    $keepNbsp = $keepNonBreakingSpace === true;
    $cacheKey = (int)$keepNbsp;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNbsp) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        static $BIDI_UNICODE_CONTROLS_CACHE = null;

        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
|
1188
|
|
|
|
|
1189
|
|
|
/**
 * Replace typographic quotation marks, dashes and the ellipsis (as UTF-8 byte
 * sequences) with their plain ASCII counterparts.
 *
 * @param string $str The input string.
 *
 * @return string The string with the listed characters replaced ('' for empty input).
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 bytes => ASCII replacement
    $asciiFor = [
        "\xc2\xab"     => '"',   // U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        "\xc2\xbb"     => '"',   // U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        "\xe2\x80\x98" => "'",   // U+2018 LEFT SINGLE QUOTATION MARK
        "\xe2\x80\x99" => "'",   // U+2019 RIGHT SINGLE QUOTATION MARK
        "\xe2\x80\x9a" => "'",   // U+201A SINGLE LOW-9 QUOTATION MARK
        "\xe2\x80\x9b" => "'",   // U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
        "\xe2\x80\x9c" => '"',   // U+201C LEFT DOUBLE QUOTATION MARK
        "\xe2\x80\x9d" => '"',   // U+201D RIGHT DOUBLE QUOTATION MARK
        "\xe2\x80\x9e" => '"',   // U+201E DOUBLE LOW-9 QUOTATION MARK
        "\xe2\x80\x9f" => '"',   // U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        "\xe2\x80\xb9" => "'",   // U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        "\xe2\x80\xba" => "'",   // U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        "\xe2\x80\x93" => '-',   // U+2013 EN DASH
        "\xe2\x80\x94" => '-',   // U+2014 EM DASH
        "\xe2\x80\xa6" => '...', // U+2026 HORIZONTAL ELLIPSIS
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $str);
}
|
1233
|
|
|
|
|
1234
|
6 |
|
/**
 * Strip byte-order marks from the start of a string.
 *
 * Every entry of $this->BOM (BOM bytes => number of bytes to strip) is tested in
 * table order against the current start of the string; each entry that matches is
 * stripped before the next one is tested.
 *
 * @param string $str The input string.
 *
 * @return string The string without the leading BOM(s) ('' for empty input).
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    foreach ($this->BOM as $bomString => $bomByteLength) {
        $bomString = (string)$bomString;

        // An empty marker would be a prefix of every string and strip real data.
        // NOTE(review): the BOM table appears to contain an empty key in this view;
        // that may only be a rendering artifact - confirm against the real file.
        if ($bomString === '') {
            continue;
        }

        // Compare only the prefix; strpos() would scan the whole string on a miss.
        if (\strncmp($str, $bomString, \strlen($bomString)) !== 0) {
            continue;
        }

        $strTmp = \substr($str, $bomByteLength);
        if ($strTmp === false) {
            return '';
        }

        $str = (string)$strTmp;
    }

    return $str;
}
|
1255
|
|
|
|
|
1256
|
|
|
} |
|
1257
|
|
|
|
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, and the start of each new word is marked by a capital letter. Thus the name "database connection string" becomes
`databaseConnectionString`.