Complex classes like Utf8 often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Utf8, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
5 | class Utf8 extends Resources |
||
6 | { |
||
7 | |||
8 | private $system; |
||
9 | private $ENCODINGS; |
||
10 | private $SUPPORT = []; |
||
11 | private $BROKEN_UTF8_FIX; |
||
12 | private $ORD; |
||
13 | private $CHR; |
||
14 | private $WIN1252_TO_UTF8; |
||
15 | private $BOM = [ |
||
16 | "\xef\xbb\xbf" => 3, // UTF-8 BOM |
||
17 | '' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
||
18 | "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
||
19 | ' þÿ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
||
20 | "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
||
21 | 'ÿþ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
||
22 | "\xfe\xff" => 2, // UTF-16 (BE) BOM |
||
23 | 'þÿ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
||
24 | "\xff\xfe" => 2, // UTF-16 (LE) BOM |
||
25 | 'ÿþ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
||
26 | ]; |
||
27 | |||
28 | private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
||
29 | // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
||
30 | 8234 => "\xE2\x80\xAA", |
||
31 | // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
||
32 | 8235 => "\xE2\x80\xAB", |
||
33 | // POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
||
34 | 8236 => "\xE2\x80\xAC", |
||
35 | // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
||
36 | 8237 => "\xE2\x80\xAD", |
||
37 | // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
||
38 | 8238 => "\xE2\x80\xAE", |
||
39 | // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
||
40 | 8294 => "\xE2\x81\xA6", |
||
41 | // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
||
42 | 8295 => "\xE2\x81\xA7", |
||
43 | // FIRST STRONG ISOLATE // (use -> dir = "auto") |
||
44 | 8296 => "\xE2\x81\xA8", |
||
45 | // POP DIRECTIONAL ISOLATE |
||
46 | 8297 => "\xE2\x81\xA9", |
||
47 | ]; |
||
48 | |||
49 | /** |
||
50 | * @var array |
||
51 | */ |
||
52 | private $WHITESPACE_TABLE = [ |
||
53 | 'SPACE' => "\x20", |
||
54 | 'NO-BREAK SPACE' => "\xc2\xa0", |
||
55 | 'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
||
56 | 'EN QUAD' => "\xe2\x80\x80", |
||
57 | 'EM QUAD' => "\xe2\x80\x81", |
||
58 | 'EN SPACE' => "\xe2\x80\x82", |
||
59 | 'EM SPACE' => "\xe2\x80\x83", |
||
60 | 'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
||
61 | 'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
||
62 | 'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
||
63 | 'FIGURE SPACE' => "\xe2\x80\x87", |
||
64 | 'PUNCTUATION SPACE' => "\xe2\x80\x88", |
||
65 | 'THIN SPACE' => "\xe2\x80\x89", |
||
66 | 'HAIR SPACE' => "\xe2\x80\x8a", |
||
67 | 'LINE SEPARATOR' => "\xe2\x80\xa8", |
||
68 | 'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
||
69 | 'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
||
70 | 'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
||
71 | 'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
||
72 | 'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
||
73 | ]; |
||
74 | |||
75 | 6 | function __construct() |
|
80 | |||
/**
 * Probe the runtime once and record which optional features are usable.
 *
 * The findings are written to $this->SUPPORT. A marker entry
 * ('already_checked_via_portable_utf8') makes every later call a no-op.
 * Side effect: when mbstring (or the symfony polyfill) is reported as
 * available, the mbstring internal encoding is switched to UTF-8.
 *
 * @return void
 */
private function checkForSupport()
{
    // Detection already ran for this instance - nothing to do.
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }
    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    // http://php.net/manual/en/book.mbstring.php
    $hasMbstring = $this->system->mbstring_loaded();
    $this->SUPPORT['mbstring'] = $hasMbstring;
    $this->SUPPORT['mbstring_func_overload'] = $this->system->mbstring_overloaded();
    if ($hasMbstring === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $this->system->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $hasIntl = $this->system->intl_loaded();
    $this->SUPPORT['intl'] = $hasIntl;
    /** @noinspection PhpComposerExtensionStubsInspection */
    $this->SUPPORT['intl__transliterator_list_ids'] =
        ($hasIntl === true && \function_exists('transliterator_list_ids') === true)
            ? \transliterator_list_ids()
            : [];

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $this->system->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $this->system->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $this->system->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $this->system->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $this->system->pcre_utf8_support();

    $polyfillUsed = $this->system->symfony_polyfill_used();
    $this->SUPPORT['symfony_polyfill_used'] = $polyfillUsed;
    if ($polyfillUsed === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
|
135 | |||
/**
 * Decode a raw-URL-encoded string, including HTML entities, "%uXXXX"
 * sequences and "\uXXXX" escapes, and repair common broken UTF-8.
 *
 * @param string $str          The input string.
 * @param bool   $multi_decode When true, decoding is repeated until the
 *                             string stops changing (handles double-encoded
 *                             input). When false, exactly one decoding pass
 *                             is performed.
 *
 * @return string The decoded string.
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // Fast path: nothing in the string can be an encoded sequence.
    if (
        strpos($str, '&') === false
        && strpos($str, '%') === false
        && strpos($str, '+') === false
        && strpos($str, '\u') === false
    ) {
        return $this->fixSimpleUtf8($str);
    }

    // Non-standard "%uXXXX" escapes are rewritten to hex entities so the
    // entity decoder below can resolve them.
    $pattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($pattern, $str)) {
        $str = (string)preg_replace($pattern, '&#x\\1;', \rawurldecode($str));
    }

    $flags = \ENT_QUOTES | \ENT_HTML5;

    // Always decode at least once. Previously nothing was decoded at all
    // when $multi_decode was false; now only the repetition is optional.
    do {
        $str_compare = $str;

        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        $str = $this->fixSimpleUtf8(
            \rawurldecode($this->htmlEntityDecode($this->toUtf8($str), $flags))
        );
    } while ($multi_decode === true && $str_compare !== $str);

    return $str;
}
||
166 | |||
/**
 * Replace known broken UTF-8 sequences with their correct form.
 *
 * The search/replace lists are built from the 'utf8_fix' data set the
 * first time they are needed and then kept in static locals.
 *
 * @param string $str The input string.
 *
 * @return string The string after the replacements.
 */
private function fixSimpleUtf8($str)
{
    static $brokenSequences = null;
    static $fixedSequences = null;

    if ($str === '') {
        return '';
    }

    if ($brokenSequences === null) {
        // Lazy-load the replacement table on first use.
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $brokenSequences = array_keys($this->BROKEN_UTF8_FIX);
        $fixedSequences = array_values($this->BROKEN_UTF8_FIX);
    }

    return str_replace($brokenSequences, $fixedSequences, $str);
}
||
187 | |||
188 | 2 | private function getData($file) |
|
193 | |||
/**
 * Decode HTML entities (named, decimal and hexadecimal) in a string.
 *
 * Decoding is repeated until the string no longer changes, so nested
 * entities such as "&amp;amp;" are resolved completely.
 *
 * @param string   $str      The input string.
 * @param int|null $flags    html_entity_decode() flags; null selects
 *                           ENT_QUOTES | ENT_HTML5.
 * @param string   $encoding Target encoding, normalised unless it is
 *                           already 'UTF-8' or 'CP850'.
 *
 * @return string The decoded string.
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    // Shorter than 4 bytes (examples: "&;" or "&x;") or no "&" at all:
    // there is nothing to decode.
    if (!isset($str[3]) || strpos($str, '&') === false) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    $isExoticEncoding = $encoding !== 'UTF-8'
        && $encoding !== 'ISO-8859-1'
        && $encoding !== 'WINDOWS-1252';
    if ($isExoticEncoding && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $convMap = [0x80, 0xfffff, 0, 0xfffff, 0];

    /**
     * Fallback decoder for a single decimal entity; quotes stay encoded.
     *
     * @param string[] $matches
     *
     * @return string
     */
    $decodeDecimalEntity = static function ($matches) use ($encoding) {
        $returnTmp = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');
        if ($returnTmp !== '"' && $returnTmp !== "'") {
            return $returnTmp;
        }

        return $matches[0];
    };

    do {
        $previous = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            $str = $encoding === 'UTF-8'
                ? mb_decode_numericentity($str, $convMap)
                : mb_decode_numericentity($str, $convMap, $encoding);
        } else {
            $str = (string)preg_replace_callback("/&#\d{2,6};/", $decodeDecimalEntity, $str);
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // decode also numeric & UTF16 two byte entities
                // (adds the missing ";" so html_entity_decode() accepts them)
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($previous !== $str);

    return $str;
}
||
258 | |||
/**
 * Map an encoding name to its canonical spelling.
 *
 * Well-known aliases are resolved through a fixed table; names found in
 * the 'encodings' data set are returned as given; anything else is
 * returned upper-cased. Resolved names are memoised in a static cache.
 *
 * @param mixed  $encoding The encoding name to normalise (cast to string).
 * @param string $fallback Returned when $encoding is empty, '0' or '1'.
 *
 * @return string The normalised encoding name, or $fallback.
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $resolvedNames = [];

    $encoding = (string)$encoding;

    // '' and '0' are both falsy and yield the fallback.
    if (!$encoding) {
        return $fallback;
    }

    if (in_array($encoding, ['UTF-8', 'UTF8'], true)) {
        return 'UTF-8';
    }

    if (in_array($encoding, ['8BIT', 'BINARY'], true)) {
        return 'CP850';
    }

    if (in_array($encoding, ['HTML', 'HTML-ENTITIES'], true)) {
        return 'HTML-ENTITIES';
    }

    // only a fallback, for non "strict_types" usage ...
    if (in_array($encoding, ['1', '0'], true)) {
        return $fallback;
    }

    if (isset($resolvedNames[$encoding])) {
        return $resolvedNames[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    // Exact match against the known encodings: keep the name untouched.
    if (in_array($encoding, $this->ENCODINGS, true)) {
        $resolvedNames[$encoding] = $encoding;

        return $encoding;
    }

    // Alias table, keyed by the upper-cased name stripped of punctuation.
    $equivalences = [
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5', // Cyrillic
        'ISO88596'    => 'ISO-8859-6', // Arabic
        'ISO88597'    => 'ISO-8859-7', // Greek
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    ];

    $upperCased = strtoupper($encoding);
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $upperCased);

    // Unknown names fall through as their upper-cased form.
    $normalized = !empty($equivalences[$aliasKey])
        ? $equivalences[$aliasKey]
        : $upperCased;

    $resolvedNames[$encoding] = $normalized;

    return $normalized;
}
||
382 | |||
/**
 * Convert a string (or, recursively, every element of an array) to UTF-8.
 *
 * Byte sequences that already look like well-formed 2-, 3- or 4-byte
 * UTF-8 are copied unchanged; every other byte >= 0x80 is converted via
 * toUtf8ConvertHelper(). Afterwards "\uXXXX" escape sequences, including
 * surrogate pairs, are replaced by the characters they denote.
 *
 * @param string|array $str The input string, or an array of inputs.
 *
 * @return string|array The converted value; '' if the escape-sequence
 *                      replacement fails.
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->toUtf8($value);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $max = \strlen($str);
    $buf = '';

    for ($i = 0; $i < $max; ++$i) {
        $c1 = $str[$i];

        if ($c1 >= "\xC0") { // should be converted to UTF8, if it's not UTF8 already

            if ($c1 <= "\xDF") { // looks like 2 bytes UTF8

                // A missing trailing byte is treated as NUL, which never
                // qualifies as a continuation byte.
                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];

                if ($c2 >= "\x80" && $c2 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2;
                    ++$i;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } elseif ($c1 >= "\xE0" && $c1 <= "\xEF") { // looks like 3 bytes UTF8

                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $str[$i + 2];

                if ($c2 >= "\x80" && $c2 <= "\xBF" && $c3 >= "\x80" && $c3 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2 . $c3;
                    $i += 2;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } elseif ($c1 >= "\xF0" && $c1 <= "\xF7") { // looks like 4 bytes UTF8

                $c2 = $i + 1 >= $max ? "\x00" : $str[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $str[$i + 2];
                $c4 = $i + 3 >= $max ? "\x00" : $str[$i + 3];

                if ($c2 >= "\x80" && $c2 <= "\xBF" && $c3 >= "\x80" && $c3 <= "\xBF" && $c4 >= "\x80" && $c4 <= "\xBF") { // yeah, almost sure it's UTF8 already
                    $buf .= $c1 . $c2 . $c3 . $c4;
                    $i += 3;
                } else { // not valid UTF8 - convert it
                    $buf .= $this->toUtf8ConvertHelper($c1);
                }
            } else { // doesn't look like UTF8, but should be converted

                $buf .= $this->toUtf8ConvertHelper($c1);
            }
        } elseif (($c1 & "\xC0") === "\x80") { // stray continuation byte - needs conversion

            $buf .= $this->toUtf8ConvertHelper($c1);
        } else { // plain ASCII - it doesn't need conversion

            $buf .= $c1;
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $buf = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                // single "\uXXXX" escape
                $cp = (int)hexdec($matches[3]);
            } else {
                // surrogate pair: http://unicode.org/faq/utf_bom.html#utf16-4
                $cp = ((int)hexdec($matches[1]) << 10)
                      + (int)hexdec($matches[2])
                      + 0x10000
                      - (0xD800 << 10)
                      - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($cp < 0x80) {
                return (string)$this->chr($cp);
            }

            if ($cp < 0xA0) {
                // Build the two UTF-8 *bytes* with the native chr().
                // $this->chr() returns a complete UTF-8 character for
                // values above 127 and would yield four wrong bytes here.
                return \chr(0xC0 | ($cp >> 6)) . \chr(0x80 | ($cp & 0x3F));
            }

            return $this->decimalToChr($cp);
        },
        $buf
    );

    // preg_replace_callback() yields null on a PCRE error.
    if ($buf === null) {
        return '';
    }

    return $buf;
}
||
498 | |||
/**
 * Convert a single non-UTF-8 byte into a UTF-8 byte sequence.
 *
 * Bytes listed in the 'win1252_to_utf8' data set are replaced by their
 * mapped sequence; every other byte is expanded into a two-byte sequence
 * (lead byte 0xC0 | top two bits, continuation byte 0x80 | low six bits).
 *
 * @param string $input A single byte.
 *
 * @return string The UTF-8 byte sequence for that byte.
 */
private function toUtf8ConvertHelper($input)
{
    // Lazy-load the lookup tables on first use.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];
    if (isset($this->WIN1252_TO_UTF8[$ordC1])) { // found in Windows-1252 special cases
        return $this->WIN1252_TO_UTF8[$ordC1];
    }

    // Truncate explicitly: a fractional float used as an array key is
    // deprecated since PHP 8.1 ("Implicit conversion from float to int").
    $cc1 = $this->CHR[(int)($ordC1 / 64)] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
||
527 | |||
/**
 * Return the character for a Unicode code point.
 *
 * Code points up to 127 are read from the 'chr' data set; larger ones use
 * IntlChar when available and otherwise a manual UTF-8 encoder. Results
 * are memoised in a static cache keyed by code point and encoding.
 *
 * @param int    $code_point The Unicode code point.
 * @param string $encoding   Target encoding, normalised unless it is
 *                           already 'UTF-8' or 'CP850'.
 *
 * @return string The character, passed through encode() when the
 *                encoding is not 'UTF-8'.
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    // init
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"

        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        // Fully qualified, like \IntlChar::ord() in ord(), so the lookup
        // also works if this class lives in a namespace.
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // two-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // three-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // four-byte sequence
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
               $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
||
622 | |||
/**
 * Encode a string into the given target "encoding".
 *
 * Only the pseudo-encodings 'JSON', 'BASE64' and 'HTML-ENTITIES' are
 * actually converted here. For every other target the string is returned
 * unchanged; a warning is raised when mbstring is unavailable and the
 * target is not UTF-8, ISO-8859-1 or WINDOWS-1252.
 *
 * @param string $toEncoding Target encoding, normalised unless it is
 *                           already 'UTF-8' or 'CP850'.
 * @param string $str        The input string.
 *
 * @return string The encoded (or untouched) string.
 *
 * @throws \InvalidArgumentException If JSON encoding of $str fails.
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', \E_USER_WARNING);
    }

    // NOTE(review): no charset conversion is performed for other targets -
    // the input is handed back as-is. Confirm that this is intended.
    return $str;
}
||
721 | |||
722 | private function jsonEncode($value) |
||
733 | |||
/**
 * Normalise a value to UTF-8 in the given Unicode normalization form.
 *
 * Arrays and objects are processed recursively, element by element or
 * property by property. Strings get their line endings normalised when
 * they contain "\r"; non-ASCII strings are then normalised. Other types
 * are returned untouched.
 *
 * @param mixed  $var                The value to filter.
 * @param int    $normalization_form A \Normalizer form constant.
 * @param string $leading_combining  Prefix prepended when the result
 *                                   would start with a combining mark.
 *
 * @return mixed The filtered value.
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            // Write the filtered value back to the object itself; the
            // result was previously assigned to an undefined local and
            // silently discarded.
            foreach ($var as $key => $value) {
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':

            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // Non-empty sentinel so the isset($n[0]) check below passes.
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // Normalization produced nothing usable.
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
||
789 | |||
790 | private function normalizeLineEnding($str) |
||
794 | |||
795 | private function isAscii($str) |
||
803 | |||
/**
 * Convert the characters of a string into numeric HTML entities.
 *
 * With mbstring the conversion is done by mb_encode_numericentity();
 * otherwise the string is split into characters and each one is passed
 * to singleChrHtmlEncode().
 *
 * @param string $str            The input string.
 * @param bool   $keepAsciiChars When true, only code points from 0x80
 *                               upwards are encoded.
 * @param string $encoding       Source encoding, normalised unless it is
 *                               already 'UTF-8' or 'CP850'.
 *
 * @return string The entity-encoded string.
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($this->SUPPORT['mbstring'] !== true) {
        // Fallback without mbstring: encode character by character.
        $encodeOne = function (string $chr) use ($keepAsciiChars, $encoding) {
            return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
        };

        return implode('', \array_map($encodeOne, $this->strSplit($str)));
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    $startCode = $keepAsciiChars === true ? 0x80 : 0x00;
    $convMap = [$startCode, 0xfffff, 0, 0xfffff, 0];

    if ($encoding === 'UTF-8') {
        return mb_encode_numericentity($str, $convMap);
    }

    return mb_encode_numericentity($str, $convMap, $encoding);
}
||
845 | |||
846 | private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8') |
||
858 | |||
/**
 * Return the code point of the first character of a string.
 *
 * Lookup order: static cache, the 'ord' data set, IntlChar (when
 * available and it returns a truthy code), then manual decoding of up to
 * four bytes. Every result is memoised per character and encoding.
 *
 * @param string $chr      The character (cast to string).
 * @param string $encoding Encoding of $chr, normalised unless it is
 *                         already 'UTF-8' or 'CP850'.
 *
 * @return int The code point; 0 if no bytes could be unpacked.
 */
private function ord($chr, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    $cacheKey = $chr . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    // check again, if it's still not UTF-8
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if (isset($this->ORD[$chr])) {
        return $CHAR_CACHE[$cacheKey] = $this->ORD[$chr];
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $intlCode = \IntlChar::ord($chr);
        if ($intlCode) {
            return $CHAR_CACHE[$cacheKey] = $intlCode;
        }
    }

    //
    // fallback via vanilla php
    //

    // 1-based list of the first (up to) four byte values.
    $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
        // four-byte sequence
        $code = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
        // three-byte sequence
        $code = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
        // two-byte sequence
        $code = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
    } else {
        // single byte (or an incomplete sequence): use the byte value
        $code = $lead;
    }

    return $CHAR_CACHE[$cacheKey] = $code;
}
||
925 | |||
/**
 * Split a string into an array of characters, or of chunks of $length characters.
 *
 * Arrays are processed recursively, element by element. Three strategies are
 * used, in this order: mbstring (when allowed and flagged as supported), a
 * PCRE "/./us" match (when flagged as supported), and a byte-wise UTF-8 scanner.
 * The byte-wise scanner silently skips lead bytes whose continuation bytes are
 * missing or malformed.
 *
 * @param mixed $str                The input string, or an array of input strings.
 * @param int   $length             Characters per chunk; values <= 0 yield an empty array.
 * @param bool  $cleanUtf8          Run the input through clean() first.
 * @param bool  $tryToUseMbFunction Allow the mbstring based strategy.
 *
 * @return array List of characters / chunks (nested when an array was passed in).
 */
private function strSplit($str, $length = 1, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length, $cleanUtf8, $tryToUseMbFunction);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return [];
    }

    if ($cleanUtf8 === true) {
        $str = $this->clean($str);
    }

    // Shared PCRE strategy: one match per code point ("s" lets "." match newlines).
    $splitViaPcre = static function ($subject) {
        $matches = [];
        preg_match_all('/./us', $subject, $matches);

        return isset($matches[0]) ? $matches[0] : [];
    };

    if ($tryToUseMbFunction === true && $this->SUPPORT['mbstring'] === true) {
        $iMax = \mb_strlen($str);
        if ($iMax <= 127) {
            $ret = [];
            for ($i = 0; $i < $iMax; ++$i) {
                $ret[] = \mb_substr($str, $i, 1);
            }
        } else {
            // Longer strings are handed to PCRE instead of repeated mb_substr() calls.
            $ret = $splitViaPcre($str);
        }
    } elseif ($this->SUPPORT['pcre_utf8'] === true) {
        $ret = $splitViaPcre($str);
    } else {
        // Byte-wise fallback: classify each lead byte and validate its continuation bytes.
        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            } elseif (isset($str[$i + 1]) && ($str[$i] & "\xE0") === "\xC0") {
                // 110xxxxx 10xxxxxx
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (isset($str[$i + 2]) && ($str[$i] & "\xF0") === "\xE0") {
                // 1110xxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (isset($str[$i + 3]) && ($str[$i] & "\xF8") === "\xF0") {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        // The callback takes its argument by value: array_map() passes values,
        // and a by-reference parameter triggers a warning on current PHP versions.
        return array_map(
            static function ($item) {
                return implode('', $item);
            },
            \array_chunk($ret, $length)
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
||
1039 | |||
/**
 * Strip byte sequences that are not shaped like UTF-8, then apply optional clean-ups.
 *
 * The optional steps run in a fixed order: replacement-character removal,
 * invisible-character removal, whitespace normalization, MS Word punctuation
 * normalization and finally BOM removal. Each flag must be exactly true to
 * enable its step.
 *
 * @param string $str                           The input string.
 * @param bool   $remove_bom                    Remove byte order marks.
 * @param bool   $normalize_whitespace          Normalize whitespace characters.
 * @param bool   $normalize_msword              Replace typographic quotes, dashes and ellipsis.
 * @param bool   $keep_non_breaking_space       Passed to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark Remove U+FFFD replacement characters.
 * @param bool   $remove_invisible_characters   Remove control characters (enabled by default).
 *
 * @return string The cleaned string.
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // The run of valid sequences is capped at 100 per match because an unbounded
    // repetition caused connection resets on larger strings.
    $utf8ShapePattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // Keep only group 1 (the valid runs); stray bytes matched by groups 2 and 3 vanish.
    $cleaned = (string)preg_replace($utf8ShapePattern, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
||
1080 | |||
/**
 * Replace the Unicode replacement character U+FFFD (the "diamond question mark").
 *
 * With $processInvalidUtf8 enabled, invalid byte sequences are handled as well:
 * the string is round-tripped through mb_convert_encoding() so that invalid
 * bytes are either dropped (empty replacement) or turned into U+FFFD, which the
 * final str_replace() then maps to $replacementChar.
 *
 * @param string $str                The input string.
 * @param string $replacementChar    Replacement text; '' removes the character.
 * @param bool   $processInvalidUtf8 Also process invalid UTF-8 byte sequences.
 *
 * @return string The string without U+FFFD characters.
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        // mb_substitute_character() only accepts "none", "long", "entity" or an
        // integer code point. Passing the raw replacement string is rejected
        // (a ValueError on PHP 8), so invalid bytes are substituted with U+FFFD
        // here and mapped to the requested replacement below.
        $substitute = $replacementChar === '' ? 'none' : 0xFFFD;

        $previousSubstitute = \mb_substitute_character();
        \mb_substitute_character($substitute);
        // the polyfill maybe return false, so cast to string
        $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        \mb_substitute_character($previousSubstitute);
    }

    // "\xEF\xBF\xBD" is the UTF-8 encoding of U+FFFD.
    return str_replace("\xEF\xBF\xBD", $replacementChar, $str);
}
||
1118 | |||
/**
 * Remove control characters, except newline (10), carriage return (13) and horizontal tab (9).
 *
 * Replacement is repeated until a pass changes nothing, so sequences that only
 * appear after an earlier removal (e.g. "%%0000") are caught as well.
 *
 * @param string $str         The input string.
 * @param bool   $url_encoded Also remove the URL-encoded forms (%00-%08, %0B, %0C, %0E-%1F).
 * @param string $replacement Text inserted in place of each removed match.
 *
 * @return string The filtered string.
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = $url_encoded
        ? [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/',   // url encoded 16-31
        ]
        : [];

    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    while (true) {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $replacedCount);

        if ($replacedCount === 0) {
            break;
        }
    }

    return $str;
}
||
1139 | |||
/**
 * Replace every character listed in the whitespace table with a plain space.
 *
 * Unless told otherwise, the bidirectional control characters from
 * $BIDI_UNI_CODE_CONTROLS_TABLE are removed first. Both lookup lists are
 * memoized in static variables.
 *
 * @param string $str                     The input string.
 * @param bool   $keepNonBreakingSpace    Leave the table entry "NO-BREAK SPACE" untouched.
 * @param bool   $keepBidiUnicodeControls Do not strip bidirectional control characters.
 *
 * @return string The normalized string.
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    if ($str === '') {
        return '';
    }

    $cacheKey = (int)$keepNonBreakingSpace;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        // NOTE(review): WHITESPACE_TABLE is declared outside this block; it is
        // presumably keyed by character name, "NO-BREAK SPACE" being the only
        // key referenced here.
        $table = $this->WHITESPACE_TABLE;

        if ($keepNonBreakingSpace === true) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
||
1171 | |||
/**
 * Replace typographic punctuation (as produced by MS Word) with ASCII equivalents.
 *
 * Guillemets and double quotes become ", single quotes and single angle
 * quotes become ', en/em dashes become - and the ellipsis becomes three dots.
 *
 * @param string $str The input string.
 *
 * @return string The string with ASCII punctuation.
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 byte sequence => ASCII replacement
    $asciiByTypographic = [
        "\xc2\xab"     => '"',   // U+00AB left-pointing double angle quotation mark
        "\xc2\xbb"     => '"',   // U+00BB right-pointing double angle quotation mark
        "\xe2\x80\x98" => "'",   // U+2018 left single quotation mark
        "\xe2\x80\x99" => "'",   // U+2019 right single quotation mark
        "\xe2\x80\x9a" => "'",   // U+201A single low-9 quotation mark
        "\xe2\x80\x9b" => "'",   // U+201B single high-reversed-9 quotation mark
        "\xe2\x80\x9c" => '"',   // U+201C left double quotation mark
        "\xe2\x80\x9d" => '"',   // U+201D right double quotation mark
        "\xe2\x80\x9e" => '"',   // U+201E double low-9 quotation mark
        "\xe2\x80\x9f" => '"',   // U+201F double high-reversed-9 quotation mark
        "\xe2\x80\xb9" => "'",   // U+2039 single left-pointing angle quotation mark
        "\xe2\x80\xba" => "'",   // U+203A single right-pointing angle quotation mark
        "\xe2\x80\x93" => '-',   // U+2013 en dash
        "\xe2\x80\x94" => '-',   // U+2014 em dash
        "\xe2\x80\xa6" => '...', // U+2026 horizontal ellipsis
    ];

    return str_replace(array_keys($asciiByTypographic), array_values($asciiByTypographic), $str);
}
||
1216 | |||
/**
 * Strip leading byte order marks from a string.
 *
 * Every entry of $this->BOM (marker => number of bytes to drop) is tested once,
 * in table order, against the current start of the string, so several stacked
 * markers can be removed in a single call.
 *
 * @param string $str The input string.
 *
 * @return string The string without the leading byte order mark(s).
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    foreach ($this->BOM as $bomString => $bomByteLength) {
        $bomString = (string)$bomString;

        // An empty marker would "match" at offset 0 on PHP 8 (and raise a
        // warning on older versions), cutting bytes off every input.
        if ($bomString === '') {
            continue;
        }

        if (strpos($str, $bomString, 0) !== 0) {
            continue;
        }

        $remainder = \substr($str, (int)$bomByteLength);
        if ($remainder === false) {
            // Older PHP versions return false when nothing is left after the marker.
            return '';
        }

        $str = (string)$remainder;
    }

    return $str;
}
||
1238 | |||
1239 | // private function str_detect_encoding($str) |
||
1240 | // { |
||
1241 | // // init |
||
1242 | // $str = (string)$str; |
||
1243 | // |
||
1244 | // // |
||
1245 | // // 1.) check binary strings (010001001...) like UTF-16 / UTF-32 / PDF / Images / ... |
||
1246 | // // |
||
1247 | // |
||
1248 | // if ($this->is_binary($str, true) === true) { |
||
1249 | // $isUtf16 = $this->is_utf16($str, false); |
||
1250 | // if ($isUtf16 === 1) { |
||
1251 | // return 'UTF-16LE'; |
||
1252 | // } |
||
1253 | // if ($isUtf16 === 2) { |
||
1254 | // return 'UTF-16BE'; |
||
1255 | // } |
||
1256 | // |
||
1257 | // $isUtf32 = $this->is_utf32($str, false); |
||
1258 | // if ($isUtf32 === 1) { |
||
1259 | // return 'UTF-32LE'; |
||
1260 | // } |
||
1261 | // if ($isUtf32 === 2) { |
||
1262 | // return 'UTF-32BE'; |
||
1263 | // } |
||
1264 | // |
||
1265 | // // is binary but not "UTF-16" or "UTF-32" |
||
1266 | // return false; |
||
1267 | // } |
||
1268 | // |
||
1269 | // // |
||
1270 | // // 2.) simple check for ASCII chars |
||
1271 | // // |
||
1272 | // |
||
1273 | // if ($this->isAscii($str) === true) { |
||
1274 | // return 'ASCII'; |
||
1275 | // } |
||
1276 | // |
||
1277 | // // |
||
1278 | // // 3.) simple check for UTF-8 chars |
||
1279 | // // |
||
1280 | // |
||
1281 | // if ($this->isUtf8($str) === true) { |
||
1282 | // return 'UTF-8'; |
||
1283 | // } |
||
1284 | // |
||
1285 | // // |
||
1286 | // // 4.) check via "mb_detect_encoding()" |
||
1287 | // // |
||
1288 | // // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "mb_detect_encoding()" |
||
1289 | // |
||
1290 | // $detectOrder = [ |
||
1291 | // 'ISO-8859-1', |
||
1292 | // 'ISO-8859-2', |
||
1293 | // 'ISO-8859-3', |
||
1294 | // 'ISO-8859-4', |
||
1295 | // 'ISO-8859-5', |
||
1296 | // 'ISO-8859-6', |
||
1297 | // 'ISO-8859-7', |
||
1298 | // 'ISO-8859-8', |
||
1299 | // 'ISO-8859-9', |
||
1300 | // 'ISO-8859-10', |
||
1301 | // 'ISO-8859-13', |
||
1302 | // 'ISO-8859-14', |
||
1303 | // 'ISO-8859-15', |
||
1304 | // 'ISO-8859-16', |
||
1305 | // 'WINDOWS-1251', |
||
1306 | // 'WINDOWS-1252', |
||
1307 | // 'WINDOWS-1254', |
||
1308 | // 'CP932', |
||
1309 | // 'CP936', |
||
1310 | // 'CP950', |
||
1311 | // 'CP866', |
||
1312 | // 'CP850', |
||
1313 | // 'CP51932', |
||
1314 | // 'CP50220', |
||
1315 | // 'CP50221', |
||
1316 | // 'CP50222', |
||
1317 | // 'ISO-2022-JP', |
||
1318 | // 'ISO-2022-KR', |
||
1319 | // 'JIS', |
||
1320 | // 'JIS-ms', |
||
1321 | // 'EUC-CN', |
||
1322 | // 'EUC-JP', |
||
1323 | // ]; |
||
1324 | // |
||
1325 | // if ($this->SUPPORT['mbstring'] === true) { |
||
1326 | // // info: do not use the symfony polyfill here |
||
1327 | // $encoding = \mb_detect_encoding($str, $detectOrder, true); |
||
1328 | // if ($encoding) { |
||
1329 | // return $encoding; |
||
1330 | // } |
||
1331 | // } |
||
1332 | // |
||
1333 | // // |
||
1334 | // // 5.) check via "iconv()" |
||
1335 | // // |
||
1336 | // |
||
1337 | // if ($this->ENCODINGS === null) { |
||
1338 | // $this->ENCODINGS = $this->getData('encodings'); |
||
1339 | // } |
||
1340 | // |
||
1341 | // foreach ($this->ENCODINGS as $encodingTmp) { |
||
1342 | // // INFO: //IGNORE but still throw notice |
||
1343 | // /** @noinspection PhpUsageOfSilenceOperatorInspection */ |
||
1344 | // if ((string)@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $str) === $str) { |
||
1345 | // return $encodingTmp; |
||
1346 | // } |
||
1347 | // } |
||
1348 | // |
||
1349 | // return false; |
||
1350 | // } |
||
1351 | |||
1352 | private function decimalToChr($int) |
||
1356 | // |
||
1357 | // private function is_utf16($str, $checkIfStringIsBinary = true) |
||
1358 | // { |
||
1359 | // |
||
1360 | // // init |
||
1361 | // $str = (string)$str; |
||
1362 | // $strChars = []; |
||
1363 | // |
||
1364 | // if ( |
||
1365 | // $checkIfStringIsBinary === true |
||
1366 | // && |
||
1367 | // $this->is_binary($str, true) === false |
||
1368 | // ) { |
||
1369 | // return false; |
||
1370 | // } |
||
1371 | // |
||
1372 | // if ($this->SUPPORT['mbstring'] === false) { |
||
1373 | // \trigger_error('UTF8::is_utf16() without mbstring may did not work correctly', \E_USER_WARNING); |
||
1374 | // } |
||
1375 | // |
||
1376 | // $str = $this->remove_bom($str); |
||
1377 | // |
||
1378 | // |
||
1379 | // $maybeUTF16LE = 0; |
||
1380 | // $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE'); |
||
1381 | // if ($test) { |
||
1382 | // $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8'); |
||
1383 | // $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE'); |
||
1384 | // if ($test3 === $test) { |
||
1385 | // if (\count($strChars) === 0) { |
||
1386 | // $strChars = $this->count_chars($str, true, false); |
||
1387 | // } |
||
1388 | // $countChars = $this->count_chars($test3); |
||
1389 | // foreach ($countChars as $test3char => $test3charEmpty) { |
||
1390 | // if (\in_array($test3char, $strChars, true) === true) { |
||
1391 | // ++$maybeUTF16LE; |
||
1392 | // } |
||
1393 | // unset($countChars[$test3char]); |
||
1394 | // } |
||
1395 | // } |
||
1396 | // } |
||
1397 | // |
||
1398 | // $maybeUTF16BE = 0; |
||
1399 | // $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE'); |
||
1400 | // if ($test) { |
||
1401 | // $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8'); |
||
1402 | // $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE'); |
||
1403 | // if ($test3 === $test) { |
||
1404 | // if (\count($strChars) === 0) { |
||
1405 | // $strChars = $this->count_chars($str, true, false); |
||
1406 | // } |
||
1407 | // $countChars = $this->count_chars($test3); |
||
1408 | // foreach ($countChars as $test3char => $test3charEmpty) { |
||
1409 | // if (\in_array($test3char, $strChars, true) === true) { |
||
1410 | // ++$maybeUTF16BE; |
||
1411 | // } |
||
1412 | // unset($countChars[$test3char]); |
||
1413 | // } |
||
1414 | // |
||
1415 | // } |
||
1416 | // } |
||
1417 | // |
||
1418 | // if ($maybeUTF16BE !== $maybeUTF16LE) { |
||
1419 | // if ($maybeUTF16LE > $maybeUTF16BE) { |
||
1420 | // return 1; |
||
1421 | // } |
||
1422 | // |
||
1423 | // return 2; |
||
1424 | // } |
||
1425 | // |
||
1426 | // return false; |
||
1427 | // } |
||
1428 | |||
/**
 * Check if the string is UTF-32.
 *
 * Both byte orders are scored by round-tripping the input through UTF-8; the
 * byte order with the higher score wins, and a tie means "not UTF-32".
 *
 * @param mixed $str                   <p>The input string.</p>
 * @param bool  $checkIfStringIsBinary <p>Bail out early unless is_binary() accepts the input.</p>
 *
 * @return false|int
 *                   <strong>false</strong> if it is not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE
 */
private function is_utf32($str, $checkIfStringIsBinary = true)
{
    $str = (string)$str;

    if ($checkIfStringIsBinary === true && $this->is_binary($str, true) === false) {
        return false;
    }

    if ($this->SUPPORT['mbstring'] === false) {
        \trigger_error('UTF8::is_utf32() without mbstring may did not work correctly', \E_USER_WARNING);
    }

    $str = $this->remove_bom($str);

    // Character statistics of the raw input; filled lazily by the first
    // successful round trip and shared between both candidates.
    $rawChars = [];

    $scoreLE = $this->scoreUtf32Candidate($str, 'UTF-32LE', $rawChars);
    $scoreBE = $this->scoreUtf32Candidate($str, 'UTF-32BE', $rawChars);

    if ($scoreLE === $scoreBE) {
        return false;
    }

    return $scoreLE > $scoreBE ? 1 : 2;
}

/**
 * Score how plausible it is that $str is encoded in the given UTF-32 variant.
 *
 * @param string $str      The raw input (BOM already removed).
 * @param string $encoding "UTF-32LE" or "UTF-32BE".
 * @param array  $rawChars Result of count_chars($str, true, false); computed here while still empty.
 *
 * @return int Number of characters of the decoded text that are found in $rawChars;
 *             0 when decoding yields nothing or the round trip is not stable.
 */
private function scoreUtf32Candidate($str, $encoding, array &$rawChars)
{
    $decoded = \mb_convert_encoding($str, 'UTF-8', $encoding);
    if (!$decoded) {
        return 0;
    }

    // Encode back and decode again: a stable result means the candidate is consistent.
    $reEncoded = \mb_convert_encoding($decoded, $encoding, 'UTF-8');
    $decodedAgain = \mb_convert_encoding($reEncoded, 'UTF-8', $encoding);
    if ($decodedAgain !== $decoded) {
        return 0;
    }

    if (\count($rawChars) === 0) {
        $rawChars = $this->count_chars($str, true, false);
    }

    $score = 0;
    foreach ($this->count_chars($decodedAgain) as $char => $unused) {
        if (\in_array($char, $rawChars, true) === true) {
            ++$score;
        }
    }

    return $score;
}
||
1504 | |||
/**
 * Heuristically decide whether the input is binary data.
 *
 * The input counts as binary when it consists only of the digits 0 and 1,
 * when get_file_type() recognizes a binary signature, when more than a
 * quarter of its bytes are NUL, or (strict mode only) when fileinfo reports
 * the MIME encoding "binary".
 *
 * @param mixed $input  The data to inspect; cast to string.
 * @param bool  $strict Additionally consult the fileinfo extension.
 *
 * @throws \RuntimeException In strict mode when fileinfo is flagged as unavailable
 *                           and no earlier check has already matched.
 *
 * @return bool
 */
private function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if ($input === '') {
        return false;
    }

    // A "010001001..." style bit string.
    if (preg_match('~^[01]+$~', $input)) {
        return true;
    }

    $fileType = $this->get_file_type($input);
    if ($fileType['type'] === 'binary') {
        return true;
    }

    $byteCount = \strlen($input);
    $nullRatio = \substr_count($input, "\x0", 0, $byteCount) / $byteCount;
    if ($nullRatio > 0.25) {
        return true;
    }

    if ($strict !== true) {
        return false;
    }

    if ($this->SUPPORT['finfo'] === false) {
        throw new \RuntimeException('ext-fileinfo: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    $detectedEncoding = (new \finfo(\FILEINFO_MIME_ENCODING))->buffer($input);

    return $detectedEncoding === 'binary';
}
||
1541 | |||
/**
 * Guess the file type of a binary string from its first two bytes.
 *
 * The prefix is compared byte-for-byte against a table of known signatures.
 * Concatenating the decimal byte values instead is ambiguous: the bytes
 * 0x07 0xAD give "7173", the same as "GI", and would be reported as GIF.
 *
 * @param string $str      The (binary) input string.
 * @param array  $fallback Returned when the input is too short or the signature is unknown.
 *
 * @return array Array with the keys "ext", "mime" and "type"; "type" is
 *               "binary" for every recognized signature.
 */
private function get_file_type(
    $str,
    $fallback = [
        'ext'  => null,
        'mime' => 'application/octet-stream',
        'type' => null,
    ]
) {
    if ($str === '') {
        return $fallback;
    }

    $signature = \substr($str, 0, 2);
    if ($signature === false || \strlen($signature) !== 2) {
        return $fallback;
    }

    // two-byte signature => [extension, mime type]
    $knownSignatures = [
        '%P'       => ['pdf', 'application/pdf'],
        'MZ'       => ['exe', 'application/octet-stream'],
        'MT'       => ['midi', 'audio/x-midi'],
        'PK'       => ['zip', 'application/zip'],
        'Ra'       => ['rar', 'application/rar'],
        "\xFF\xD8" => ['jpg', 'image/jpeg'],
        'GI'       => ['gif', 'image/gif'],
        'BM'       => ['bmp', 'image/bmp'],
        "\x89P"    => ['png', 'image/png'],
    ];

    if (!isset($knownSignatures[$signature])) {
        return $fallback;
    }

    return [
        'ext'  => $knownSignatures[$signature][0],
        'mime' => $knownSignatures[$signature][1],
        'type' => 'binary',
    ];
}
||
1630 | |||
1631 | private function count_chars($str, $cleanUtf8 = false, $tryToUseMbFunction = true) |
||
1635 | |||
1636 | } |
||
1637 |
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, with the start of each new word marked by a capital letter. Thus the name "database connection string" becomes `databaseConnectionString`.