Complex classes like Utf8 often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Utf8, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 5 | class Utf8 extends Resources |
||
| 6 | { |
||
| 7 | |||
| 8 | private $system; |
||
| 9 | private $ENCODINGS; |
||
| 10 | private $SUPPORT = []; |
||
| 11 | private $BROKEN_UTF8_FIX; |
||
| 12 | private $ORD; |
||
| 13 | private $CHR; |
||
| 14 | private $WIN1252_TO_UTF8; |
||
| 15 | private $BOM = [ |
||
| 16 | "\xef\xbb\xbf" => 3, // UTF-8 BOM |
||
| 17 | '' => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...) |
||
| 18 | "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM |
||
| 19 | ' þÿ' => 6, // UTF-32 (BE) BOM as "WINDOWS-1252" |
||
| 20 | "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM |
||
| 21 | 'ÿþ ' => 6, // UTF-32 (LE) BOM as "WINDOWS-1252" |
||
| 22 | "\xfe\xff" => 2, // UTF-16 (BE) BOM |
||
| 23 | 'þÿ' => 4, // UTF-16 (BE) BOM as "WINDOWS-1252" |
||
| 24 | "\xff\xfe" => 2, // UTF-16 (LE) BOM |
||
| 25 | 'ÿþ' => 4, // UTF-16 (LE) BOM as "WINDOWS-1252" |
||
| 26 | ]; |
||
| 27 | |||
| 28 | private $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
||
| 29 | // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
||
| 30 | 8234 => "\xE2\x80\xAA", |
||
| 31 | // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
||
| 32 | 8235 => "\xE2\x80\xAB", |
||
| 33 | // POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
||
| 34 | 8236 => "\xE2\x80\xAC", |
||
| 35 | // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
||
| 36 | 8237 => "\xE2\x80\xAD", |
||
| 37 | // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
||
| 38 | 8238 => "\xE2\x80\xAE", |
||
| 39 | // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
||
| 40 | 8294 => "\xE2\x81\xA6", |
||
| 41 | // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
||
| 42 | 8295 => "\xE2\x81\xA7", |
||
| 43 | // FIRST STRONG ISOLATE // (use -> dir = "auto") |
||
| 44 | 8296 => "\xE2\x81\xA8", |
||
| 45 | // POP DIRECTIONAL ISOLATE |
||
| 46 | 8297 => "\xE2\x81\xA9", |
||
| 47 | ]; |
||
| 48 | |||
| 49 | /** |
||
| 50 | * @var array |
||
| 51 | */ |
||
| 52 | private $WHITESPACE_TABLE = [ |
||
| 53 | 'SPACE' => "\x20", |
||
| 54 | 'NO-BREAK SPACE' => "\xc2\xa0", |
||
| 55 | 'OGHAM SPACE MARK' => "\xe1\x9a\x80", |
||
| 56 | 'EN QUAD' => "\xe2\x80\x80", |
||
| 57 | 'EM QUAD' => "\xe2\x80\x81", |
||
| 58 | 'EN SPACE' => "\xe2\x80\x82", |
||
| 59 | 'EM SPACE' => "\xe2\x80\x83", |
||
| 60 | 'THREE-PER-EM SPACE' => "\xe2\x80\x84", |
||
| 61 | 'FOUR-PER-EM SPACE' => "\xe2\x80\x85", |
||
| 62 | 'SIX-PER-EM SPACE' => "\xe2\x80\x86", |
||
| 63 | 'FIGURE SPACE' => "\xe2\x80\x87", |
||
| 64 | 'PUNCTUATION SPACE' => "\xe2\x80\x88", |
||
| 65 | 'THIN SPACE' => "\xe2\x80\x89", |
||
| 66 | 'HAIR SPACE' => "\xe2\x80\x8a", |
||
| 67 | 'LINE SEPARATOR' => "\xe2\x80\xa8", |
||
| 68 | 'PARAGRAPH SEPARATOR' => "\xe2\x80\xa9", |
||
| 69 | 'ZERO WIDTH SPACE' => "\xe2\x80\x8b", |
||
| 70 | 'NARROW NO-BREAK SPACE' => "\xe2\x80\xaf", |
||
| 71 | 'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f", |
||
| 72 | 'IDEOGRAPHIC SPACE' => "\xe3\x80\x80", |
||
| 73 | ]; |
||
| 74 | |||
| 75 | 6 | function __construct() |
|
| 80 | |||
/**
 * Detect, once per instance, which optional extensions are available.
 *
 * Every probe result is written to $this->SUPPORT. The first call sets the
 * "already_checked_via_portable_utf8" marker, so later calls return at once.
 * When mbstring (or the symfony polyfill) is reported as available, the
 * mbstring internal encoding is switched to UTF-8 as a side effect.
 *
 * @return void
 */
private function checkForSupport()
{
    // Probing was already done for this instance.
    if (isset($this->SUPPORT['already_checked_via_portable_utf8'])) {
        return;
    }

    $this->SUPPORT['already_checked_via_portable_utf8'] = true;

    $env = $this->system;

    // http://php.net/manual/en/book.mbstring.php
    $this->SUPPORT['mbstring'] = $env->mbstring_loaded();
    $this->SUPPORT['mbstring_func_overload'] = $env->mbstring_overloaded();
    if ($this->SUPPORT['mbstring'] === true) {
        \mb_internal_encoding('UTF-8');
        /** @noinspection UnusedFunctionResultInspection */
        /** @noinspection PhpComposerExtensionStubsInspection */
        \mb_regex_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }

    // http://php.net/manual/en/book.iconv.php
    $this->SUPPORT['iconv'] = $env->iconv_loaded();

    // http://php.net/manual/en/book.intl.php
    $this->SUPPORT['intl'] = $env->intl_loaded();
    $this->SUPPORT['intl__transliterator_list_ids'] = [];

    $canListTransliterators = $this->SUPPORT['intl'] === true
        && \function_exists('transliterator_list_ids') === true;
    if ($canListTransliterators) {
        /** @noinspection PhpComposerExtensionStubsInspection */
        $this->SUPPORT['intl__transliterator_list_ids'] = \transliterator_list_ids();
    }

    // http://php.net/manual/en/class.intlchar.php
    $this->SUPPORT['intlChar'] = $env->intlChar_loaded();

    // http://php.net/manual/en/book.ctype.php
    $this->SUPPORT['ctype'] = $env->ctype_loaded();

    // http://php.net/manual/en/class.finfo.php
    $this->SUPPORT['finfo'] = $env->finfo_loaded();

    // http://php.net/manual/en/book.json.php
    $this->SUPPORT['json'] = $env->json_loaded();

    // http://php.net/manual/en/book.pcre.php
    $this->SUPPORT['pcre_utf8'] = $env->pcre_utf8_support();

    $this->SUPPORT['symfony_polyfill_used'] = $env->symfony_polyfill_used();
    if ($this->SUPPORT['symfony_polyfill_used'] === true) {
        \mb_internal_encoding('UTF-8');
        $this->SUPPORT['mbstring_internal_encoding'] = 'UTF-8';
    }
}
| 135 | |||
/**
 * Decode percent-encoded / entity-encoded input and repair simple UTF-8 damage.
 *
 * Strings that contain none of "&", "%", "+" or a literal "\u" are only run
 * through fixSimpleUtf8(). Otherwise "%uXXXX" sequences are rewritten to
 * hexadecimal HTML entities and, when $multi_decode is true, the string is
 * decoded repeatedly until a pass no longer changes it.
 *
 * NOTE(review): when $multi_decode is not true, no entity/percent decoding
 * pass is executed at all (apart from the "%u" rewrite) - confirm that this
 * is intended rather than a missing "decode exactly once" branch.
 *
 * @param string $str          the input string
 * @param bool   $multi_decode decode until the result is stable
 *
 * @return string
 */
public function rawurldecode($str, $multi_decode = true)
{
    if ($str === '') {
        return '';
    }

    // Is there anything in the string that could need decoding?
    $needsDecoding = false;
    foreach (['&', '%', '+', '\u'] as $marker) {
        if (strpos($str, $marker) !== false) {
            $needsDecoding = true;

            break;
        }
    }

    if ($needsDecoding === false) {
        return $this->fixSimpleUtf8($str);
    }

    // Non-standard "%uXXXX" escapes become hexadecimal HTML entities.
    $percentUPattern = '/%u([0-9a-fA-F]{3,4})/';
    if (preg_match($percentUPattern, $str)) {
        $str = (string)preg_replace($percentUPattern, '&#x\\1;', rawurldecode($str));
    }

    if ($multi_decode !== true) {
        return $str;
    }

    $entityFlags = \ENT_QUOTES | \ENT_HTML5;

    do {
        $previous = $str;

        /**
         * @psalm-suppress PossiblyInvalidArgument
         */
        $str = $this->fixSimpleUtf8(
            rawurldecode(
                $this->htmlEntityDecode($this->toUtf8($str), $entityFlags)
            )
        );
    } while ($previous !== $str);

    return $str;
}
| 166 | |||
/**
 * Replace known broken UTF-8 byte sequences with their repaired form.
 *
 * The search/replace lists come from the "utf8_fix" data set (loaded through
 * getData() on first use) and are kept in function-level statics, so the
 * key/value split is done only once per process.
 *
 * @param string $str the input string
 *
 * @return string
 */
private function fixSimpleUtf8($str)
{
    static $brokenSequences = null;
    static $fixedSequences = null;

    if ($str === '') {
        return '';
    }

    if ($brokenSequences === null) {
        // Lazy-load the mapping table on the instance.
        if ($this->BROKEN_UTF8_FIX === null) {
            $this->BROKEN_UTF8_FIX = $this->getData('utf8_fix');
        }

        $brokenSequences = array_keys($this->BROKEN_UTF8_FIX);
        $fixedSequences = array_values($this->BROKEN_UTF8_FIX);
    }

    return str_replace($brokenSequences, $fixedSequences, $str);
}
| 187 | |||
| 188 | 2 | private function getData($file) |
|
| 193 | |||
/**
 * Decode HTML entities (named and numeric) until the string stops changing.
 *
 * Inputs shorter than four bytes, or without any "&", are returned untouched.
 * Each pass first decodes numeric entities (via mbstring when it is reported
 * as available, otherwise through a regex callback), then appends a missing
 * ";" to unterminated numeric entities and finally calls html_entity_decode().
 *
 * @param string   $str      the input string
 * @param int|null $flags    html_entity_decode() flags; defaults to
 *                           ENT_QUOTES | ENT_HTML5 when null
 * @param string   $encoding target encoding, normalized unless it is already
 *                           "UTF-8" or "CP850"
 *
 * @return string
 */
private function htmlEntityDecode($str, $flags = null, $encoding = 'UTF-8')
{
    if (
        !isset($str[3]) // examples: &; || &x;
        ||
        strpos($str, '&') === false // no "&"
    ) {
        return $str;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($flags === null) {
        $flags = \ENT_QUOTES | \ENT_HTML5;
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::htmlEntityDecode() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    //
    // A conversion map is a list of 4-tuples (start, end, offset, mask). The
    // former 5-element map carried a stray trailing 0, which PHP 8 rejects
    // with a ValueError ("must have a multiple of 4 elements").
    $numericEntityMap = [0x80, 0xfffff, 0, 0xfffff];

    do {
        $previous = $str;

        if ($this->SUPPORT['mbstring'] === true) {
            // Without an explicit encoding mbstring uses its internal encoding.
            $str = $encoding === 'UTF-8'
                ? mb_decode_numericentity($str, $numericEntityMap)
                : mb_decode_numericentity($str, $numericEntityMap, $encoding);
        } else {
            $str = (string)preg_replace_callback(
                '/&#\d{2,6};/',
                /**
                 * @param string[] $matches
                 *
                 * @return string
                 */
                static function ($matches) use ($encoding) {
                    $decoded = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');

                    // Quote characters stay encoded here.
                    if ($decoded !== '"' && $decoded !== "'") {
                        return $decoded;
                    }

                    return $matches[0];
                },
                $str
            );
        }

        if (strpos($str, '&') !== false) {
            if (strpos($str, '&#') !== false) {
                // decode also numeric & UTF16 two byte entities
                $str = (string)preg_replace('/(&#(?:x0*[0-9a-fA-F]{2,6}(?![0-9a-fA-F;])|(?:0*\d{2,6}(?![0-9;]))))/S', '$1;', $str);
            }

            $str = html_entity_decode($str, $flags, $encoding);
        }
    } while ($previous !== $str);

    return $str;
}
| 258 | |||
/**
 * Map an encoding name or alias to the canonical name used by this class.
 *
 * Resolution order: empty input -> $fallback; a few fixed aliases; "1"/"0"
 * -> $fallback; the per-process result cache; an exact hit in the
 * "encodings" data set; finally the upper-cased name with everything except
 * letters, digits and whitespace stripped is looked up in an alias table.
 * Names that match nothing are returned upper-cased.
 *
 * @param mixed  $encoding the encoding name (cast to string)
 * @param string $fallback value returned for empty or boolean-like input
 *
 * @return string
 */
private function normalize_encoding($encoding, $fallback = '')
{
    static $resolvedCache = [];

    $encoding = (string)$encoding;

    if (!$encoding) {
        return $fallback;
    }

    // Aliases that are answered without touching the cache or the data set.
    $fixedAliases = [
        'UTF-8'         => 'UTF-8',
        'UTF8'          => 'UTF-8',
        '8BIT'          => 'CP850',
        'BINARY'        => 'CP850',
        'HTML'          => 'HTML-ENTITIES',
        'HTML-ENTITIES' => 'HTML-ENTITIES',
    ];
    if (isset($fixedAliases[$encoding])) {
        return $fixedAliases[$encoding];
    }

    // only a fallback, for non "strict_types" usage ...
    if (in_array($encoding, ['1', '0'], true)) {
        return $fallback;
    }

    if (isset($resolvedCache[$encoding])) {
        return $resolvedCache[$encoding];
    }

    if ($this->ENCODINGS === null) {
        $this->ENCODINGS = $this->getData('encodings');
    }

    if (in_array($encoding, $this->ENCODINGS, true)) {
        $resolvedCache[$encoding] = $encoding;

        return $encoding;
    }

    $requestedName = $encoding;
    $encoding = strtoupper($encoding);
    $aliasKey = (string)preg_replace('/[^a-zA-Z0-9\s]/u', '', $encoding);

    $aliasTable = [
        'ISO8859'     => 'ISO-8859-1',
        'ISO88591'    => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1', // Western European
        'ISO88592'    => 'ISO-8859-2',
        'LATIN2'      => 'ISO-8859-2', // Central European
        'ISO88593'    => 'ISO-8859-3',
        'LATIN3'      => 'ISO-8859-3', // Southern European
        'ISO88594'    => 'ISO-8859-4',
        'LATIN4'      => 'ISO-8859-4', // Northern European
        'ISO88595'    => 'ISO-8859-5', // Cyrillic
        'ISO88596'    => 'ISO-8859-6', // Arabic
        'ISO88597'    => 'ISO-8859-7', // Greek
        'ISO88598'    => 'ISO-8859-8', // Hebrew
        'ISO88599'    => 'ISO-8859-9',
        'LATIN5'      => 'ISO-8859-9', // Turkish
        'ISO885911'   => 'ISO-8859-11',
        'TIS620'      => 'ISO-8859-11', // Thai
        'ISO885910'   => 'ISO-8859-10',
        'LATIN6'      => 'ISO-8859-10', // Nordic
        'ISO885913'   => 'ISO-8859-13',
        'LATIN7'      => 'ISO-8859-13', // Baltic
        'ISO885914'   => 'ISO-8859-14',
        'LATIN8'      => 'ISO-8859-14', // Celtic
        'ISO885915'   => 'ISO-8859-15',
        'LATIN9'      => 'ISO-8859-15', // Western European (with some extra chars e.g. €)
        'ISO885916'   => 'ISO-8859-16',
        'LATIN10'     => 'ISO-8859-16', // Southeast European
        'CP1250'      => 'WINDOWS-1250',
        'WIN1250'     => 'WINDOWS-1250',
        'WINDOWS1250' => 'WINDOWS-1250',
        'CP1251'      => 'WINDOWS-1251',
        'WIN1251'     => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        'CP1252'      => 'WINDOWS-1252',
        'WIN1252'     => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'CP1253'      => 'WINDOWS-1253',
        'WIN1253'     => 'WINDOWS-1253',
        'WINDOWS1253' => 'WINDOWS-1253',
        'CP1254'      => 'WINDOWS-1254',
        'WIN1254'     => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        'CP1255'      => 'WINDOWS-1255',
        'WIN1255'     => 'WINDOWS-1255',
        'WINDOWS1255' => 'WINDOWS-1255',
        'CP1256'      => 'WINDOWS-1256',
        'WIN1256'     => 'WINDOWS-1256',
        'WINDOWS1256' => 'WINDOWS-1256',
        'CP1257'      => 'WINDOWS-1257',
        'WIN1257'     => 'WINDOWS-1257',
        'WINDOWS1257' => 'WINDOWS-1257',
        'CP1258'      => 'WINDOWS-1258',
        'WIN1258'     => 'WINDOWS-1258',
        'WINDOWS1258' => 'WINDOWS-1258',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    ];

    // Unknown names fall through as the upper-cased input.
    $encoding = $aliasTable[$aliasKey] ?? $encoding;

    // The cache is keyed by the name exactly as the caller spelled it.
    $resolvedCache[$requestedName] = $encoding;

    return $encoding;
}
| 382 | |||
/**
 * Convert a string (or, recursively, an array of strings) to UTF-8.
 *
 * The input is scanned byte by byte. Byte sequences that already look like
 * well-formed 2-, 3- or 4-byte UTF-8 are copied as they are; every other
 * byte >= 0x80 is converted through toUtf8ConvertHelper(). Afterwards
 * literal "\uXXXX" escape sequences, including surrogate pairs, are
 * replaced by the characters they denote.
 *
 * @param mixed $str a string, or an array whose values are converted in place
 *
 * @return string|array an empty string is returned if the escape-sequence
 *                      replacement fails
 */
private function toUtf8($str)
{
    if (is_array($str) === true) {
        foreach ($str as $index => $item) {
            $str[$index] = $this->toUtf8($item);
        }

        return $str;
    }

    $str = (string)$str;
    if ($str === '') {
        return $str;
    }

    $length = \strlen($str);
    $result = '';

    for ($pos = 0; $pos < $length; ++$pos) {
        $lead = $str[$pos];

        if ($lead < "\xC0") {
            // A lone continuation byte (10xxxxxx) needs conversion; anything
            // else below 0xC0 is copied unchanged.
            $result .= ($lead & "\xC0") === "\x80"
                ? $this->toUtf8ConvertHelper($lead)
                : $lead;

            continue;
        }

        // Number of continuation bytes the lead byte announces (0 = none,
        // i.e. the byte cannot start a UTF-8 sequence).
        if ($lead <= "\xDF") {
            $trailing = 1;
        } elseif ($lead <= "\xEF") {
            $trailing = 2;
        } elseif ($lead <= "\xF7") {
            $trailing = 3;
        } else {
            $trailing = 0;
        }

        $sequence = $lead;
        $looksLikeUtf8 = $trailing > 0;
        for ($offset = 1; $looksLikeUtf8 && $offset <= $trailing; ++$offset) {
            // Bytes past the end of the string count as "\x00" (never valid).
            $next = $pos + $offset >= $length ? "\x00" : $str[$pos + $offset];
            $looksLikeUtf8 = $next >= "\x80" && $next <= "\xBF";
            $sequence .= $next;
        }

        if ($looksLikeUtf8) {
            // yeah, almost sure it's UTF8 already
            $result .= $sequence;
            $pos += $trailing;
        } else {
            // not valid UTF8 - convert only the lead byte
            $result .= $this->toUtf8ConvertHelper($lead);
        }
    }

    // decode unicode escape sequences + unicode surrogate pairs
    $result = preg_replace_callback(
        '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
        /**
         * @param array $matches
         *
         * @return string
         */
        function (array $matches) {
            if (isset($matches[3])) {
                // single "\uXXXX" escape
                $codePoint = (int)hexdec($matches[3]);
            } else {
                // surrogate pair, see http://unicode.org/faq/utf_bom.html#utf16-4
                $codePoint = ((int)hexdec($matches[1]) << 10)
                             + (int)hexdec($matches[2])
                             + 0x10000
                             - (0xD800 << 10)
                             - 0xDC00;
            }

            // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
            //
            // php_utf32_utf8(unsigned char *buf, unsigned k)

            if ($codePoint < 0x80) {
                return (string)$this->chr($codePoint);
            }

            if ($codePoint < 0xA0) {
                /** @noinspection UnnecessaryCastingInspection */
                return (string)$this->chr(0xC0 | ($codePoint >> 6))
                       . (string)$this->chr(0x80 | ($codePoint & 0x3F));
            }

            return $this->decimalToChr($codePoint);
        },
        $result
    );

    // preg_replace_callback() reports an error as null.
    if ($result === null) {
        return '';
    }

    return $result;
}
| 498 | |||
/**
 * Convert one non-UTF-8 byte into a UTF-8 byte sequence.
 *
 * The byte's ordinal is read from the "ord" data set. If the
 * "win1252_to_utf8" data set has an entry for that ordinal, the entry is
 * returned; otherwise a two-byte sequence is assembled from the byte's high
 * and low bits.
 *
 * @param string $input the byte to convert; presumably exactly one byte,
 *                      as it is used as a key into the "ord" table - TODO confirm
 *
 * @return string
 */
private function toUtf8ConvertHelper($input)
{
    // Lazy-load the lookup tables.
    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    if ($this->CHR === null) {
        $this->CHR = $this->getData('chr');
    }

    if ($this->WIN1252_TO_UTF8 === null) {
        $this->WIN1252_TO_UTF8 = $this->getData('win1252_to_utf8');
    }

    $ordC1 = $this->ORD[$input];

    // found in Windows-1252 special cases
    if (isset($this->WIN1252_TO_UTF8[$ordC1])) {
        return (string)$this->WIN1252_TO_UTF8[$ordC1];
    }

    // Use an integer shift instead of "$ordC1 / 64": the division yields a
    // fractional float for most ordinals, and using such a float as an array
    // key is deprecated since PHP 8.1. The shift selects the same table entry.
    $cc1 = $this->CHR[$ordC1 >> 6] | "\xC0";
    $cc2 = ((string)$input & "\x3F") | "\x80";

    return $cc1 . $cc2;
}
| 527 | |||
/**
 * Build the character for a Unicode code point.
 *
 * Code points up to 127 are read from the "chr" data set. Larger ones use
 * \IntlChar::chr() when IntlChar is reported as available, otherwise the
 * UTF-8 bytes are assembled by hand from the same data set. For any target
 * encoding other than "UTF-8" the result is passed through encode().
 * Results are cached per code point and encoding in a function-level static.
 *
 * @param int    $code_point the Unicode code point
 * @param string $encoding   target encoding, normalized unless it is already
 *                           "UTF-8" or "CP850"
 *
 * @return string|null presumably null only if \IntlChar::chr() rejects the
 *                     code point - TODO confirm
 */
private function chr($code_point, $encoding = 'UTF-8')
{
    // init
    static $CHAR_CACHE = [];

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'ISO-8859-1' && $encoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::chr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
    }

    $cacheKey = $code_point . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    if ($code_point <= 127) { // use "simple"-char only until "\x80"
        if ($this->CHR === null) {
            $this->CHR = (array)$this->getData('chr');
        }

        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via "IntlChar"
    //

    if ($this->SUPPORT['intlChar'] === true) {
        // Fully qualified, like \IntlChar::ord() in ord(): an unqualified
        // class name would be resolved relative to the file's namespace.
        /** @noinspection PhpComposerExtensionStubsInspection */
        $chr = \IntlChar::chr($code_point);

        if ($encoding !== 'UTF-8') {
            $chr = $this->encode($encoding, $chr);
        }

        return $CHAR_CACHE[$cacheKey] = $chr;
    }

    //
    // fallback via vanilla php
    //

    if ($this->CHR === null) {
        $this->CHR = (array)$this->getData('chr');
    }

    $code_point = (int)$code_point;
    if ($code_point <= 0x7F) {
        // 1 byte
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[$code_point];
    } elseif ($code_point <= 0x7FF) {
        // 2 bytes: 110xxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 6) + 0xC0] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } elseif ($code_point <= 0xFFFF) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 12) + 0xE0] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        /**
         * @psalm-suppress PossiblyNullArrayAccess
         */
        $chr = $this->CHR[($code_point >> 18) + 0xF0] .
               $this->CHR[(($code_point >> 12) & 0x3F) + 0x80] .
               $this->CHR[(($code_point >> 6) & 0x3F) + 0x80] .
               $this->CHR[($code_point & 0x3F) + 0x80];
    }

    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    return $CHAR_CACHE[$cacheKey] = $chr;
}
| 622 | |||
/**
 * Encode a string into one of the specially handled target formats.
 *
 * Handled targets are "JSON" (via jsonEncode()), "BASE64" (via
 * base64_encode()) and "HTML-ENTITIES" (via htmlEncode(), keeping ASCII).
 * For every other target the input is returned UNCHANGED: this method does
 * not perform a character-set conversion. It only raises an E_USER_WARNING
 * when mbstring is reported as unavailable and the target is not UTF-8,
 * ISO-8859-1 or WINDOWS-1252.
 *
 * @param string $toEncoding target encoding, normalized unless it is already
 *                           "UTF-8" or "CP850"
 * @param string $str        the input string
 *
 * @throws \InvalidArgumentException if jsonEncode() returns false
 *
 * @return string
 */
private function encode($toEncoding, $str)
{
    if ($str === '' || $toEncoding === '') {
        return $str;
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'CP850') {
        $toEncoding = $this->normalize_encoding($toEncoding, 'UTF-8');
    }

    if ($toEncoding === 'JSON') {
        $return = $this->jsonEncode($str);
        if ($return === false) {
            // Fully qualified so the global class is used in any namespace.
            throw new \InvalidArgumentException('The input string [' . $str . '] can not be used for jsonEncode().');
        }

        return $return;
    }

    if ($toEncoding === 'BASE64') {
        return base64_encode($str);
    }

    if ($toEncoding === 'HTML-ENTITIES') {
        return $this->htmlEncode($str, true, 'UTF-8');
    }

    if ($toEncoding !== 'UTF-8' && $toEncoding !== 'ISO-8859-1' && $toEncoding !== 'WINDOWS-1252' && $this->SUPPORT['mbstring'] === false) {
        trigger_error('UTF8::encode() without mbstring cannot handle "' . $toEncoding . '" encoding', \E_USER_WARNING);
    }

    // No source-encoding detection or conversion happens here; the string is
    // handed back as it came in.
    return $str;
}
| 721 | |||
| 722 | private function jsonEncode($value) |
||
| 733 | |||
/**
 * Normalize strings to the given Unicode normalization form, recursing into
 * arrays and objects.
 *
 * Arrays are processed element by element; for objects every property that
 * is iterable from this scope is processed and written back to the object.
 * For non-ASCII strings, line endings are normalized first (when the string
 * contains "\r"), then the string is brought into $normalization_form. If
 * the result starts with a combining mark (\p{Mn}), $leading_combining is
 * prepended. Values of any other type are returned unchanged.
 *
 * @param mixed  $var                the value to filter
 * @param int    $normalization_form a \Normalizer form constant
 * @param string $leading_combining  prefix for strings that start with a
 *                                   combining mark
 *
 * @return mixed
 */
private function filter($var, $normalization_form = \Normalizer::NFC, $leading_combining = '◌')
{
    switch (\gettype($var)) {
        case 'array':
            foreach ($var as $key => $value) {
                $var[$key] = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'object':
            foreach ($var as $key => $value) {
                // Write the result back to the object itself. This used to be
                // assigned to an unrelated, undefined local array, so object
                // properties were never actually filtered.
                $var->{$key} = $this->filter($value, $normalization_form, $leading_combining);
            }

            break;
        case 'string':
            if (strpos($var, "\r") !== false) {
                // Workaround https://bugs.php.net/65732
                $var = $this->normalizeLineEnding($var);
            }

            if ($this->isAscii($var) === false) {
                if (\Normalizer::isNormalized($var, $normalization_form)) {
                    // Non-empty marker: the string is fine as it is, but the
                    // leading-combining check below must still run.
                    $n = '-';
                } else {
                    $n = \Normalizer::normalize($var, $normalization_form);

                    if (isset($n[0])) {
                        $var = $n;
                    } else {
                        // Normalization failed or produced an empty result.
                        $var = $this->encode('UTF-8', $var);
                    }
                }

                if (
                    $var[0] >= "\x80"
                    &&
                    isset($n[0], $leading_combining[0])
                    &&
                    preg_match('/^\p{Mn}/u', $var)
                ) {
                    // Prevent leading combining chars
                    // for NFC-safe concatenations.
                    $var = $leading_combining . $var;
                }
            }

            break;
    }

    return $var;
}
| 789 | |||
| 790 | private function normalizeLineEnding($str) |
||
| 794 | |||
| 795 | private function isAscii($str) |
||
| 803 | |||
/**
 * Encode the characters of a string as numeric HTML entities.
 *
 * With mbstring reported as available, mb_encode_numericentity() does the
 * work; otherwise the string is split with strSplit() and every piece is
 * encoded through singleChrHtmlEncode().
 *
 * @param string $str            the input string
 * @param bool   $keepAsciiChars when true, code points below 0x80 are left
 *                               as they are on the mbstring path (the flag is
 *                               handed to singleChrHtmlEncode() otherwise)
 * @param string $encoding       string encoding, normalized unless it is
 *                               already "UTF-8" or "CP850"
 *
 * @return string
 */
private function htmlEncode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
{
    if ($str === '') {
        return '';
    }

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if ($this->SUPPORT['mbstring'] === true) {
        $startCode = $keepAsciiChars === true ? 0x80 : 0x00;

        // A conversion map is a list of 4-tuples (start, end, offset, mask).
        // The former 5-element map carried a stray trailing 0, which PHP 8
        // rejects with a ValueError ("must have a multiple of 4 elements").
        $numericEntityMap = [$startCode, 0xfffff, 0, 0xfffff];

        if ($encoding === 'UTF-8') {
            // Without an explicit encoding mbstring uses its internal encoding.
            return mb_encode_numericentity($str, $numericEntityMap);
        }

        return mb_encode_numericentity($str, $numericEntityMap, $encoding);
    }

    return implode(
        '',
        \array_map(
            function (string $chr) use ($keepAsciiChars, $encoding) {
                return $this->singleChrHtmlEncode($chr, $keepAsciiChars, $encoding);
            },
            $this->strSplit($str)
        )
    );
}
| 845 | |||
| 846 | private function singleChrHtmlEncode($char, $keepAsciiChars = false, $encoding = 'UTF-8') |
||
| 858 | |||
/**
 * Resolve the code point of a character.
 *
 * Lookup order: per-request static cache, the lazily loaded "ord" data table,
 * IntlChar (when flagged as supported) and finally manual decoding of the
 * leading bytes.
 *
 * @param string $chr      The character; it is cast to string.
 * @param string $encoding Any value other than 'UTF-8' / 'CP850' is passed through
 *                         normalize_encoding(); a non-UTF-8 result makes the character
 *                         go through encode() before the lookup.
 *
 * @return int
 */
private function ord($chr, $encoding = 'UTF-8')
{
    static $CHAR_CACHE = [];

    $chr = (string)$chr;

    if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
        $encoding = $this->normalize_encoding($encoding, 'UTF-8');
    }

    // The key is built from the character as passed in, before any re-encoding.
    $cacheKey = $chr . $encoding;
    if (isset($CHAR_CACHE[$cacheKey]) === true) {
        return $CHAR_CACHE[$cacheKey];
    }

    // check again, if it's still not UTF-8
    if ($encoding !== 'UTF-8') {
        $chr = $this->encode($encoding, $chr);
    }

    if ($this->ORD === null) {
        $this->ORD = $this->getData('ord');
    }

    $codePoint = null;

    if (isset($this->ORD[$chr])) {
        // 1.) precomputed table
        $codePoint = $this->ORD[$chr];
    } elseif ($this->SUPPORT['intlChar'] === true) {
        // 2.) "IntlChar"; a falsy result (null or 0) falls through to step 3
        /** @noinspection PhpComposerExtensionStubsInspection */
        $viaIntl = \IntlChar::ord($chr);
        if ($viaIntl) {
            $codePoint = $viaIntl;
        }
    }

    if ($codePoint === null) {
        // 3.) vanilla php: decode by hand from at most the first four bytes
        $bytes = \unpack('C*', (string)\substr($chr, 0, 4));
        $lead = $bytes ? $bytes[1] : 0;

        if ($lead >= 0xF0 && isset($bytes[4])) {
            /** @noinspection UnnecessaryCastingInspection */
            $codePoint = (int)((($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80);
        } elseif ($lead >= 0xE0 && isset($bytes[3])) {
            /** @noinspection UnnecessaryCastingInspection */
            $codePoint = (int)((($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80);
        } elseif ($lead >= 0xC0 && isset($bytes[2])) {
            /** @noinspection UnnecessaryCastingInspection */
            $codePoint = (int)((($lead - 0xC0) << 6) + $bytes[2] - 0x80);
        } else {
            $codePoint = $lead;
        }
    }

    return $CHAR_CACHE[$cacheKey] = $codePoint;
}
||
| 925 | |||
/**
 * Split a string into an array of characters, or of chunks of $length characters.
 *
 * @param string|array $str                The input; an array is processed element by element
 *                                         (recursively) and returned with the same keys.
 * @param int          $length             Characters per chunk; values <= 0 yield an empty array.
 * @param bool         $cleanUtf8          When exactly true the string is passed through clean() first.
 * @param bool         $tryToUseMbFunction When exactly true and mbstring is flagged as supported,
 *                                         the mbstring-based path is used.
 *
 * @return array
 */
private function strSplit($str, $length = 1, $cleanUtf8 = false, $tryToUseMbFunction = true)
{
    if ($length <= 0) {
        return [];
    }

    if (is_array($str) === true) {
        foreach ($str as $key => $value) {
            $str[$key] = $this->strSplit($value, $length, $cleanUtf8, $tryToUseMbFunction);
        }

        return $str;
    }

    // init
    $str = (string)$str;

    if ($str === '') {
        return [];
    }

    if ($cleanUtf8 === true) {
        $str = $this->clean($str);
    }

    $useMbstring = $tryToUseMbFunction === true && $this->SUPPORT['mbstring'] === true;
    $iMax = $useMbstring ? \mb_strlen($str) : 0;

    if ($useMbstring && $iMax <= 127) {
        // Short strings: one mb_substr() call per character.
        $ret = [];
        for ($i = 0; $i < $iMax; ++$i) {
            $ret[] = \mb_substr($str, $i, 1);
        }
    } elseif ($useMbstring || $this->SUPPORT['pcre_utf8'] === true) {
        // Longer strings on the mbstring path, and the PCRE path, share one regex split.
        $retArray = [];
        preg_match_all('/./us', $str, $retArray);
        $ret = isset($retArray[0]) ? $retArray[0] : [];
    } else {
        // fallback: walk the bytes and group them by UTF-8 lead-byte pattern;
        // bytes that do not form a complete sequence are not emitted
        $ret = [];
        $len = \strlen($str);

        /** @noinspection ForeachInvariantsInspection */
        for ($i = 0; $i < $len; ++$i) {
            if (($str[$i] & "\x80") === "\x00") {
                $ret[] = $str[$i];
            } elseif (
                isset($str[$i + 1])
                &&
                ($str[$i] & "\xE0") === "\xC0"
            ) {
                if (($str[$i + 1] & "\xC0") === "\x80") {
                    $ret[] = $str[$i] . $str[$i + 1];

                    ++$i;
                }
            } elseif (
                isset($str[$i + 2])
                &&
                ($str[$i] & "\xF0") === "\xE0"
            ) {
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2];

                    $i += 2;
                }
            } elseif (
                isset($str[$i + 3])
                &&
                ($str[$i] & "\xF8") === "\xF0"
            ) {
                if (
                    ($str[$i + 1] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 2] & "\xC0") === "\x80"
                    &&
                    ($str[$i + 3] & "\xC0") === "\x80"
                ) {
                    $ret[] = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];

                    $i += 3;
                }
            }
        }
    }

    if ($length > 1) {
        // The callback takes its argument by value: array_map() passes values,
        // and a by-reference parameter triggers a warning on PHP 8.
        return array_map(
            static function ($chunk) {
                return implode('', $chunk);
            },
            \array_chunk($ret, $length)
        );
    }

    if (isset($ret[0]) && $ret[0] === '') {
        return [];
    }

    return $ret;
}
||
| 1039 | |||
/**
 * Strip bytes that are not part of a well-formed-looking UTF-8 sequence and
 * optionally run further normalisation steps.
 *
 * Each optional step only runs when its flag is exactly true. The steps run in a
 * fixed order: diamond-question-mark removal, invisible characters, whitespace,
 * MS Word punctuation, BOM.
 *
 * @param string $str
 * @param bool   $remove_bom
 * @param bool   $normalize_whitespace
 * @param bool   $normalize_msword
 * @param bool   $keep_non_breaking_space      Forwarded to normalize_whitespace().
 * @param bool   $replace_diamond_question_mark
 * @param bool   $remove_invisible_characters
 *
 * @return string
 */
private function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false, $replace_diamond_question_mark = false, $remove_invisible_characters = true)
{
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of accepted sequences and is written back; the two
    // alternatives for stray bytes are replaced by an empty "$1", i.e. dropped.
    $utf8Pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    $cleaned = (string)preg_replace($utf8Pattern, '$1', $str);

    if ($replace_diamond_question_mark === true) {
        $cleaned = $this->replace_diamond_question_mark($cleaned, '');
    }

    if ($remove_invisible_characters === true) {
        $cleaned = $this->remove_invisible_characters($cleaned);
    }

    if ($normalize_whitespace === true) {
        $cleaned = $this->normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
        $cleaned = $this->normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
        $cleaned = $this->remove_bom($cleaned);
    }

    return $cleaned;
}
||
| 1080 | |||
/**
 * Replace the Unicode replacement character U+FFFD ("diamond question mark"),
 * and optionally invalid byte sequences, with a replacement string.
 *
 * @param string $str                The input string; an empty string is returned unchanged.
 * @param string $replacementChar    Replacement text; '' removes the characters.
 * @param bool   $processInvalidUtf8 When exactly true the string is re-converted
 *                                   UTF-8 -> UTF-8 so that invalid sequences are
 *                                   substituted as well.
 *
 * @return string
 */
public function replace_diamond_question_mark($str, $replacementChar = '', $processInvalidUtf8 = true)
{
    if ($str === '') {
        return '';
    }

    if ($processInvalidUtf8 === true) {
        if ($this->SUPPORT['mbstring'] === false) {
            // if there is no native support for "mbstring",
            // then we need to clean the string before ...
            $str = $this->clean($str);
        }

        // mb_substitute_character() only accepts a code point or one of
        // "none" / "long" / "entity" - never an arbitrary replacement string.
        // For a non-empty replacement, invalid sequences are therefore turned into
        // U+FFFD first and mapped to the caller's text by the str_replace() below.
        $substitute = $replacementChar === '' ? 'none' : 0xFFFD;

        $save = \mb_substitute_character();
        try {
            \mb_substitute_character($substitute);
            // the polyfill maybe return false, so cast to string
            $str = (string)\mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        } finally {
            // always restore the process-wide setting, even if the conversion throws
            \mb_substitute_character($save);
        }
    }

    // "\xEF\xBF\xBD" is the UTF-8 encoding of U+FFFD
    return str_replace("\xEF\xBF\xBD", $replacementChar, $str);
}
||
| 1118 | |||
/**
 * Remove control characters from a string, except newline (dec 10),
 * carriage return (dec 13) and horizontal tab (dec 09).
 *
 * @param string $str
 * @param bool   $url_encoded When truthy, the URL-encoded forms (%00-%08, %0B, %0C,
 *                            %0E-%1F) are removed as well.
 * @param string $replacement Text inserted in place of every match.
 *
 * @return string
 */
public function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
{
    $patterns = [];

    if ($url_encoded) {
        $patterns = [
            '/%0[0-8bcefBCEF]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-fA-F]/',   // url encoded 16-31
        ];
    }

    $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // Repeat until a pass changes nothing: removing one match can join the
    // surrounding text into a new match.
    while (true) {
        $str = (string)preg_replace($patterns, $replacement, $str, -1, $hits);
        if ($hits === 0) {
            break;
        }
    }

    return $str;
}
||
| 1139 | |||
/**
 * Replace every entry of the whitespace table with a plain space and, unless
 * told otherwise, strip the bidirectional Unicode control characters.
 *
 * @param string $str                     The input string; an empty string is returned unchanged.
 * @param bool   $keepNonBreakingSpace    Only an exact true keeps the 'NO-BREAK SPACE' entry
 *                                        out of the replacement list.
 * @param bool   $keepBidiUnicodeControls Only an exact false removes the bidi controls.
 *
 * @return string
 */
public function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
{
    static $WHITESPACE_CACHE = [];
    static $BIDI_UNICODE_CONTROLS_CACHE = null;

    if ($str === '') {
        return '';
    }

    // Derive the cache key from the same strict test that decides whether the
    // NBSP entry is dropped. Casting the raw argument instead would let a truthy
    // non-bool (e.g. 1) store the full table under key 1 and hand it to later
    // calls that pass true.
    $keepNbsp = $keepNonBreakingSpace === true;
    $cacheKey = (int)$keepNbsp;

    if (!isset($WHITESPACE_CACHE[$cacheKey])) {
        $table = $this->WHITESPACE_TABLE;

        if ($keepNbsp) {
            unset($table['NO-BREAK SPACE']);
        }

        $WHITESPACE_CACHE[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
        if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
            $BIDI_UNICODE_CONTROLS_CACHE = array_values($this->BIDI_UNI_CODE_CONTROLS_TABLE);
        }

        $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
    }

    return str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
}
||
| 1171 | |||
/**
 * Replace typographic quotes, dashes and the ellipsis (given as UTF-8 byte
 * sequences) with plain ASCII equivalents.
 *
 * @param string $str The input string; an empty string is returned unchanged.
 *
 * @return string
 */
private function normalize_msword($str)
{
    if ($str === '') {
        return '';
    }

    // UTF-8 byte sequence => ASCII replacement
    $asciiFor = [
        "\xc2\xab"     => '"',   // « (U+00AB)
        "\xc2\xbb"     => '"',   // » (U+00BB)
        "\xe2\x80\x98" => "'",   // ‘ (U+2018)
        "\xe2\x80\x99" => "'",   // ’ (U+2019)
        "\xe2\x80\x9a" => "'",   // ‚ (U+201A)
        "\xe2\x80\x9b" => "'",   // ‛ (U+201B)
        "\xe2\x80\x9c" => '"',   // “ (U+201C)
        "\xe2\x80\x9d" => '"',   // ” (U+201D)
        "\xe2\x80\x9e" => '"',   // „ (U+201E)
        "\xe2\x80\x9f" => '"',   // ‟ (U+201F)
        "\xe2\x80\xb9" => "'",   // ‹ (U+2039)
        "\xe2\x80\xba" => "'",   // › (U+203A)
        "\xe2\x80\x93" => '-',   // – (U+2013)
        "\xe2\x80\x94" => '-',   // — (U+2014)
        "\xe2\x80\xa6" => '...', // … (U+2026)
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $str);
}
||
| 1216 | |||
/**
 * Strip every byte-order mark from the BOM table that the string starts with.
 *
 * The table is walked once, in declaration order; after a mark has been removed
 * the remaining entries are tested against the shortened string.
 *
 * @param string $str The input string; an empty string is returned unchanged.
 *
 * @return string
 */
public function remove_bom($str)
{
    if ($str === '') {
        return '';
    }

    $remaining = \strlen($str);

    foreach ($this->BOM as $marker => $markerBytes) {
        if (strpos($str, $marker, 0) !== 0) {
            continue;
        }

        $stripped = \substr($str, $markerBytes, $remaining);
        if ($stripped === false) {
            return '';
        }

        $remaining -= (int)$markerBytes;
        $str = (string)$stripped;
    }

    return $str;
}
||
| 1238 | |||
| 1239 | // private function str_detect_encoding($str) |
||
| 1240 | // { |
||
| 1241 | // // init |
||
| 1242 | // $str = (string)$str; |
||
| 1243 | // |
||
| 1244 | // // |
||
| 1245 | // // 1.) check binary strings (010001001...) like UTF-16 / UTF-32 / PDF / Images / ... |
||
| 1246 | // // |
||
| 1247 | // |
||
| 1248 | // if ($this->is_binary($str, true) === true) { |
||
| 1249 | // $isUtf16 = $this->is_utf16($str, false); |
||
| 1250 | // if ($isUtf16 === 1) { |
||
| 1251 | // return 'UTF-16LE'; |
||
| 1252 | // } |
||
| 1253 | // if ($isUtf16 === 2) { |
||
| 1254 | // return 'UTF-16BE'; |
||
| 1255 | // } |
||
| 1256 | // |
||
| 1257 | // $isUtf32 = $this->is_utf32($str, false); |
||
| 1258 | // if ($isUtf32 === 1) { |
||
| 1259 | // return 'UTF-32LE'; |
||
| 1260 | // } |
||
| 1261 | // if ($isUtf32 === 2) { |
||
| 1262 | // return 'UTF-32BE'; |
||
| 1263 | // } |
||
| 1264 | // |
||
| 1265 | // // is binary but not "UTF-16" or "UTF-32" |
||
| 1266 | // return false; |
||
| 1267 | // } |
||
| 1268 | // |
||
| 1269 | // // |
||
| 1270 | // // 2.) simple check for ASCII chars |
||
| 1271 | // // |
||
| 1272 | // |
||
| 1273 | // if ($this->isAscii($str) === true) { |
||
| 1274 | // return 'ASCII'; |
||
| 1275 | // } |
||
| 1276 | // |
||
| 1277 | // // |
||
| 1278 | // // 3.) simple check for UTF-8 chars |
||
| 1279 | // // |
||
| 1280 | // |
||
| 1281 | // if ($this->isUtf8($str) === true) { |
||
| 1282 | // return 'UTF-8'; |
||
| 1283 | // } |
||
| 1284 | // |
||
| 1285 | // // |
||
| 1286 | // // 4.) check via "mb_detect_encoding()" |
||
| 1287 | // // |
||
| 1288 | // // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "mb_detect_encoding()" |
||
| 1289 | // |
||
| 1290 | // $detectOrder = [ |
||
| 1291 | // 'ISO-8859-1', |
||
| 1292 | // 'ISO-8859-2', |
||
| 1293 | // 'ISO-8859-3', |
||
| 1294 | // 'ISO-8859-4', |
||
| 1295 | // 'ISO-8859-5', |
||
| 1296 | // 'ISO-8859-6', |
||
| 1297 | // 'ISO-8859-7', |
||
| 1298 | // 'ISO-8859-8', |
||
| 1299 | // 'ISO-8859-9', |
||
| 1300 | // 'ISO-8859-10', |
||
| 1301 | // 'ISO-8859-13', |
||
| 1302 | // 'ISO-8859-14', |
||
| 1303 | // 'ISO-8859-15', |
||
| 1304 | // 'ISO-8859-16', |
||
| 1305 | // 'WINDOWS-1251', |
||
| 1306 | // 'WINDOWS-1252', |
||
| 1307 | // 'WINDOWS-1254', |
||
| 1308 | // 'CP932', |
||
| 1309 | // 'CP936', |
||
| 1310 | // 'CP950', |
||
| 1311 | // 'CP866', |
||
| 1312 | // 'CP850', |
||
| 1313 | // 'CP51932', |
||
| 1314 | // 'CP50220', |
||
| 1315 | // 'CP50221', |
||
| 1316 | // 'CP50222', |
||
| 1317 | // 'ISO-2022-JP', |
||
| 1318 | // 'ISO-2022-KR', |
||
| 1319 | // 'JIS', |
||
| 1320 | // 'JIS-ms', |
||
| 1321 | // 'EUC-CN', |
||
| 1322 | // 'EUC-JP', |
||
| 1323 | // ]; |
||
| 1324 | // |
||
| 1325 | // if ($this->SUPPORT['mbstring'] === true) { |
||
| 1326 | // // info: do not use the symfony polyfill here |
||
| 1327 | // $encoding = \mb_detect_encoding($str, $detectOrder, true); |
||
| 1328 | // if ($encoding) { |
||
| 1329 | // return $encoding; |
||
| 1330 | // } |
||
| 1331 | // } |
||
| 1332 | // |
||
| 1333 | // // |
||
| 1334 | // // 5.) check via "iconv()" |
||
| 1335 | // // |
||
| 1336 | // |
||
| 1337 | // if ($this->ENCODINGS === null) { |
||
| 1338 | // $this->ENCODINGS = $this->getData('encodings'); |
||
| 1339 | // } |
||
| 1340 | // |
||
| 1341 | // foreach ($this->ENCODINGS as $encodingTmp) { |
||
| 1342 | // // INFO: //IGNORE but still throw notice |
||
| 1343 | // /** @noinspection PhpUsageOfSilenceOperatorInspection */ |
||
| 1344 | // if ((string)@\iconv($encodingTmp, $encodingTmp . '//IGNORE', $str) === $str) { |
||
| 1345 | // return $encodingTmp; |
||
| 1346 | // } |
||
| 1347 | // } |
||
| 1348 | // |
||
| 1349 | // return false; |
||
| 1350 | // } |
||
| 1351 | |||
| 1352 | private function decimalToChr($int) |
||
| 1356 | // |
||
| 1357 | // private function is_utf16($str, $checkIfStringIsBinary = true) |
||
| 1358 | // { |
||
| 1359 | // |
||
| 1360 | // // init |
||
| 1361 | // $str = (string)$str; |
||
| 1362 | // $strChars = []; |
||
| 1363 | // |
||
| 1364 | // if ( |
||
| 1365 | // $checkIfStringIsBinary === true |
||
| 1366 | // && |
||
| 1367 | // $this->is_binary($str, true) === false |
||
| 1368 | // ) { |
||
| 1369 | // return false; |
||
| 1370 | // } |
||
| 1371 | // |
||
| 1372 | // if ($this->SUPPORT['mbstring'] === false) { |
||
| 1373 | // \trigger_error('UTF8::is_utf16() without mbstring may did not work correctly', \E_USER_WARNING); |
||
| 1374 | // } |
||
| 1375 | // |
||
| 1376 | // $str = $this->remove_bom($str); |
||
| 1377 | // |
||
| 1378 | // |
||
| 1379 | // $maybeUTF16LE = 0; |
||
| 1380 | // $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE'); |
||
| 1381 | // if ($test) { |
||
| 1382 | // $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8'); |
||
| 1383 | // $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE'); |
||
| 1384 | // if ($test3 === $test) { |
||
| 1385 | // if (\count($strChars) === 0) { |
||
| 1386 | // $strChars = $this->count_chars($str, true, false); |
||
| 1387 | // } |
||
| 1388 | // $countChars = $this->count_chars($test3); |
||
| 1389 | // foreach ($countChars as $test3char => $test3charEmpty) { |
||
| 1390 | // if (\in_array($test3char, $strChars, true) === true) { |
||
| 1391 | // ++$maybeUTF16LE; |
||
| 1392 | // } |
||
| 1393 | // unset($countChars[$test3char]); |
||
| 1394 | // } |
||
| 1395 | // } |
||
| 1396 | // } |
||
| 1397 | // |
||
| 1398 | // $maybeUTF16BE = 0; |
||
| 1399 | // $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE'); |
||
| 1400 | // if ($test) { |
||
| 1401 | // $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8'); |
||
| 1402 | // $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE'); |
||
| 1403 | // if ($test3 === $test) { |
||
| 1404 | // if (\count($strChars) === 0) { |
||
| 1405 | // $strChars = $this->count_chars($str, true, false); |
||
| 1406 | // } |
||
| 1407 | // $countChars = $this->count_chars($test3); |
||
| 1408 | // foreach ($countChars as $test3char => $test3charEmpty) { |
||
| 1409 | // if (\in_array($test3char, $strChars, true) === true) { |
||
| 1410 | // ++$maybeUTF16BE; |
||
| 1411 | // } |
||
| 1412 | // unset($countChars[$test3char]); |
||
| 1413 | // } |
||
| 1414 | // |
||
| 1415 | // } |
||
| 1416 | // } |
||
| 1417 | // |
||
| 1418 | // if ($maybeUTF16BE !== $maybeUTF16LE) { |
||
| 1419 | // if ($maybeUTF16LE > $maybeUTF16BE) { |
||
| 1420 | // return 1; |
||
| 1421 | // } |
||
| 1422 | // |
||
| 1423 | // return 2; |
||
| 1424 | // } |
||
| 1425 | // |
||
| 1426 | // return false; |
||
| 1427 | // } |
||
| 1428 | |||
/**
 * Check if the string is UTF-32.
 *
 * Both byte orders are scored: the input is converted to UTF-8, round-tripped,
 * and the characters of the result are compared against the character counts of
 * the input. The byte order with the higher score wins; a tie means "not UTF-32".
 *
 * @param mixed $str                   <p>The input string.</p>
 * @param bool  $checkIfStringIsBinary <p>When exactly true, input that is_binary()
 *                                     (strict mode) rejects is not UTF-32.</p>
 *
 * @return false|int
 *                   <strong>false</strong> if it's not UTF-32,<br>
 *                   <strong>1</strong> for UTF-32LE,<br>
 *                   <strong>2</strong> for UTF-32BE
 */
private function is_utf32($str, $checkIfStringIsBinary = true)
{
    // init
    $str = (string)$str;

    if ($checkIfStringIsBinary === true && $this->is_binary($str, true) === false) {
        return false;
    }

    if ($this->SUPPORT['mbstring'] === false) {
        \trigger_error('UTF8::is_utf32() without mbstring may did not work correctly', \E_USER_WARNING);
    }

    $str = $this->remove_bom($str);

    // Character statistics of the input; computed at most once, on first use.
    $strChars = [];

    // Score one byte order; 0 when the conversion fails or does not round-trip.
    $score = function ($byteOrder) use ($str, &$strChars) {
        $hits = 0;

        $decoded = \mb_convert_encoding($str, 'UTF-8', $byteOrder);
        if (!$decoded) {
            return $hits;
        }

        $reEncoded = \mb_convert_encoding($decoded, $byteOrder, 'UTF-8');
        $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $byteOrder);
        if ($roundTrip !== $decoded) {
            return $hits;
        }

        if (\count($strChars) === 0) {
            $strChars = $this->count_chars($str, true, false);
        }

        foreach ($this->count_chars($roundTrip) as $char => $unused) {
            if (\in_array($char, $strChars, true) === true) {
                ++$hits;
            }
        }

        return $hits;
    };

    $littleEndian = $score('UTF-32LE');
    $bigEndian = $score('UTF-32BE');

    if ($littleEndian === $bigEndian) {
        return false;
    }

    return $littleEndian > $bigEndian ? 1 : 2;
}
||
| 1504 | |||
/**
 * Heuristically decide whether the input is binary data.
 *
 * Checks, in order: a string made up only of the digits 0 and 1, a known binary
 * file signature, a NUL-byte share above 25 % and - in strict mode only - the
 * encoding reported by ext-fileinfo.
 *
 * @param mixed $input  Cast to string; an empty string is never binary.
 * @param bool  $strict When exactly true, ext-fileinfo is consulted as a last step.
 *
 * @throws \RuntimeException In strict mode, when the earlier checks did not match
 *                           and ext-fileinfo is flagged as unavailable.
 *
 * @return bool
 */
private function is_binary($input, $strict = false)
{
    $input = (string)$input;
    if ($input === '') {
        return false;
    }

    if (preg_match('~^[01]+$~', $input)) {
        return true;
    }

    $fileType = $this->get_file_type($input);
    if ($fileType['type'] === 'binary') {
        return true;
    }

    $byteCount = \strlen($input);
    $nulShare = \substr_count($input, "\x0", 0, $byteCount) / $byteCount;
    if ($nulShare > 0.25) {
        return true;
    }

    if ($strict !== true) {
        return false;
    }

    if ($this->SUPPORT['finfo'] === false) {
        throw new \RuntimeException('ext-fileinfo: is not installed');
    }

    /** @noinspection PhpComposerExtensionStubsInspection */
    $detected = (new \finfo(\FILEINFO_MIME_ENCODING))->buffer($input);

    return $detected === 'binary';
}
||
| 1541 | |||
/**
 * Guess a file type from the first two bytes of a string.
 *
 * The decimal values of the two bytes are concatenated (e.g. "%P" => 37 . 80 =>
 * 3780) and looked up in a fixed signature table.
 *
 * @param string $str      The raw content.
 * @param array  $fallback Returned for empty or too short input, when unpacking
 *                         fails, or when the signature is unknown.
 *
 * @return array With the keys 'ext', 'mime' and 'type'; 'type' is 'binary' for
 *               every known signature.
 */
private function get_file_type(
    $str,
    $fallback = [
        'ext' => null,
        'mime' => 'application/octet-stream',
        'type' => null,
    ]
) {
    if ($str === '') {
        return $fallback;
    }

    $head = \substr($str, 0, 2);
    if ($head === false || \strlen($head) !== 2) {
        return $fallback;
    }

    $bytes = \unpack('C2chars', $head);
    if ($bytes === false) {
        return $fallback;
    }

    $signature = (int)($bytes['chars1'] . $bytes['chars2']);

    // signature => [extension, mime type]
    $known = [
        3780   => ['pdf', 'application/pdf'],
        7790   => ['exe', 'application/octet-stream'],
        7784   => ['midi', 'audio/x-midi'],
        8075   => ['zip', 'application/zip'],
        8297   => ['rar', 'application/rar'],
        255216 => ['jpg', 'image/jpeg'],
        7173   => ['gif', 'image/gif'],
        6677   => ['bmp', 'image/bmp'],
        13780  => ['png', 'image/png'],
    ];

    if (!isset($known[$signature])) {
        return $fallback;
    }

    list($ext, $mime) = $known[$signature];

    return [
        'ext' => $ext,
        'mime' => $mime,
        'type' => 'binary',
    ];
}
||
| 1630 | |||
| 1631 | private function count_chars($str, $cleanUtf8 = false, $tryToUseMbFunction = true) |
||
| 1635 | |||
| 1636 | } |
||
| 1637 |
This check marks property names that have not been written in camelCase.
In camelCase, names are written without any punctuation, with the start of each new word marked by a capital letter. Thus the name "database connection string" becomes
databaseConnectionString.