| Total Complexity | 140 |
| Total Lines | 1410 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like ASCII often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use ASCII, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 10 | final class ASCII |
||
| 11 | { |
||
| 12 | // |
||
| 13 | // INFO: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes |
||
| 14 | // |
||
| 15 | |||
| 16 | const UZBEK_LANGUAGE_CODE = 'uz'; |
||
| 17 | |||
| 18 | const TURKMEN_LANGUAGE_CODE = 'tk'; |
||
| 19 | |||
| 20 | const THAI_LANGUAGE_CODE = 'th'; |
||
| 21 | |||
| 22 | const PASHTO_LANGUAGE_CODE = 'ps'; |
||
| 23 | |||
| 24 | const ORIYA_LANGUAGE_CODE = 'or'; |
||
| 25 | |||
| 26 | const MONGOLIAN_LANGUAGE_CODE = 'mn'; |
||
| 27 | |||
| 28 | const KOREAN_LANGUAGE_CODE = 'ko'; |
||
| 29 | |||
| 30 | const KIRGHIZ_LANGUAGE_CODE = 'ky'; |
||
| 31 | |||
| 32 | const ARMENIAN_LANGUAGE_CODE = 'hy'; |
||
| 33 | |||
| 34 | const BENGALI_LANGUAGE_CODE = 'bn'; |
||
| 35 | |||
| 36 | const BELARUSIAN_LANGUAGE_CODE = 'be'; |
||
| 37 | |||
| 38 | const AMHARIC_LANGUAGE_CODE = 'am'; |
||
| 39 | |||
| 40 | const JAPANESE_LANGUAGE_CODE = 'ja'; |
||
| 41 | |||
| 42 | const CHINESE_LANGUAGE_CODE = 'zh'; |
||
| 43 | |||
| 44 | const DUTCH_LANGUAGE_CODE = 'nl'; |
||
| 45 | |||
| 46 | const ITALIAN_LANGUAGE_CODE = 'it'; |
||
| 47 | |||
| 48 | const MACEDONIAN_LANGUAGE_CODE = 'mk'; |
||
| 49 | |||
| 50 | const PORTUGUESE_LANGUAGE_CODE = 'pt'; |
||
| 51 | |||
| 52 | const GREEKLISH_LANGUAGE_CODE = 'el__greeklish'; |
||
| 53 | |||
| 54 | const GREEK_LANGUAGE_CODE = 'el'; |
||
| 55 | |||
| 56 | const HINDI_LANGUAGE_CODE = 'hi'; |
||
| 57 | |||
| 58 | const SWEDISH_LANGUAGE_CODE = 'sv'; |
||
| 59 | |||
| 60 | const TURKISH_LANGUAGE_CODE = 'tr'; |
||
| 61 | |||
| 62 | const BULGARIAN_LANGUAGE_CODE = 'bg'; |
||
| 63 | |||
| 64 | const HUNGARIAN_LANGUAGE_CODE = 'hu'; |
||
| 65 | |||
| 66 | const MYANMAR_LANGUAGE_CODE = 'my'; |
||
| 67 | |||
| 68 | const CROATIAN_LANGUAGE_CODE = 'hr'; |
||
| 69 | |||
| 70 | const FINNISH_LANGUAGE_CODE = 'fi'; |
||
| 71 | |||
| 72 | const GEORGIAN_LANGUAGE_CODE = 'ka'; |
||
| 73 | |||
| 74 | const RUSSIAN_LANGUAGE_CODE = 'ru'; |
||
| 75 | |||
| 76 | const RUSSIAN_PASSPORT_2013_LANGUAGE_CODE = 'ru__passport_2013'; |
||
| 77 | |||
| 78 | const RUSSIAN_GOST_2000_B_LANGUAGE_CODE = 'ru__gost_2000_b'; |
||
| 79 | |||
| 80 | const UKRAINIAN_LANGUAGE_CODE = 'uk'; |
||
| 81 | |||
| 82 | const KAZAKH_LANGUAGE_CODE = 'kk'; |
||
| 83 | |||
| 84 | const CZECH_LANGUAGE_CODE = 'cs'; |
||
| 85 | |||
| 86 | const DANISH_LANGUAGE_CODE = 'da'; |
||
| 87 | |||
| 88 | const POLISH_LANGUAGE_CODE = 'pl'; |
||
| 89 | |||
| 90 | const ROMANIAN_LANGUAGE_CODE = 'ro'; |
||
| 91 | |||
| 92 | const ESPERANTO_LANGUAGE_CODE = 'eo'; |
||
| 93 | |||
| 94 | const ESTONIAN_LANGUAGE_CODE = 'et'; |
||
| 95 | |||
| 96 | const LATVIAN_LANGUAGE_CODE = 'lv'; |
||
| 97 | |||
| 98 | const LITHUANIAN_LANGUAGE_CODE = 'lt'; |
||
| 99 | |||
| 100 | const NORWEGIAN_LANGUAGE_CODE = 'no'; |
||
| 101 | |||
| 102 | const VIETNAMESE_LANGUAGE_CODE = 'vi'; |
||
| 103 | |||
| 104 | const ARABIC_LANGUAGE_CODE = 'ar'; |
||
| 105 | |||
| 106 | const PERSIAN_LANGUAGE_CODE = 'fa'; |
||
| 107 | |||
| 108 | const SERBIAN_LANGUAGE_CODE = 'sr'; |
||
| 109 | |||
| 110 | const SERBIAN_CYRILLIC_LANGUAGE_CODE = 'sr__cyr'; |
||
| 111 | |||
| 112 | const SERBIAN_LATIN_LANGUAGE_CODE = 'sr__lat'; |
||
| 113 | |||
| 114 | const AZERBAIJANI_LANGUAGE_CODE = 'az'; |
||
| 115 | |||
| 116 | const SLOVAK_LANGUAGE_CODE = 'sk'; |
||
| 117 | |||
| 118 | const FRENCH_LANGUAGE_CODE = 'fr'; |
||
| 119 | |||
| 120 | const FRENCH_AUSTRIAN_LANGUAGE_CODE = 'fr_at'; |
||
| 121 | |||
| 122 | const FRENCH_SWITZERLAND_LANGUAGE_CODE = 'fr_ch'; |
||
| 123 | |||
| 124 | const GERMAN_LANGUAGE_CODE = 'de'; |
||
| 125 | |||
| 126 | const GERMAN_AUSTRIAN_LANGUAGE_CODE = 'de_at'; |
||
| 127 | |||
| 128 | const GERMAN_SWITZERLAND_LANGUAGE_CODE = 'de_ch'; |
||
| 129 | |||
| 130 | const ENGLISH_LANGUAGE_CODE = 'en'; |
||
| 131 | |||
| 132 | const EXTRA_LATIN_CHARS_LANGUAGE_CODE = 'latin'; |
||
| 133 | |||
| 134 | const EXTRA_WHITESPACE_CHARS_LANGUAGE_CODE = ' '; |
||
| 135 | |||
| 136 | const EXTRA_MSWORD_CHARS_LANGUAGE_CODE = 'msword'; |
||
| 137 | |||
| 138 | /** |
||
| 139 | * @var array<string, array<string, string>>|null |
||
| 140 | */ |
||
| 141 | private static $ASCII_MAPS; |
||
| 142 | |||
| 143 | /** |
||
| 144 | * @var array<string, array<string, string>>|null |
||
| 145 | */ |
||
| 146 | private static $ASCII_MAPS_AND_EXTRAS; |
||
| 147 | |||
| 148 | /** |
||
| 149 | * @var array<string, array<string, string>>|null |
||
| 150 | */ |
||
| 151 | private static $ASCII_EXTRAS; |
||
| 152 | |||
| 153 | /** |
||
| 154 | * @var array<string, int>|null |
||
| 155 | */ |
||
| 156 | private static $ORD; |
||
| 157 | |||
| 158 | /** |
||
| 159 | * @var array<string, int>|null |
||
| 160 | */ |
||
| 161 | private static $LANGUAGE_MAX_KEY; |
||
| 162 | |||
| 163 | /** |
||
| 164 | * url: https://en.wikipedia.org/wiki/Wikipedia:ASCII#ASCII_printable_characters |
||
| 165 | * |
||
| 166 | * @var string |
||
| 167 | */ |
||
| 168 | private static $REGEX_ASCII = "[^\x09\x10\x13\x0A\x0D\x20-\x7E]"; |
||
| 169 | |||
| 170 | /** |
||
| 171 | * bidirectional text chars |
||
| 172 | * |
||
| 173 | * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls |
||
| 174 | * |
||
| 175 | * @var array<int, string> |
||
| 176 | */ |
||
| 177 | private static $BIDI_UNI_CODE_CONTROLS_TABLE = [ |
||
| 178 | // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr") |
||
| 179 | 8234 => "\xE2\x80\xAA", |
||
| 180 | // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl") |
||
| 181 | 8235 => "\xE2\x80\xAB", |
||
| 182 | // POP DIRECTIONAL FORMATTING // (use -> </bdo>) |
||
| 183 | 8236 => "\xE2\x80\xAC", |
||
| 184 | // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">) |
||
| 185 | 8237 => "\xE2\x80\xAD", |
||
| 186 | // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">) |
||
| 187 | 8238 => "\xE2\x80\xAE", |
||
| 188 | // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr") |
||
| 189 | 8294 => "\xE2\x81\xA6", |
||
| 190 | // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl") |
||
| 191 | 8295 => "\xE2\x81\xA7", |
||
| 192 | // FIRST STRONG ISOLATE // (use -> dir = "auto") |
||
| 193 | 8296 => "\xE2\x81\xA8", |
||
| 194 | // POP DIRECTIONAL ISOLATE |
||
| 195 | 8297 => "\xE2\x81\xA9", |
||
| 196 | ]; |
||
| 197 | |||
/**
 * Collect every class constant of this class into a "name => value" list.
 *
 * Constant names are lower-cased. The "_LANGUAGE_CODE" part is stripped from the
 * name, except for constants containing "EXTRA", whose full name is kept.
 * The list is built once via reflection and then served from a static cache.
 *
 * @return string[]
 *
 * @psalm-return array<string, string>
 */
public static function getAllLanguages(): array
{
    static $cache = [];

    if ($cache === []) {
        $constants = (new \ReflectionClass(__CLASS__))->getConstants();

        foreach ($constants as $name => $code) {
            // "EXTRA_*" constants keep their complete name as the key
            $key = \strpos($name, 'EXTRA') !== false
                ? $name
                : \str_replace('_LANGUAGE_CODE', '', $name);

            $cache[\strtolower($key)] = $code;
        }
    }

    return $cache;
}
||
| 224 | |||
/**
 * Returns a replacement array for ASCII methods, grouped by language.
 *
 * EXAMPLE: <code>
 * $array = ASCII::charsArray();
 * var_dump($array['ru']['б']); // 'b'
 * </code>
 *
 * @psalm-suppress InvalidNullableReturnType - the prepare* methods are called first, so NULL is not expected here
 *
 * @param bool $replace_extra_symbols [optional] <p>TRUE returns the maps that also contain the extra
 *                                    replacements, e.g. "£" with " pound ".</p>
 *
 * @psalm-pure
 *
 * @return array
 *               <p>The selected static map, or an empty array if it is still NULL.</p>
 *
 * @psalm-return array<string, array<string , string>>
 */
public static function charsArray(bool $replace_extra_symbols = false): array
{
    if ($replace_extra_symbols) {
        self::prepareAsciiAndExtrasMaps();
        $maps = self::$ASCII_MAPS_AND_EXTRAS;
    } else {
        self::prepareAsciiMaps();
        $maps = self::$ASCII_MAPS;
    }

    return $maps ?? [];
}
||
| 255 | |||
/**
 * Returns a replacement array for ASCII methods with a mix of multiple languages,
 * keyed by the ASCII replacement and listing every character that maps to it.
 *
 * EXAMPLE: <code>
 * $array = ASCII::charsArrayWithMultiLanguageValues();
 * var_dump($array['b']); // ['β', 'б', 'ဗ', 'ბ', 'ب']
 * </code>
 *
 * @param bool $replace_extra_symbols [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
 *
 * @psalm-pure
 *
 * @return array
 *               <p>An array of replacements; the result is cached per $replace_extra_symbols value.</p>
 *
 * @psalm-return array<string, array<int, string>>
 */
public static function charsArrayWithMultiLanguageValues(bool $replace_extra_symbols = false): array
{
    /**
     * @var array<string, array<string, array<int, string>>>
     */
    static $cache = [];
    $cacheKey = '' . $replace_extra_symbols;

    if (isset($cache[$cacheKey])) {
        return $cache[$cacheKey];
    }

    /** @var array<string, string> $flatMap */
    $flatMap = self::charsArrayWithSingleLanguageValues(
        $replace_extra_symbols,
        false
    );

    // invert "char => ascii" into "ascii => [char, char, ...]"
    /** @var array<string, array<int, string>> $grouped */
    $grouped = [];
    foreach ($flatMap as $char => $ascii) {
        $grouped[$ascii][] = $char;
    }

    $cache[$cacheKey] = $grouped;

    return $grouped;
}
||
| 309 | |||
/**
 * Returns a replacement array for ASCII methods with one language.
 *
 * For example, German will map 'ä' to 'ae', while other languages
 * will simply return e.g. 'a'.
 *
 * EXAMPLE: <code>
 * $array = ASCII::charsArrayWithOneLanguage('ru');
 * $tmpKey = \array_search('yo', $array['replace']);
 * echo $array['orig'][$tmpKey]; // 'ё'
 * </code>
 *
 * @psalm-suppress InvalidNullableReturnType - the prepare* methods are called first, so NULL is not expected here
 *
 * @param string $language              [optional] <p>Language of the source string e.g.: en, de_at, or de-ch.
 *                                      (default is 'en') | ASCII::*_LANGUAGE_CODE. The value is passed
 *                                      through self::get_language() before the lookup.</p>
 * @param bool   $replace_extra_symbols [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
 * @param bool   $asOrigReplaceArray    [optional] <p>TRUE === return {orig: string[], replace: string[]}
 *                                      array, FALSE === return the plain "char => replacement" map.</p>
 *
 * @psalm-pure
 *
 * @return array
 *               <p>An array of replacements; empty (in the requested shape) if the language has no map.</p>
 *
 * @psalm-return array{orig: string[], replace: string[]}|array<string, string>
 */
public static function charsArrayWithOneLanguage(
    string $language = self::ENGLISH_LANGUAGE_CODE,
    bool $replace_extra_symbols = false,
    bool $asOrigReplaceArray = true
): array {
    /**
     * @var array<string, array>
     */
    static $cache = [];

    $language = self::get_language($language);
    $cacheKey = '' . $replace_extra_symbols . '-' . $asOrigReplaceArray;

    if (!isset($cache[$cacheKey][$language])) {
        // pick the map for this language from the requested source; a missing language yields an empty map
        if ($replace_extra_symbols) {
            self::prepareAsciiAndExtrasMaps();
            $map = self::$ASCII_MAPS_AND_EXTRAS[$language] ?? [];
        } else {
            self::prepareAsciiMaps();
            $map = self::$ASCII_MAPS[$language] ?? [];
        }

        if ($asOrigReplaceArray) {
            $cache[$cacheKey][$language] = [
                'orig'    => \array_keys($map),
                'replace' => \array_values($map),
            ];
        } else {
            $cache[$cacheKey][$language] = $map;
        }
    }

    return $cache[$cacheKey][$language];
}
||
| 412 | |||
/**
 * Returns a replacement array for ASCII methods with multiple languages,
 * flattened into a single map.
 *
 * All per-language maps are merged in their stored order, so for a character that
 * occurs in several maps the last map wins.
 *
 * EXAMPLE: <code>
 * $array = ASCII::charsArrayWithSingleLanguageValues();
 * $tmpKey = \array_search('hnaik', $array['replace']);
 * echo $array['orig'][$tmpKey]; // '၌'
 * </code>
 *
 * @param bool $replace_extra_symbols [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
 * @param bool $asOrigReplaceArray    [optional] <p>TRUE === return {orig: string[], replace: string[]}
 *                                    array, FALSE === return the plain "char => replacement" map.</p>
 *
 * @psalm-pure
 *
 * @return array
 *               <p>An array of replacements; empty (in the requested shape) if no maps are loaded.</p>
 *
 * @psalm-return array{orig: string[], replace: string[]}|array<string, string>
 */
public static function charsArrayWithSingleLanguageValues(
    bool $replace_extra_symbols = false,
    bool $asOrigReplaceArray = true
): array {
    /**
     * @var array<string,array>
     */
    static $CHARS_ARRAY = [];
    $cacheKey = '' . $replace_extra_symbols . '-' . $asOrigReplaceArray;

    if (isset($CHARS_ARRAY[$cacheKey])) {
        return $CHARS_ARRAY[$cacheKey];
    }

    if ($replace_extra_symbols) {
        self::prepareAsciiAndExtrasMaps();
        $languageMaps = self::$ASCII_MAPS_AND_EXTRAS ?? [];
    } else {
        self::prepareAsciiMaps();
        $languageMaps = self::$ASCII_MAPS ?? [];
    }

    // array_values() drops the language keys: argument unpacking needs a list,
    // and an empty list simply merges to an empty array instead of failing.
    $merged = \array_merge([], ...\array_values($languageMaps));

    if ($asOrigReplaceArray) {
        $merged = [
            'orig'    => \array_keys($merged),
            'replace' => \array_values($merged),
        ];
    }

    $CHARS_ARRAY[$cacheKey] = $merged;

    return $merged;
}
||
| 477 | |||
/**
 * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
 *
 * Runs of byte sequences that look like UTF-8 (1 to 4 bytes) are kept, stray
 * continuation or lead bytes are dropped. Afterwards the optional normalization
 * steps run in this order: whitespace, MS Word chars, invisible characters.
 *
 * @param string $str                         <p>The string to be sanitized.</p>
 * @param bool   $normalize_whitespace        [optional] <p>Set to true, if you need to normalize the
 *                                            whitespace.</p>
 * @param bool   $keep_non_breaking_space     [optional] <p>Set to true, to keep non-breaking-spaces, in
 *                                            combination with $normalize_whitespace.</p>
 * @param bool   $normalize_msword            [optional] <p>Set to true, if you need to normalize MS Word
 *                                            chars e.g.: "…" => "..."</p>
 * @param bool   $remove_invisible_characters [optional] <p>Set to false, if you do not want to remove
 *                                            invisible characters e.g.: "\0"</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>A clean UTF-8 string.</p>
 */
public static function clean(
    string $str,
    bool $normalize_whitespace = true,
    bool $keep_non_breaking_space = false,
    bool $normalize_msword = true,
    bool $remove_invisible_characters = true
): string {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    $utf8Pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // only group 1 (the valid run) is written back, so stray bytes vanish
    $result = (string) \preg_replace($utf8Pattern, '$1', $str);

    if ($normalize_whitespace) {
        $result = self::normalize_whitespace($result, $keep_non_breaking_space);
    }

    if ($normalize_msword) {
        $result = self::normalize_msword($result);
    }

    if ($remove_invisible_characters) {
        $result = self::remove_invisible_characters($result);
    }

    return $result;
}
||
| 535 | |||
/**
 * Checks if a string is 7 bit ASCII.
 *
 * The string counts as ASCII when no byte matches the negated character class
 * stored in self::$REGEX_ASCII. An empty string is always ASCII.
 *
 * EXAMPLE: <code>
 * ASCII::is_ascii('白'); // false
 * </code>
 *
 * @param string $str <p>The string to check.</p>
 *
 * @psalm-pure
 *
 * @return bool
 *              <p>
 *              <strong>true</strong> if it is ASCII<br>
 *              <strong>false</strong> otherwise
 *              </p>
 */
public static function is_ascii(string $str): bool
{
    return $str === ''
           ||
           !\preg_match('/' . self::$REGEX_ASCII . '/', $str);
}
||
| 561 | |||
/**
 * Returns a string with smart quotes, ellipsis characters, and dashes from
 * Windows-1252 (commonly used in Word documents) replaced by their ASCII
 * equivalents.
 *
 * The search/replace lists are taken from the "msword" map and kept in a static
 * cache for later calls.
 *
 * EXAMPLE: <code>
 * ASCII::normalize_msword('„Abcdef…”'); // '"Abcdef..."'
 * </code>
 *
 * @param string $str <p>The string to be normalized.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>A string with normalized characters for commonly used chars in Word documents.</p>
 */
public static function normalize_msword(string $str): string
{
    if ($str === '') {
        return '';
    }

    /**
     * @var array{orig: string[], replace: string[]}
     */
    static $lookup = ['orig' => [], 'replace' => []];

    // an empty "orig" list means the lookup was not built yet
    if ($lookup['orig'] === []) {
        self::prepareAsciiMaps();

        /**
         * @psalm-suppress PossiblyNullArrayAccess - prepareAsciiMaps() is called first, so NULL is not expected here
         *
         * @var array<string, string>
         */
        $mswordMap = self::$ASCII_MAPS[self::EXTRA_MSWORD_CHARS_LANGUAGE_CODE] ?? [];

        $lookup = [
            'orig'    => \array_keys($mswordMap),
            'replace' => \array_values($mswordMap),
        ];
    }

    return \str_replace($lookup['orig'], $lookup['replace'], $str);
}
||
| 607 | |||
/**
 * Normalize the whitespace.
 *
 * Every character listed in the whitespace map is replaced by a plain space;
 * unless requested otherwise, the bidirectional control characters are removed first.
 *
 * EXAMPLE: <code>
 * ASCII::normalize_whitespace("abc-\xc2\xa0-öäü-\xe2\x80\xaf-\xE2\x80\xAC", true); // "abc-\xc2\xa0-öäü- -"
 * </code>
 *
 * @param string $str                     <p>The string to be normalized.</p>
 * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
 * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
 *                                        bidirectional text chars.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>A string with normalized whitespace.</p>
 */
public static function normalize_whitespace(
    string $str,
    bool $keepNonBreakingSpace = false,
    bool $keepBidiUnicodeControls = false
): string {
    if ($str === '') {
        return '';
    }

    /**
     * Search lists, one per $keepNonBreakingSpace variant (index 0 or 1).
     *
     * @var array<int,array<int,string>>
     */
    static $searchLists = [];
    $variant = (int) $keepNonBreakingSpace;

    if (!isset($searchLists[$variant])) {
        self::prepareAsciiMaps();

        $whitespaceMap = self::$ASCII_MAPS[self::EXTRA_WHITESPACE_CHARS_LANGUAGE_CODE] ?? [];

        if ($keepNonBreakingSpace) {
            // U+00A0 must survive, so it is taken out of the search list
            unset($whitespaceMap["\xc2\xa0"]);
        }

        $searchLists[$variant] = \array_keys($whitespaceMap);
    }

    if (!$keepBidiUnicodeControls) {
        $str = \str_replace(self::$BIDI_UNI_CODE_CONTROLS_TABLE, '', $str);
    }

    return \str_replace($searchLists[$variant], ' ', $str);
}
||
| 668 | |||
/**
 * Remove invisible characters from a string.
 *
 * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
 *
 * Removed are all control characters (0-31 and 127) except horizontal tab (9),
 * newline (10) and carriage return (13). With $url_encoded the percent-encoded
 * forms of the same characters (e.g. "%00", "%1F", "%7F") are removed as well.
 * The replacement is repeated until nothing is left to remove, so that sequences
 * which only appear after a removal (e.g. "%0%000") are caught too.
 *
 * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
 *
 * @param string $str         <p>The input string.</p>
 * @param bool   $url_encoded [optional] <p>Set to true, to also remove url encoded control characters.</p>
 * @param string $replacement [optional] <p>The string inserted in place of each removed sequence.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>The string without the invisible characters.</p>
 */
public static function remove_invisible_characters(
    string $str,
    bool $url_encoded = false,
    string $replacement = ''
): string {
    // init
    $non_displayables = [];

    // every control character except:
    // - newline (dec 10),
    // - carriage return (dec 13),
    // - horizontal tab (dec 09)
    if ($url_encoded) {
        $non_displayables[] = '/%0[0-8bcefBCEF]/'; // url encoded 00-08, 11, 12, 14, 15
        $non_displayables[] = '/%1[0-9a-fA-F]/'; // url encoded 16-31
        $non_displayables[] = '/%7[fF]/'; // url encoded 127, same as the raw "\x7F" below
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    do {
        $previous = $str;
        $str = (string) \preg_replace($non_displayables, $replacement, $str, -1, $count);

        // Stop as soon as a pass changes nothing: a $replacement that itself contains
        // a removable character (e.g. "\0") would otherwise match forever.
    } while ($count !== 0 && $str !== $previous);

    return $str;
}
||
| 709 | |||
| 710 | /** |
||
| 711 | * Returns an ASCII version of the string. A set of non-ASCII characters are |
||
| 712 | * replaced with their closest ASCII counterparts, and the rest are removed |
||
| 713 | * by default. The language or locale of the source string can be supplied |
||
| 714 | * for language-specific transliteration in any of the following formats: |
||
| 715 | * en, en_GB, or en-GB. For example, passing "de" results in "äöü" mapping |
||
| 716 | * to "aeoeue" rather than "aou" as in other languages. |
||
| 717 | * |
||
| 718 | * EXAMPLE: <code> |
||
| 719 | * ASCII::to_ascii('�Düsseldorf�', 'en'); // Dusseldorf |
||
| 720 | * </code> |
||
| 721 | * |
||
| 722 | * @param string $str <p>The input string.</p> |
||
| 723 | * @param string $language [optional] <p>Language of the source string. |
||
| 724 | * (default is 'en') | ASCII::*_LANGUAGE_CODE</p> |
||
| 725 | * @param bool $remove_unsupported_chars [optional] <p>Whether or not to remove the |
||
| 726 | * unsupported characters.</p> |
||
| 727 | * @param bool $replace_extra_symbols [optional] <p>Add some more replacements e.g. "£" with " pound |
||
| 728 | * ".</p> |
||
| 729 | * @param bool $use_transliterate [optional] <p>Use ASCII::to_transliterate() for unknown chars.</p> |
||
| 730 | * @param bool|null $replace_single_chars_only [optional] <p>Single char replacement is better for the |
||
| 731 | * performance, but some languages need to replace more then one char |
||
| 732 | * at the same time. | NULL === auto-setting, depended on the |
||
| 733 | * language</p> |
||
| 734 | * |
||
| 735 | * @psalm-pure |
||
| 736 | * |
||
| 737 | * @return string |
||
| 738 | * <p>A string that contains only ASCII characters.</p> |
||
| 739 | */ |
||
| 740 | public static function to_ascii( |
||
| 741 | string $str, |
||
| 742 | string $language = self::ENGLISH_LANGUAGE_CODE, |
||
| 743 | bool $remove_unsupported_chars = true, |
||
| 744 | bool $replace_extra_symbols = false, |
||
| 745 | bool $use_transliterate = false, |
||
| 746 | bool $replace_single_chars_only = null |
||
| 747 | ): string { |
||
| 748 | if ($str === '') { |
||
| 749 | return ''; |
||
| 750 | } |
||
| 751 | |||
| 752 | $language = self::get_language($language); |
||
| 753 | |||
| 754 | static $EXTRA_SYMBOLS_CACHE = null; |
||
| 755 | |||
| 756 | /** |
||
| 757 | * @var array<string,array<string,string>> |
||
| 758 | */ |
||
| 759 | static $REPLACE_HELPER_CACHE = []; |
||
| 760 | $cacheKey = $language . '-' . $replace_extra_symbols; |
||
| 761 | |||
| 762 | if (!isset($REPLACE_HELPER_CACHE[$cacheKey])) { |
||
| 763 | $langAll = self::charsArrayWithSingleLanguageValues($replace_extra_symbols, false); |
||
| 764 | |||
| 765 | $langSpecific = self::charsArrayWithOneLanguage($language, $replace_extra_symbols, false); |
||
| 766 | |||
| 767 | if ($langSpecific === []) { |
||
| 768 | $REPLACE_HELPER_CACHE[$cacheKey] = $langAll; |
||
| 769 | } else { |
||
| 770 | $REPLACE_HELPER_CACHE[$cacheKey] = \array_merge([], $langAll, $langSpecific); |
||
| 771 | } |
||
| 772 | } |
||
| 773 | |||
| 774 | if ( |
||
| 775 | $replace_extra_symbols |
||
| 776 | && |
||
| 777 | $EXTRA_SYMBOLS_CACHE === null |
||
| 778 | ) { |
||
| 779 | $EXTRA_SYMBOLS_CACHE = []; |
||
| 780 | foreach (self::$ASCII_EXTRAS ?? [] as $extrasLanguageTmp => $extrasDataTmp) { |
||
| 781 | foreach ($extrasDataTmp as $extrasDataKeyTmp => $extrasDataValueTmp) { |
||
| 782 | $EXTRA_SYMBOLS_CACHE[$extrasDataKeyTmp] = $extrasDataKeyTmp; |
||
| 783 | } |
||
| 784 | } |
||
| 785 | $EXTRA_SYMBOLS_CACHE = \implode('', $EXTRA_SYMBOLS_CACHE); |
||
| 786 | } |
||
| 787 | |||
| 788 | $charDone = []; |
||
| 789 | if (\preg_match_all('/' . self::$REGEX_ASCII . ($replace_extra_symbols ? '|[' . $EXTRA_SYMBOLS_CACHE . ']' : '') . '/u', $str, $matches)) { |
||
| 790 | if (!$replace_single_chars_only) { |
||
| 791 | if (self::$LANGUAGE_MAX_KEY === null) { |
||
| 792 | /** @noinspection PsalmLocalImmutableInspection */ |
||
| 793 | self::$LANGUAGE_MAX_KEY = self::getData('ascii_language_max_key'); |
||
| 794 | } |
||
| 795 | |||
| 796 | $maxKeyLength = self::$LANGUAGE_MAX_KEY[$language] ?? 0; |
||
| 797 | |||
| 798 | if ($maxKeyLength >= 5) { |
||
| 799 | foreach ($matches[0] as $keyTmp => $char) { |
||
| 800 | if (isset($matches[0][$keyTmp + 4])) { |
||
| 801 | $fiveChars = $matches[0][$keyTmp + 0] . $matches[0][$keyTmp + 1] . $matches[0][$keyTmp + 2] . $matches[0][$keyTmp + 3] . $matches[0][$keyTmp + 4]; |
||
| 802 | } else { |
||
| 803 | $fiveChars = null; |
||
| 804 | } |
||
| 805 | if ( |
||
| 806 | $fiveChars |
||
| 807 | && |
||
| 808 | !isset($charDone[$fiveChars]) |
||
| 809 | && |
||
| 810 | isset($REPLACE_HELPER_CACHE[$cacheKey][$fiveChars]) |
||
| 811 | && |
||
| 812 | \strpos($str, $fiveChars) !== false |
||
| 813 | ) { |
||
| 814 | // DEBUG |
||
| 815 | //\var_dump($str, $fiveChars, $REPLACE_HELPER_CACHE[$cacheKey][$fiveChars]); |
||
| 816 | |||
| 817 | $charDone[$fiveChars] = true; |
||
| 818 | $str = \str_replace($fiveChars, $REPLACE_HELPER_CACHE[$cacheKey][$fiveChars], $str); |
||
| 819 | |||
| 820 | // DEBUG |
||
| 821 | //\var_dump($str, "\n"); |
||
| 822 | } |
||
| 823 | } |
||
| 824 | } |
||
| 825 | |||
| 826 | if ($maxKeyLength >= 4) { |
||
| 827 | foreach ($matches[0] as $keyTmp => $char) { |
||
| 828 | if (isset($matches[0][$keyTmp + 3])) { |
||
| 829 | $fourChars = $matches[0][$keyTmp + 0] . $matches[0][$keyTmp + 1] . $matches[0][$keyTmp + 2] . $matches[0][$keyTmp + 3]; |
||
| 830 | } else { |
||
| 831 | $fourChars = null; |
||
| 832 | } |
||
| 833 | if ( |
||
| 834 | $fourChars |
||
| 835 | && |
||
| 836 | !isset($charDone[$fourChars]) |
||
| 837 | && |
||
| 838 | isset($REPLACE_HELPER_CACHE[$cacheKey][$fourChars]) |
||
| 839 | && |
||
| 840 | \strpos($str, $fourChars) !== false |
||
| 841 | ) { |
||
| 842 | // DEBUG |
||
| 843 | //\var_dump($str, $fourChars, $REPLACE_HELPER_CACHE[$cacheKey][$fourChars]); |
||
| 844 | |||
| 845 | $charDone[$fourChars] = true; |
||
| 846 | $str = \str_replace($fourChars, $REPLACE_HELPER_CACHE[$cacheKey][$fourChars], $str); |
||
| 847 | |||
| 848 | // DEBUG |
||
| 849 | //\var_dump($str, "\n"); |
||
| 850 | } |
||
| 851 | } |
||
| 852 | } |
||
| 853 | |||
| 854 | foreach ($matches[0] as $keyTmp => $char) { |
||
| 855 | if (isset($matches[0][$keyTmp + 2])) { |
||
| 856 | $threeChars = $matches[0][$keyTmp + 0] . $matches[0][$keyTmp + 1] . $matches[0][$keyTmp + 2]; |
||
| 857 | } else { |
||
| 858 | $threeChars = null; |
||
| 859 | } |
||
| 860 | if ( |
||
| 861 | $threeChars |
||
| 862 | && |
||
| 863 | !isset($charDone[$threeChars]) |
||
| 864 | && |
||
| 865 | isset($REPLACE_HELPER_CACHE[$cacheKey][$threeChars]) |
||
| 866 | && |
||
| 867 | \strpos($str, $threeChars) !== false |
||
| 868 | ) { |
||
| 869 | // DEBUG |
||
| 870 | //\var_dump($str, $threeChars, $REPLACE_HELPER_CACHE[$cacheKey][$threeChars]); |
||
| 871 | |||
| 872 | $charDone[$threeChars] = true; |
||
| 873 | $str = \str_replace($threeChars, $REPLACE_HELPER_CACHE[$cacheKey][$threeChars], $str); |
||
| 874 | |||
| 875 | // DEBUG |
||
| 876 | //\var_dump($str, "\n"); |
||
| 877 | } |
||
| 878 | } |
||
| 879 | |||
| 880 | foreach ($matches[0] as $keyTmp => $char) { |
||
| 881 | if (isset($matches[0][$keyTmp + 1])) { |
||
| 882 | $twoChars = $matches[0][$keyTmp + 0] . $matches[0][$keyTmp + 1]; |
||
| 883 | } else { |
||
| 884 | $twoChars = null; |
||
| 885 | } |
||
| 886 | if ( |
||
| 887 | $twoChars |
||
| 888 | && |
||
| 889 | !isset($charDone[$twoChars]) |
||
| 890 | && |
||
| 891 | isset($REPLACE_HELPER_CACHE[$cacheKey][$twoChars]) |
||
| 892 | && |
||
| 893 | \strpos($str, $twoChars) !== false |
||
| 894 | ) { |
||
| 895 | // DEBUG |
||
| 896 | //\var_dump($str, $twoChars, $REPLACE_HELPER_CACHE[$cacheKey][$twoChars]); |
||
| 897 | |||
| 898 | $charDone[$twoChars] = true; |
||
| 899 | $str = \str_replace($twoChars, $REPLACE_HELPER_CACHE[$cacheKey][$twoChars], $str); |
||
| 900 | |||
| 901 | // DEBUG |
||
| 902 | //\var_dump($str, "\n"); |
||
| 903 | } |
||
| 904 | } |
||
| 905 | } |
||
| 906 | |||
| 907 | foreach ($matches[0] as $keyTmp => $char) { |
||
| 908 | if ( |
||
| 909 | !isset($charDone[$char]) |
||
| 910 | && |
||
| 911 | isset($REPLACE_HELPER_CACHE[$cacheKey][$char]) |
||
| 912 | && |
||
| 913 | \strpos($str, $char) !== false |
||
| 914 | ) { |
||
| 915 | // DEBUG |
||
| 916 | //\var_dump($str, $char, $REPLACE_HELPER_CACHE[$cacheKey][$char]); |
||
| 917 | |||
| 918 | $charDone[$char] = true; |
||
| 919 | $str = \str_replace($char, $REPLACE_HELPER_CACHE[$cacheKey][$char], $str); |
||
| 920 | |||
| 921 | // DEBUG |
||
| 922 | //\var_dump($str, "\n"); |
||
| 923 | } |
||
| 924 | } |
||
| 925 | } |
||
| 926 | |||
| 927 | /** @psalm-suppress PossiblyNullOperand - we use the prepare* methods here, so we don't get NULL here */ |
||
| 928 | if (!isset(self::$ASCII_MAPS[$language])) { |
||
| 929 | $use_transliterate = true; |
||
| 930 | } |
||
| 931 | |||
| 932 | if ($use_transliterate) { |
||
| 933 | /** @noinspection ArgumentEqualsDefaultValueInspection */ |
||
| 934 | $str = self::to_transliterate($str, null, false); |
||
| 935 | } |
||
| 936 | |||
| 937 | if ($remove_unsupported_chars) { |
||
| 938 | $str = (string) \str_replace(["\n\r", "\n", "\r", "\t"], ' ', $str); |
||
| 939 | $str = (string) \preg_replace('/' . self::$REGEX_ASCII . '/', '', $str); |
||
| 940 | } |
||
| 941 | |||
| 942 | return $str; |
||
| 943 | } |
||
| 944 | |||
/**
 * Convert given string to safe filename (and keep string case).
 *
 * Processing steps:
 * 1) optionally transliterate the input to ASCII,
 * 2) remove every character that is not an ASCII letter, a digit, a dot,
 *    a hyphen, whitespace or one of the characters of $fallback_char,
 * 3) replace each run of whitespace with $fallback_char,
 * 4) collapse runs of $fallback_char characters into a single $fallback_char,
 * 5) trim $fallback_char characters from both ends.
 *
 * EXAMPLE: <code>
 * ASCII::to_filename('שדגשדג.png', true); // 'shdgshdg.png'
 * </code>
 *
 * @param string $str
 * @param bool   $use_transliterate <p>When true (default), the input is first passed through
 *                                  ASCII::to_transliterate() with $fallback_char as replacement for
 *                                  unknown characters. When false, unsupported characters are
 *                                  simply removed.</p>
 * @param string $fallback_char     <p>Replacement for whitespace (and for unknown characters while
 *                                  transliterating). An empty string is allowed: whitespace is then
 *                                  removed instead of replaced.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>A string that contains only safe characters for a filename.</p>
 */
public static function to_filename(
    string $str,
    bool $use_transliterate = true,
    string $fallback_char = '-'
): string {
    if ($use_transliterate) {
        $str = self::to_transliterate($str, $fallback_char);
    }

    $fallback_char_escaped = \preg_quote($fallback_char, '/');

    $patterns = [
        '/[^' . $fallback_char_escaped . '.\\-a-zA-Z0-9\\s]/', // 1) remove un-needed chars
        '/[\\s]+/u', // 2) convert spaces to $fallback_char
    ];
    $replacements = [
        '',
        $fallback_char,
    ];

    // An empty $fallback_char would produce the invalid pattern "/[]+/u"; preg_replace()
    // would then return null and the whole filename would be lost. With nothing to
    // collapse, the step is simply skipped.
    if ($fallback_char !== '') {
        $patterns[] = '/[' . $fallback_char_escaped . ']+/u'; // 3) remove double $fallback_char's
        $replacements[] = $fallback_char;
    }

    $str = (string) \preg_replace($patterns, $replacements, $str);

    return \trim($str, $fallback_char);
}
||
| 989 | |||
/**
 * Converts the string into an URL slug. This includes replacing non-ASCII
 * characters with their closest ASCII equivalents, removing remaining
 * non-ASCII and non-alphanumeric characters, and replacing whitespace with
 * $separator. The separator defaults to a single dash, and the string
 * is also converted to lowercase. The language of the source string can
 * also be supplied for language-specific transliteration.
 *
 * @param string                $str
 * @param string                $separator             [optional] <p>The string used to replace whitespace.</p>
 * @param string                $language              [optional] <p>Language of the source string.
 *                                                     (default is 'en') | ASCII::*_LANGUAGE_CODE</p>
 * @param array<string, string> $replacements          [optional] <p>A map of replaceable strings; applied
 *                                                     in order, before any other processing.</p>
 * @param bool                  $replace_extra_symbols [optional] <p>Add some more replacements e.g. "£" with "
 *                                                     pound ".</p>
 * @param bool                  $use_str_to_lower      [optional] <p>Use "string to lower" for the input.</p>
 * @param bool                  $use_transliterate     [optional] <p>Use ASCII::to_transliterate() for unknown
 *                                                     chars.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>A string that has been converted to an URL slug.</p>
 */
public static function to_slugify(
    string $str,
    string $separator = '-',
    string $language = self::ENGLISH_LANGUAGE_CODE,
    array $replacements = [],
    bool $replace_extra_symbols = false,
    bool $use_str_to_lower = true,
    bool $use_transliterate = false
): string {
    if ($str === '') {
        return '';
    }

    foreach ($replacements as $from => $to) {
        // PHP converts numeric-string array keys (e.g. '1') into integers; cast the key
        // back so str_replace() always receives a string search value.
        $str = \str_replace((string) $from, $to, $str);
    }

    $str = self::to_ascii(
        $str,
        $language,
        false,
        $replace_extra_symbols,
        $use_transliterate
    );

    // "@" acts as a word divider in the slug
    $str = \str_replace('@', $separator, $str);

    // keep only ASCII letters, digits, whitespace, "-", "_" and the separator characters
    $str = (string) \preg_replace(
        '/[^a-zA-Z\\d\\s\\-_' . \preg_quote($separator, '/') . ']/',
        '',
        $str
    );

    if ($use_str_to_lower) {
        $str = \strtolower($str);
    }

    // strip leading / trailing quotes and whitespace
    $str = (string) \preg_replace('/^[\'\\s]+|[\'\\s]+$/', '', $str);
    // put a dash in front of upper-case letters inside a word ("fooBar" -> "foo-Bar");
    // this can only match when the string was not lower-cased above
    $str = (string) \preg_replace('/\\B([A-Z])/', '-\1', $str);
    // collapse every run of dashes, underscores and whitespace into one separator
    $str = (string) \preg_replace('/[\\-_\\s]+/', $separator, $str);

    // remove one leading and one trailing separator, if present
    $l = \strlen($separator);
    if ($l && \strpos($str, $separator) === 0) {
        $str = (string) \substr($str, $l);
    }

    if (\substr($str, -$l) === $separator) {
        $str = (string) \substr($str, 0, \strlen($str) - $l);
    }

    return $str;
}
||
| 1065 | |||
/**
 * Returns an ASCII version of the string. Non-ASCII characters are replaced
 * with the ASCII counterpart found in the "/data/x*.php" lookup tables;
 * characters without a usable counterpart are replaced with $unknown, or
 * kept unchanged when $unknown is NULL.
 *
 * EXAMPLE: <code>
 * ASCII::to_transliterate('déjà σσς iıii'); // 'deja sss iiii'
 * </code>
 *
 * @param string      $str     <p>The input string.</p>
 * @param string|null $unknown [optional] <p>Character use if character unknown. (default is '?')
 *                             But you can also use NULL to keep the unknown chars.</p>
 * @param bool        $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl
 *                             first; this only has an effect when the "intl" extension is
 *                             loaded.</p>
 *
 * @psalm-pure
 *
 * @return string
 *                <p>The transliterated string.</p>
 *
 * @noinspection ParameterDefaultValueIsNotNullInspection
 */
public static function to_transliterate(
    string $str,
    $unknown = '?',
    bool $strict = false
): string {
    /**
     * Lookup tables, indexed by [code point >> 8][code point & 255].
     *
     * @var array<int,array<int,string>>|null
     */
    static $UTF8_TO_TRANSLIT = null;

    /**
     * @var \Transliterator|null
     */
    static $TRANSLITERATOR = null;

    /**
     * @var bool|null
     */
    static $SUPPORT_INTL = null;

    if ($str === '') {
        return '';
    }

    if ($SUPPORT_INTL === null) {
        $SUPPORT_INTL = \extension_loaded('intl');
    }

    // check if we only have ASCII, first (better performance)
    $str_tmp = $str;
    if (self::is_ascii($str)) {
        return $str;
    }

    $str = self::clean($str);

    // check again, if we only have ASCII, now ...
    if (
        $str_tmp !== $str
        &&
        self::is_ascii($str)
    ) {
        return $str;
    }

    if (
        $strict
        &&
        $SUPPORT_INTL === true
    ) {
        if ($TRANSLITERATOR === null) {
            // INFO: see "*-Latin" rules via "transliterator_list_ids()"
            $TRANSLITERATOR = \transliterator_create('NFKC; [:Nonspacing Mark:] Remove; NFKC; Any-Latin; Latin-ASCII;');
        }

        // transliterator_create() returns NULL on failure; in that case fall through
        // to the table based transliteration below instead of passing NULL on.
        if ($TRANSLITERATOR !== null) {
            // INFO: https://unicode.org/cldr/utility/character.jsp
            $str_tmp = \transliterator_transliterate($TRANSLITERATOR, $str);

            if ($str_tmp !== false) {
                // check again, if we only have ASCII, now ...
                if (
                    $str_tmp !== $str
                    &&
                    self::is_ascii($str_tmp)
                ) {
                    return $str_tmp;
                }

                /** @noinspection CallableParameterUseCaseInTypeContextInspection */
                $str = $str_tmp;
            }
        }
    }

    if (self::$ORD === null) {
        /** @noinspection PsalmLocalImmutableInspection */
        self::$ORD = self::getData('ascii_ord');
    }

    // split the string into single UTF-8 characters
    \preg_match_all('/.|[^\x00]$/us', $str, $array_tmp);
    // defensive: a failed match is treated as "no characters"
    $chars = $array_tmp[0] ?? [];

    $str_tmp = '';
    // iterate by value: the characters are only read and appended to the result
    foreach ($chars as $c) {
        $ordC0 = self::$ORD[$c[0]];

        // ASCII - next please
        if ($ordC0 >= 0 && $ordC0 <= 127) {
            $str_tmp .= $c;

            continue;
        }

        // reset per character, so the "$ord === null" check below can never see the
        // code point of a previous character
        $ord = null;

        $ordC1 = self::$ORD[$c[1]];

        // 2-byte sequence
        if ($ordC0 >= 192 && $ordC0 <= 223) {
            $ord = ($ordC0 - 192) * 64 + ($ordC1 - 128);
        }

        if ($ordC0 >= 224) {
            $ordC2 = self::$ORD[$c[2]];

            // 3-byte sequence
            if ($ordC0 <= 239) {
                $ord = ($ordC0 - 224) * 4096 + ($ordC1 - 128) * 64 + ($ordC2 - 128);
            }

            if ($ordC0 >= 240) {
                $ordC3 = self::$ORD[$c[3]];

                // 4-byte sequence
                if ($ordC0 <= 247) {
                    $ord = ($ordC0 - 240) * 262144 + ($ordC1 - 128) * 4096 + ($ordC2 - 128) * 64 + ($ordC3 - 128);
                }

                // We only process valid UTF-8 chars (<= 4 byte), so 5- and 6-byte
                // sequences (lead bytes 248 - 253) are intentionally not decoded.
            }
        }

        if (
            $ordC0 === 254
            ||
            $ordC0 === 255
            ||
            $ord === null
        ) {
            $str_tmp .= $unknown ?? $c;

            continue;
        }

        // the lookup tables are split into "banks" of 256 code points each and are
        // loaded lazily ("x000.php", "x001.php", ...)
        $bank = $ord >> 8;
        if (!isset($UTF8_TO_TRANSLIT[$bank])) {
            $UTF8_TO_TRANSLIT[$bank] = self::getDataIfExists(\sprintf('x%03x', $bank));
        }

        $new_char = $ord & 255;

        if (isset($UTF8_TO_TRANSLIT[$bank][$new_char])) {
            // (debug hint: dump $bank, $c, $ord and $new_char here to inspect a mapping)
            $new_char = $UTF8_TO_TRANSLIT[$bank][$new_char];

            if (
                $new_char === '[?]'
                ||
                $new_char === '[?] '
            ) {
                // the table marks this character as "unknown"
                $c = $unknown ?? $c;
            } elseif ($unknown !== null || $new_char !== '') {
                $c = $new_char;
            }
            // otherwise: empty mapping while unknown chars should be kept -> keep $c as it is
        } else {
            // (debug hint: dump $bank, $c, $ord and $new_char here to find missing chars)
            $c = $unknown ?? $c;
        }

        $str_tmp .= $c;
    }

    return $str_tmp;
}
||
| 1291 | |||
/**
 * Get the language from a string.
 *
 * The result is always lower-case and uses "_" as divider. A code whose two
 * parts are equal (ignoring case) is reduced to the first part.
 *
 * e.g.: de_at -> de_at
 *       de_DE -> de
 *       DE_DE -> de
 *       de-de -> de
 *       de-AT -> de_at
 *
 * @noinspection ReturnTypeCanBeDeclaredInspection
 *
 * @param string $language
 *
 * @psalm-pure
 *
 * @return string
 */
private static function get_language(string $language)
{
    if ($language === '') {
        return '';
    }

    // no "_" and no "-": there is nothing to split, only the case is normalized
    if (\strpbrk($language, '_-') === false) {
        return \strtolower($language);
    }

    // "xx_XX" / "xx-xx" -> "xx" (the back-reference is matched case-insensitively)
    $collapsed = (string) \preg_replace('/(?<first>[a-z]+)[\-_]\g{first}/i', '$1', $language);
    $lowered = \strtolower($collapsed);

    return \str_replace('-', '_', $lowered);
}
||
| 1332 | |||
/**
 * Include "/data/<file>.php" (relative to the directory of this class file)
 * and hand back whatever that file returns.
 *
 * No existence check is done here; see getDataIfExists() for optional files.
 *
 * @noinspection ReturnTypeCanBeDeclaredInspection
 *
 * @param string $file <p>File name without directory and without ".php" extension.</p>
 *
 * @psalm-pure
 *
 * @return array<mixed>
 */
private static function getData(string $file)
{
    $path = __DIR__ . '/data/' . $file . '.php';

    /** @noinspection PhpIncludeInspection */
    /** @noinspection UsingInclusionReturnValueInspection */
    /** @psalm-suppress UnresolvableInclude */
    return include $path;
}
||
| 1351 | |||
/**
 * Include "/data/<file>.php" (relative to the directory of this class file)
 * when that file exists; otherwise an empty array is returned.
 *
 * @param string $file <p>File name without directory and without ".php" extension.</p>
 *
 * @psalm-pure
 *
 * @return array<mixed>
 */
private static function getDataIfExists(string $file): array
{
    $path = __DIR__ . '/data/' . $file . '.php';

    /**
     * @noinspection LowPerformingFilesystemOperationsInspection
     * -> we use this only once, so no extra caching is needed
     */
    if (!\file_exists($path)) {
        return [];
    }

    /** @noinspection PhpIncludeInspection */
    /** @noinspection UsingInclusionReturnValueInspection */
    return include $path;
}
||
| 1376 | |||
/**
 * Fill self::$ASCII_MAPS_AND_EXTRAS once: the recursive merge of
 * self::$ASCII_MAPS and self::$ASCII_EXTRAS. Later calls are no-ops.
 *
 * @psalm-pure
 *
 * @return void
 */
private static function prepareAsciiAndExtrasMaps()
{
    // already merged by an earlier call
    if (self::$ASCII_MAPS_AND_EXTRAS !== null) {
        return;
    }

    self::prepareAsciiMaps();
    self::prepareAsciiExtras();

    $maps = self::$ASCII_MAPS ?? [];
    $extras = self::$ASCII_EXTRAS ?? [];

    /** @psalm-suppress PossiblyNullArgument - we use the prepare* methods here, so we don't get NULL here */
    /** @noinspection PsalmLocalImmutableInspection */
    self::$ASCII_MAPS_AND_EXTRAS = \array_merge_recursive($maps, $extras);
}
||
| 1396 | |||
/**
 * Load the "ascii_by_languages" data file into self::$ASCII_MAPS, unless
 * that property has already been filled.
 *
 * @psalm-pure
 *
 * @return void
 */
private static function prepareAsciiMaps()
{
    // already loaded by an earlier call
    if (self::$ASCII_MAPS !== null) {
        return;
    }

    /** @noinspection PsalmLocalImmutableInspection */
    self::$ASCII_MAPS = self::getData('ascii_by_languages');
}
||
| 1409 | |||
| 1410 | /** |
||
| 1411 | * @psalm-pure |
||
| 1412 | * |
||
| 1413 | * @return void |
||
| 1414 | */ |
||
| 1415 | private static function prepareAsciiExtras() |
||
| 1416 | { |
||
| 1417 | if (self::$ASCII_EXTRAS === null) { |
||
| 1418 | /** @noinspection PsalmLocalImmutableInspection */ |
||
| 1419 | self::$ASCII_EXTRAS = self::getData('ascii_extras_by_languages'); |
||
| 1420 | } |
||
| 1423 |