albertlast /
SMF2.1
| 1 | <?php |
||||
| 2 | |||||
| 3 | /** |
||||
| 4 | * Simple Machines Forum (SMF) |
||||
| 5 | * |
||||
| 6 | * @package SMF |
||||
| 7 | * @author Simple Machines https://www.simplemachines.org |
||||
| 8 | * @copyright 2021 Simple Machines and individual contributors |
||||
| 9 | * @license https://www.simplemachines.org/about/smf/license.php BSD |
||||
| 10 | * |
||||
| 11 | * @version 2.1 RC4 |
||||
| 12 | */ |
||||
| 13 | |||||
// Abort unless the SMF constant is defined, so this file cannot be run on its own.
if (defined('SMF') === false)
{
	die('No direct access...');
}
||||
| 16 | |||||
/**
 * Lowercases a UTF-8 string, one character at a time.
 *
 * Works like mb_strtolower($string, 'UTF-8'), but the mappings come from
 * utf8_strtolower_maps() in Unicode/CaseLower.php, so the output stays
 * consistent across PHP versions and up to date with the latest version of
 * Unicode.
 *
 * @param string $string The string
 * @return string|false The lowercase version of $string, or false if the
 *    string could not be split into UTF-8 characters.
 */
function utf8_strtolower($string)
{
	global $sourcedir;

	// One array entry per UTF-8 character; preg_split() gives false on failure.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseLower.php');

	$lowercase_map = utf8_strtolower_maps();
	$result = '';

	// Characters without an entry in the map are copied over unchanged.
	foreach ($characters as $character)
		$result .= isset($lowercase_map[$character]) ? $lowercase_map[$character] : $character;

	return $result;
}
||||
| 46 | |||||
/**
 * Uppercases a UTF-8 string, one character at a time.
 *
 * Works like mb_strtoupper($string, 'UTF-8'), but the mappings come from
 * utf8_strtoupper_maps() in Unicode/CaseUpper.php, so the output stays
 * consistent across PHP versions and up to date with the latest version of
 * Unicode.
 *
 * @param string $string The string
 * @return string|false The uppercase version of $string, or false if the
 *    string could not be split into UTF-8 characters.
 */
function utf8_strtoupper($string)
{
	global $sourcedir;

	// One array entry per UTF-8 character; preg_split() gives false on failure.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseUpper.php');

	$uppercase_map = utf8_strtoupper_maps();
	$result = '';

	// Characters without an entry in the map are copied over unchanged.
	foreach ($characters as $character)
		$result .= isset($uppercase_map[$character]) ? $uppercase_map[$character] : $character;

	return $result;
}
||||
| 76 | |||||
/**
 * Casefolds a UTF-8 string, one character at a time.
 *
 * Works like mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), but the mappings
 * come from utf8_casefold_maps() in Unicode/CaseFold.php, so the output stays
 * consistent across PHP versions and up to date with the latest version of
 * Unicode.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if the
 *    string could not be split into UTF-8 characters.
 */
function utf8_casefold($string)
{
	global $sourcedir;

	// One array entry per UTF-8 character; preg_split() gives false on failure.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseFold.php');

	$casefold_map = utf8_casefold_maps();
	$result = '';

	// Characters without an entry in the map are copied over unchanged.
	foreach ($characters as $character)
		$result .= isset($casefold_map[$character]) ? $casefold_map[$character] : $character;

	return $result;
}
||||
| 106 | |||||
/**
 * Normalizes UTF-8 via Canonical Decomposition (Normalization Form D).
 *
 * Uses normalizer_is_normalized() and normalizer_normalize() when they are
 * callable, and otherwise falls back to utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The decomposed version of $string. The fallback path
 *    returns false if the string could not be split into UTF-8 characters.
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	// Nothing to do when the string is already reported as being in form D.
	if (is_callable('normalizer_is_normalized'))
	{
		if (normalizer_is_normalized($string, Normalizer::FORM_D))
			return $string;
	}

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_D);

	// No native normalizer: split into characters and decompose them ourselves.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, false));
}
||||
| 130 | |||||
/**
 * Normalizes UTF-8 via Compatibility Decomposition (Normalization Form KD).
 *
 * Uses normalizer_is_normalized() and normalizer_normalize() when they are
 * callable, and otherwise falls back to utf8_decompose() in compatibility mode.
 *
 * @param string $string A UTF-8 string.
 * @return string|false The decomposed version of $string. The fallback path
 *    returns false if the string could not be split into UTF-8 characters.
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	// Nothing to do when the string is already reported as being in form KD.
	if (is_callable('normalizer_is_normalized'))
	{
		if (normalizer_is_normalized($string, Normalizer::FORM_KD))
			return $string;
	}

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KD);

	// No native normalizer: split into characters and decompose them ourselves.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, true));
}
||||
| 154 | |||||
/**
 * Normalizes UTF-8 via Canonical Decomposition then Canonical Composition
 * (Normalization Form C).
 *
 * Uses normalizer_is_normalized() and normalizer_normalize() when they are
 * callable, and otherwise falls back to utf8_decompose() followed by
 * utf8_compose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The composed version of $string. The fallback path
 *    returns false if the string could not be split into UTF-8 characters.
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	// Nothing to do when the string is already reported as being in form C.
	if (is_callable('normalizer_is_normalized'))
	{
		if (normalizer_is_normalized($string, Normalizer::FORM_C))
			return $string;
	}

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_C);

	// No native normalizer: split, decompose canonically, then recompose.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, false);

	return implode('', utf8_compose($decomposed));
}
||||
| 178 | |||||
/**
 * Normalizes UTF-8 via Compatibility Decomposition then Canonical Composition
 * (Normalization Form KC).
 *
 * Uses normalizer_is_normalized() and normalizer_normalize() when they are
 * callable, and otherwise falls back to utf8_decompose() in compatibility mode
 * followed by utf8_compose().
 *
 * @param string $string The string
 * @return string|false The composed version of $string. The fallback path
 *    returns false if the string could not be split into UTF-8 characters.
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	// Nothing to do when the string is already reported as being in form KC.
	if (is_callable('normalizer_is_normalized'))
	{
		if (normalizer_is_normalized($string, Normalizer::FORM_KC))
			return $string;
	}

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KC);

	// No native normalizer: split, decompose for compatibility, then recompose.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, true);

	return implode('', utf8_compose($decomposed));
}
||||
| 202 | |||||
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is split into characters and compatibility-decomposed. Each
 * character is then casefolded via utf8_casefold_maps(), or blanked out if it
 * is listed by utf8_default_ignorables(). Finally the result is recomposed.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if the
 *    string could not be split into UTF-8 characters.
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$characters = utf8_decompose($characters, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$casefold_map = utf8_casefold_maps();

	// Flip the list so membership can be tested with isset().
	$ignorable_lookup = array_flip(utf8_default_ignorables());

	foreach ($characters as $position => $character)
	{
		// A casefold mapping takes priority over the ignorable check.
		if (isset($casefold_map[$character]))
			$characters[$position] = $casefold_map[$character];

		elseif (isset($ignorable_lookup[$character]))
			$characters[$position] = '';
	}

	return implode('', utf8_compose($characters));
}
||||
| 240 | |||||
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form and then sorts runs of
 * combining characters by ascending combining class. Hangul syllables are
 * decomposed arithmetically rather than through the mapping tables.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, the mappings from
 *    utf8_normalize_kd_maps() are applied before the canonical ones.
 *    Default: false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so that $char no longer aliases the last element.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	// The element count does not change in this loop, so compute it once.
	$num_chars = count($chars);

	for ($i = 0; $i < $num_chars; $i++)
	{
		// Hangul characters.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// Truncate explicitly: "/" yields a float, and handing a fractional
			// float to mb_chr() is a lossy implicit conversion.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			// The trailing consonant is only present when $t is non-zero.
			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	$num_chars = count($chars);

	for ($i = 1; $i < $num_chars; $i++)
	{
		// Only two adjacent characters with non-zero combining classes may be swapped.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
||||
| 312 | |||||
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Walks the array once, composing Hangul jamo sequences arithmetically and
 * merging combining characters into a preceding character via the
 * utf8_compose_maps() table. Array positions whose character was merged into
 * an earlier one are set to null instead of being removed, so the returned
 * array keeps its original length and is intended to be passed to implode().
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	// Elements are overwritten but never added or removed, so count them once.
	$num_chars = count($chars);

	for ($c = 0; $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// The isset() check stops us reading past the end of the array when a
		// leading consonant is the last character.
		if ($chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && isset($chars[$c + 1]) && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// An optional trailing consonant may follow the vowel.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Store the syllable in the first slot and blank out the slots it consumed.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			// Skips blanked-out slots and combining characters of a lower class.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
||||
| 391 | |||||
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * The work happens in stages:
 *  1. Everything matched by the level-dependent "disallowed" patterns is
 *     replaced with $substitute.
 *  2. If no join controls, regional indicators, emoji, or variation selectors
 *     remain, the string is returned as is.
 *  3. Otherwise the string is normalized to form C, then recognized emoji
 *     sequences, sanctioned variation selectors, and permitted join controls
 *     are swapped for placeholders. Whatever variation selectors and join
 *     controls remain are replaced with $substitute, and finally the
 *     placeholders are swapped back.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 *      Values outside this range are clamped to 0 or 2.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	// Clamp the level into the supported 0-2 range.
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Each entry appended to $disallowed is one regex alternative; they are
	// joined with "|" into a single pattern further down.

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		'\x{17B4}-\x{17B5}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Joiner.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// Deliberate fall-through: level 2 also applies every level 1 restriction.
			// no break

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Level 0: these characters are only rejected when they appear
			// outside the context described for each pattern.

			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be followed
			// by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Joiner has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	// Apply every disallowed pattern in a single pass.
	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	// The remaining checks only matter if one of these character types is still present.
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps each protected original sequence to the placeholder standing in for it.
	$placeholders = array();

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?' .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			'(' .
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
				'|' .
				'\x{FE0F}\x{20E3}?' .
				'|' .
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
			')?' .
		')*' .
		'/u',
		function ($matches) use (&$placeholders)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			// The placeholder is the MD5 hash of the match wrapped in two marker characters.
			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
			return $placeholders[$matches[0]];
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		// In each pattern, \K drops the base character from the match, so
		// only the variation selector itself is replaced by a placeholder.
		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = preg_replace_callback(
			$patterns,
			function ($matches) use (&$placeholders)
			{
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
				return $placeholders[$matches[0]];
			},
			$string
		);

		// Remove any unsanctioned variation selectors.
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		// Fixed placeholders for join controls that turn out to be permitted.
		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				// Dual joining characters count on both sides.
				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script.
				$pattern = $letter . $nonspacing_marks . '[' . $classes['viramas'] . ']' . $nonspacing_combining_marks . '\K' . (!empty($zwj_pattern) ? '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')' : $zwnj_pattern);
			}

			// Do the thing.
			// Inside each match, swap the join controls for their placeholders.
			$string = preg_replace_callback(
				'/' . $pattern . '/u',
				function ($matches) use ($placeholders)
				{
					return strtr($matches[0], $placeholders);
				},
				$string
			);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));


	return $string;
}
||||
| 708 | |||||
| 709 | ?> |