1 | <?php |
||||
2 | |||||
3 | /** |
||||
4 | * Simple Machines Forum (SMF) |
||||
5 | * |
||||
6 | * @package SMF |
||||
7 | * @author Simple Machines https://www.simplemachines.org |
||||
8 | * @copyright 2022 Simple Machines and individual contributors |
||||
9 | * @license https://www.simplemachines.org/about/smf/license.php BSD |
||||
10 | * |
||||
11 | * @version 2.1.0 |
||||
12 | */ |
||||
13 | |||||
// Refuse to run when this file is requested directly instead of being loaded by SMF.
if (!defined('SMF'))
{
	die('No direct access...');
}
||||
16 | |||||
/**
 * Lowercases a UTF-8 string using SMF's bundled Unicode case data.
 *
 * Works like mb_strtolower($string, 'UTF-8'), but the mapping is taken from
 * Unicode/CaseLower.php so that the output stays consistent across PHP
 * versions and follows the latest version of Unicode.
 *
 * @param string $string The string to convert (cast to string before use)
 * @return string|false The lowercase version of $string, or false if the
 *    string could not be split into characters
 */
function utf8_strtolower($string)
{
	global $sourcedir;

	// Break the input into individual UTF-8 characters.
	$pieces = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($pieces === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseLower.php');

	$map = utf8_strtolower_maps();
	$lowered = '';

	// Characters without a mapping are copied through unchanged.
	foreach ($pieces as $piece)
	{
		if (isset($map[$piece]))
			$lowered .= $map[$piece];
		else
			$lowered .= $piece;
	}

	return $lowered;
}
||||
46 | |||||
/**
 * Uppercases a UTF-8 string using SMF's bundled Unicode case data.
 *
 * Works like mb_strtoupper($string, 'UTF-8'), but the mapping is taken from
 * Unicode/CaseUpper.php so that the output stays consistent across PHP
 * versions and follows the latest version of Unicode.
 *
 * @param string $string The string to convert (cast to string before use)
 * @return string|false The uppercase version of $string, or false if the
 *    string could not be split into characters
 */
function utf8_strtoupper($string)
{
	global $sourcedir;

	// Break the input into individual UTF-8 characters.
	$pieces = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($pieces === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseUpper.php');

	$map = utf8_strtoupper_maps();
	$raised = '';

	// Characters without a mapping are copied through unchanged.
	foreach ($pieces as $piece)
	{
		if (isset($map[$piece]))
			$raised .= $map[$piece];
		else
			$raised .= $piece;
	}

	return $raised;
}
||||
76 | |||||
/**
 * Casefolds a UTF-8 string using SMF's bundled Unicode case folding data.
 *
 * Works like mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), but the mapping
 * is taken from Unicode/CaseFold.php so that the output stays consistent
 * across PHP versions and follows the latest version of Unicode.
 *
 * @param string $string The string to convert (cast to string before use)
 * @return string|false The casefolded version of $string, or false if the
 *    string could not be split into characters
 */
function utf8_casefold($string)
{
	global $sourcedir;

	// Break the input into individual UTF-8 characters.
	$pieces = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($pieces === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseFold.php');

	$map = utf8_casefold_maps();
	$folded = '';

	// Characters without a mapping are copied through unchanged.
	foreach ($pieces as $piece)
	{
		if (isset($map[$piece]))
			$folded .= $map[$piece];
		else
			$folded .= $piece;
	}

	return $folded;
}
||||
106 | |||||
/**
 * Converts a UTF-8 string to Normalization Form D (canonical decomposition).
 *
 * Uses the intl extension's normalizer functions when they are callable, and
 * otherwise falls back to utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The decomposed version of $string, or false if
 *    normalization failed or the string could not be split into characters
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_D))
	{
		// Already in the requested form, so there is nothing to do.
		$result = $string;
	}
	elseif (is_callable('normalizer_normalize'))
	{
		$result = normalizer_normalize($string, Normalizer::FORM_D);
	}
	else
	{
		// No intl support: split into characters and decompose them ourselves.
		$units = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		$result = $units === false ? false : implode('', utf8_decompose($units, false));
	}

	return $result;
}
||||
130 | |||||
/**
 * Converts a UTF-8 string to Normalization Form KD (compatibility
 * decomposition).
 *
 * Uses the intl extension's normalizer functions when they are callable, and
 * otherwise falls back to utf8_decompose() in compatibility mode.
 *
 * @param string $string A UTF-8 string.
 * @return string|false The decomposed version of $string, or false if
 *    normalization failed or the string could not be split into characters.
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KD))
	{
		// Already in the requested form, so there is nothing to do.
		$result = $string;
	}
	elseif (is_callable('normalizer_normalize'))
	{
		$result = normalizer_normalize($string, Normalizer::FORM_KD);
	}
	else
	{
		// No intl support: split into characters and decompose them ourselves.
		$units = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		$result = $units === false ? false : implode('', utf8_decompose($units, true));
	}

	return $result;
}
||||
154 | |||||
/**
 * Converts a UTF-8 string to Normalization Form C (canonical decomposition
 * followed by canonical composition).
 *
 * Uses the intl extension's normalizer functions when they are callable, and
 * otherwise falls back to utf8_decompose() followed by utf8_compose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The composed version of $string, or false if
 *    normalization failed or the string could not be split into characters
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_C))
	{
		// Already in the requested form, so there is nothing to do.
		$result = $string;
	}
	elseif (is_callable('normalizer_normalize'))
	{
		$result = normalizer_normalize($string, Normalizer::FORM_C);
	}
	else
	{
		// No intl support: split into characters, then decompose and recompose.
		$units = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		$result = $units === false ? false : implode('', utf8_compose(utf8_decompose($units, false)));
	}

	return $result;
}
||||
178 | |||||
/**
 * Converts a UTF-8 string to Normalization Form KC (compatibility
 * decomposition followed by canonical composition).
 *
 * Uses the intl extension's normalizer functions when they are callable, and
 * otherwise falls back to utf8_decompose() in compatibility mode followed by
 * utf8_compose().
 *
 * @param string $string The string
 * @return string|false The composed version of $string, or false if
 *    normalization failed or the string could not be split into characters
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KC))
	{
		// Already in the requested form, so there is nothing to do.
		$result = $string;
	}
	elseif (is_callable('normalizer_normalize'))
	{
		$result = normalizer_normalize($string, Normalizer::FORM_KC);
	}
	else
	{
		// No intl support: split into characters, then decompose and recompose.
		$units = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		$result = $units === false ? false : implode('', utf8_compose(utf8_decompose($units, true)));
	}

	return $result;
}
||||
202 | |||||
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each resulting character is then
 * either casefolded or (if it is a default ignorable) dropped, and finally
 * the characters are recomposed.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if the
 *    string could not be split into characters
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$units = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($units === false)
		return false;

	$units = utf8_decompose($units, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$fold_map = utf8_casefold_maps();

	// Flip so that membership can be tested with isset().
	$ignorable = array_flip(utf8_default_ignorables());

	foreach ($units as $idx => $unit)
	{
		// A casefold mapping takes priority over the ignorable check.
		if (isset($fold_map[$unit]))
			$units[$idx] = $fold_map[$unit];

		elseif (isset($ignorable[$unit]))
			$units[$idx] = '';
	}

	return implode('', utf8_compose($units));
}
||||
240 | |||||
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form, then sorts combining
 * marks into canonical order.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, characters are first replaced using the
 *    compatibility decomposition map before canonical decomposition is applied
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so later writes to $chars cannot go through it.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	// Elements are replaced in place, so the count is fixed for this loop.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters (U+AC00 through U+D7A3) are decomposed arithmetically.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// Use explicit integer division. Passing a fractional float to
			// mb_chr() triggers an implicit-conversion deprecation on PHP 8.1+.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			// The trailing consonant is only present when $t is non-zero.
			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	// Only adjacent elements are swapped, so the count is fixed for this loop.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Characters with no (or zero) combining class are never reordered.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
||||
312 | |||||
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Characters that get merged into a preceding character are replaced with
 * null rather than removed, so the returned array has the same length as the
 * input. Callers are expected to implode() the result.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	// Consumed entries are nulled, not removed, so the count is fixed for this loop.
	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// The isset() check prevents reading past the end of the array when a
		// leading consonant is the final element.
		if ($chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && isset($chars[$c + 1]) && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// An optional trailing consonant is folded into the same syllable.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Store the syllable, then blank out and skip over the consumed parts.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
||||
391 | |||||
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 *    Values outside the range 0-2 are clamped into it.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Regex alternatives describing everything to be replaced with $substitute.
	$disallowed = array();

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		'\x{17B4}-\x{17B5}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Character.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// no break

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be
			// followed by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Character has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	// The remaining checks only matter if join controls, regional indicators,
	// emoji, or variation selectors are still present.
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps original character sequences to the temporary tokens that protect
	// them from the removals below. Reverted at the end of the function.
	$placeholders = array();

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?' .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			'(' .
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
				'|' .
				'\x{FE0F}\x{20E3}?' .
				'|' .
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
			')?' .
		')*' .
		'/u',
		function ($matches) use (&$placeholders)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
			return $placeholders[$matches[0]];
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = preg_replace_callback(
			$patterns,
			function ($matches) use (&$placeholders)
			{
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
				return $placeholders[$matches[0]];
			},
			$string
		);

		// Remove any unsanctioned variation selectors.
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script.
				// $zwj_pattern is always a non-empty string, so both alternatives are always offered.
				$pattern = $letter . $nonspacing_marks . '[' . $classes['viramas'] . ']' . $nonspacing_combining_marks . '\K' . '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')';
			}

			// Do the thing.
			// Join controls inside an allowed context are swapped for placeholders.
			$string = preg_replace_callback(
				'/' . $pattern . '/u',
				function ($matches) use ($placeholders)
				{
					return strtr($matches[0], $placeholders);
				},
				$string
			);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
||||
708 | |||||
709 | ?> |