Passed
Pull Request — release-2.1 (#7124)
by Jon
04:15
created

utf8_normalize_kc_casefold()   A

Complexity

Conditions 5
Paths 5

Size

Total Lines 27
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 15
c 0
b 0
f 0
nc 5
nop 1
dl 0
loc 27
rs 9.4555
1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2021 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1 RC4
12
 */
13
14
// Refuse to run when this file is requested directly instead of being loaded by SMF.
if (defined('SMF') === false)
	die('No direct access...');
16
17
/**
 * Converts the given UTF-8 string into lowercase.
 *
 * Equivalent to mb_strtolower($string, 'UTF-8'), except that we can keep the
 * output consistent across PHP versions and up to date with the latest version
 * of Unicode.
 *
 * @param string $string The string
 * @return string|bool The lowercase version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_strtolower($string)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/CaseLower.php');

	$substitutions = utf8_strtolower_maps();

	// Split into individual characters; the 'u' modifier makes '.' match whole
	// UTF-8 characters and the 's' modifier lets it match newlines as well.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// preg_split() returns false on failure. Bail out before trying to iterate,
	// so that a failed split is never fed to foreach.
	if ($chars === false)
		return false;

	foreach ($chars as &$char)
		$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

	// Release the reference left behind by the loop above.
	unset($char);

	return implode('', $chars);
}
44
45
/**
 * Converts the given UTF-8 string into uppercase.
 *
 * Behaves like mb_strtoupper($string, 'UTF-8'), but the mapping data ships
 * with the software, so the output stays consistent across PHP versions and
 * can follow the latest version of Unicode.
 *
 * @param string $string The string
 * @return string The uppercase version of $string (or false if the string
 *    could not be split into characters).
 */
function utf8_strtoupper($string)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/CaseUpper.php');

	$upper_map = utf8_strtoupper_maps();

	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Splitting failed, so there is nothing we can convert.
	if ($characters === false)
		return false;

	// Build the result one character at a time, swapping in the mapped form
	// wherever the map has an entry.
	$result = '';

	foreach ($characters as $character)
		$result .= isset($upper_map[$character]) ? $upper_map[$character] : $character;

	return $result;
}
72
73
/**
 * Casefolds the given UTF-8 string.
 *
 * Behaves like mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), but the
 * mapping data ships with the software, so the output stays consistent across
 * PHP versions and can follow the latest version of Unicode.
 *
 * @param string $string The string
 * @return string The casefolded version of $string (or false if the string
 *    could not be split into characters).
 */
function utf8_casefold($string)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/CaseFold.php');

	$fold_map = utf8_casefold_maps();

	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Splitting failed, so there is nothing we can fold.
	if ($characters === false)
		return false;

	// Swap each character for its casefolded form, if it has one.
	$folded = array_map(
		function ($character) use ($fold_map)
		{
			return isset($fold_map[$character]) ? $fold_map[$character] : $character;
		},
		$characters
	);

	return implode('', $folded);
}
100
101
/**
 * Normalizes UTF-8 via Canonical Decomposition.
 *
 * PHP's own normalizer_* functions are used whenever they are callable.
 * Otherwise the string is split into characters and handed to
 * utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string The decomposed version of $string (or false if the fallback
 *    could not split the string into characters).
 */
function utf8_normalize_d($string)
{
	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then hand it straight back.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_D))
		return $string;

	// Let PHP do the work if it knows how.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_D);

	// Otherwise, fall back to our own implementation.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, false);

	return implode('', $decomposed);
}
122
123
/**
 * Normalizes UTF-8 via Compatibility Decomposition.
 *
 * PHP's own normalizer_* functions are used whenever they are callable.
 * Otherwise the string is split into characters and handed to
 * utf8_decompose() with compatibility mappings enabled.
 *
 * @param string $string A UTF-8 string.
 * @return string The decomposed version of $string (or false if the fallback
 *    could not split the string into characters).
 */
function utf8_normalize_kd($string)
{
	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then hand it straight back.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_KD))
		return $string;

	// Let PHP do the work if it knows how.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KD);

	// Otherwise, fall back to our own implementation.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, true);

	return implode('', $decomposed);
}
144
145
/**
 * Normalizes UTF-8 via Canonical Decomposition then Canonical Composition.
 *
 * PHP's own normalizer_* functions are used whenever they are callable.
 * Otherwise the string is split into characters, decomposed with
 * utf8_decompose(), and recomposed with utf8_compose().
 *
 * @param string $string A UTF-8 string
 * @return string The composed version of $string (or false if the fallback
 *    could not split the string into characters).
 */
function utf8_normalize_c($string)
{
	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then hand it straight back.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_C))
		return $string;

	// Let PHP do the work if it knows how.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_C);

	// Otherwise, fall back to our own implementation.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, false);
	$composed = utf8_compose($decomposed);

	return implode('', $composed);
}
166
167
/**
 * Normalizes UTF-8 via Compatibility Decomposition then Canonical Composition.
 *
 * PHP's own normalizer_* functions are used whenever they are callable.
 * Otherwise the string is split into characters, decomposed with
 * utf8_decompose() (compatibility mappings enabled), and recomposed with
 * utf8_compose().
 *
 * @param string $string The string
 * @return string The composed version of $string (or false if the fallback
 *    could not split the string into characters).
 */
function utf8_normalize_kc($string)
{
	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then hand it straight back.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_KC))
		return $string;

	// Let PHP do the work if it knows how.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KC);

	// Otherwise, fall back to our own implementation.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, true);
	$composed = utf8_compose($decomposed);

	return implode('', $composed);
}
188
189
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each character is then either
 * casefolded or (if it is a default ignorable) removed, and finally the
 * result is recomposed.
 *
 * @param string $string The string
 * @return string The casefolded version of $string (or false if the string
 *    could not be split into characters).
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$characters = utf8_decompose($characters, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$fold_map = utf8_casefold_maps();

	// Flip the list so that membership can be tested with isset().
	$ignorable = array_flip(utf8_default_ignorables());

	foreach ($characters as $index => $character)
	{
		// A casefold mapping takes priority over the ignorable check.
		if (isset($fold_map[$character]))
		{
			$characters[$index] = $fold_map[$character];
			continue;
		}

		if (isset($ignorable[$character]))
			$characters[$index] = '';
	}

	return implode('', utf8_compose($characters));
}
224
225
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form (Hangul syllables are
 * decomposed arithmetically, everything else via lookup tables), then sorts
 * adjacent combining characters into canonical order.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, the compatibility decomposition
 *    mappings are applied before the canonical ones. Default: false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Release the reference left behind by the loop above.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters (U+AC00 through U+D7A3).
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// These must be whole numbers. The '/' operator produces a float
			// whenever the division is inexact, so truncate explicitly rather
			// than relying on mb_chr() to convert the float implicitly.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			// The trailing consonant is only present when $t is non-zero.
			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Only pairs in which both characters have a non-zero combining class
		// are candidates for reordering.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			// (Combined with the loop's $i++, this revisits the previous pair.)
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
296
297
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Characters that get merged into a preceding character are replaced with
 * null rather than removed, so the returned array keeps its original length.
 * Callers are expected to implode() the result.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// A leading consonant (U+1100 - U+1112) followed by a vowel (U+1161 - U+1175).
		// The isset() check keeps us from reading past the end of the array.
		if ($chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && isset($chars[$c + 1]) && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// Optional trailing consonant (U+11A8 - U+11C2), if there is one.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// The composed syllable takes the place of the leading consonant,
			// and the parts that were consumed are blanked out.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			// Skip over blanked out entries and over combining characters
			// whose class is lower than that of the current character.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
375
376
/**
377
 * Helper function for sanitize_chars() that deals with invisible characters.
378
 *
379
 * This function deals with control characters, private use characters,
380
 * non-characters, and characters that are invisible by definition in the
381
 * Unicode standard. It does not deal with characters that are supposed to be
382
 * visible according to the Unicode standard, and makes no attempt to compensate
383
 * for possibly incomplete Unicode support in text rendering engines on client
384
 * devices.
385
 *
386
 * @param string $string The string to sanitize.
387
 * @param int $level Controls how invisible formatting characters are handled.
388
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
389
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
390
 *      2: Disallow all formatting characters. Use for internal comparisons
391
 *         only, such as in the word censor, search contexts, etc.
392
 * @param string $substitute Replacement string for the invalid characters.
393
 * @return string The sanitized string.
394
 */
395
function utf8_sanitize_invisibles($string, $level, $substitute)
396
{
397
	$string = (string) $string;
398
	$level = min(max((int) $level, 0), 2);
399
	$substitute = (string) $substitute;
400
401
	// We never want non-whitespace control characters
402
	$disallowed[] = '[^\P{Cc}\t\r\n]';
0 ignored issues
show
Comprehensibility Best Practice introduced by
$disallowed was never initialized. Although not strictly required by PHP, it is generally a good practice to add $disallowed = array(); before regardless.
Loading history...
403
404
	// We never want private use characters or non-characters.
405
	$disallowed[] = '[\p{Co}\p{Cn}]';
406
407
	// Several more things we never want:
408
	$disallowed[] = '[' . implode('', array(
409
		// Soft Hyphen.
410
		'\x{AD}',
411
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
412
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
413
		// purpose and should be considered errors in the encoding."
414
		'\x{17B4}-\x{17B5}',
415
		// Invisible math characters.
416
		'\x{2061}-\x{2064}',
417
		// Deprecated formatting characters.
418
		'\x{206A}-\x{206F}',
419
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
420
		'\x{FEFF}',
421
		// Annotation characters and Object Replacement Character.
422
		'\x{FFF9}-\x{FFFC}',
423
	)) . ']';
424
425
	switch ($level)
426
	{
427
		case 2:
428
			$disallowed[] = '[' . implode('', array(
429
				// Combining Grapheme Character.
430
				'\x{34F}',
431
				// Zero Width Non-Joiner.
432
				'\x{200C}',
433
				// Zero Width Joiner.
434
				'\x{200D}',
435
				// All variation selectors.
436
				'\x{180B}-\x{180D}\x{180F}\x{FE00}-\x{FE0F}\x{E0100}-\x{E01EF}',
437
				// Tag characters.
438
				'\x{E0000}-\x{E007F}',
439
			)) . ']';
440
441
			// no break
442
443
		case 1:
444
			$disallowed[] = '[' . implode('', array(
445
				// Zero Width Space.
446
				'\x{200B}',
447
				// Word Joiner.
448
				'\x{2060}',
449
				// "Bidi_Control" characters.
450
				// Disallowing means that all characters will behave according
451
				// to their default bidirectional text properties.
452
				'\x{61C}\x{200E}\x{200F}\x{202A}-\x{202E}\x{2066}-\x{2069}',
453
				// Hangul filler characters.
454
				// Used as placeholders in incomplete ideographs.
455
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
456
				// Shorthand formatting characters.
457
				'\x{1BCA0}-\x{1BCA3}',
458
				// Musical formatting characters.
459
				'\x{1D173}-\x{1D17A}',
460
			)) . ']';
461
462
			break;
463
464
		default:
465
			// Zero Width Space only allowed in certain scripts.
466
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';
467
468
			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
469
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';
470
471
			// Hangul Choseong Filler and Hangul Jungseong Filler must be followed
472
			// by more Hangul Jamo characters.
473
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';
474
475
			// Hangul Filler for Hangul compatibility chars.
476
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';
477
478
			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
479
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';
480
481
			// Shorthand formatting characters only with other shorthand chars.
482
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
483
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';
484
485
			// Musical formatting characters only with other musical chars.
486
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
487
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';
488
489
			break;
490
	}
491
492
	if ($level < 2)
493
	{
494
		/*
495
			Combining Grapheme Character has two uses: to override standard
496
			search and collation behaviours, which we never want to allow, and
497
			to ensure correct behaviour of combining marks in a few exceptional
498
			cases, which is legitimate and should be allowed. This means we can
499
			simply test whether it is followed by a combining mark in order to
500
			determine whether to allow it.
501
		*/
502
		$disallowed[] = '\x{34F}(?!\p{M})';
503
504
		// Tag characters not allowed inside words.
505
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
506
507
		// Mongolian Free Variation Selectors.
508
		$disallowed[] = '(?<!\p{Mongolian})[\x{180B}-\x{180D}\x{180F}]';
509
	}
510
511
	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);
512
513
	/*
514
		Past this point, we need to use mb_ereg* functions because they support
515
		character class intersection and more Unicode properties than the preg*
516
		functions do.
517
	*/
518
	if (!function_exists('mb_ereg_replace_callback') || !preg_match('/[\x{200C}\x{200D}\x{202A}-\x{202E}\x{2066}-\x{2069}\x{FE00}-\x{FE0F}\x{E0100}-\x{E01EF}]/u', $string))
519
		return $string;
520
521
	mb_regex_encoding('UTF-8');
522
523
	// String must be in Normalization Form C for the following checks to work.
524
	$string = utf8_normalize_c($string);
525
526
	$placeholders = array();
527
528
	/*
529
		Use placeholders to preserve known emoji from further processing.
530
531
		This only matches emoji known to the installed version of mbstring,
532
		so we still need to account for new possible emoji further down, but
533
		this will cover the majority.
534
535
		Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
536
	*/
537
	$string  = mb_ereg_replace_callback(
538
		'\p{Regional_Indicator}\p{Regional_Indicator}' .
539
		'|' .
540
		'\p{Emoji}' .
541
		'(' .
542
			'\p{Emoji_Modifier}' .
543
			'|' .
544
			'\x{FE0F}\x{20E3}?' .
545
			'|' .
546
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
547
		')?' .
548
		'(' .
549
			'\x{200D}\p{Emoji}' .
550
			'(' .
551
				'\p{Emoji_Modifier}' .
552
				'|' .
553
				'\x{FE0F}\x{20E3}?' .
554
				'|' .
555
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
556
			')?' .
557
		')*',
558
		function ($matches) use (&$placeholders)
559
		{
560
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
561
			// This can happen because the digits 0-9 and the '*' and '#' characters are
562
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
563
			if (strlen($matches[0]) === 1)
564
				return $matches[0];
565
566
			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
567
			return $placeholders[$matches[0]];
568
		},
569
		$string
570
	);
571
572
	// Get rid of any unsanctioned variation selectors.
573
	if (mb_ereg('[\x{FE00}-\x{FE0F}\x{E0100}-\x{E01EF}]', $string))
574
	{
575
		/*
576
			Unicode gives pre-defined lists of sanctioned variation sequences
577
			and says any use of variation selectors outside those sequences is
578
			unsanctioned. However, those lists will continue to grow over time.
579
			Therefore, the regex patterns below are more permissive than
580
			Unicode itself, making reasonable guesses about the types of
581
			characters that are likely to be used as base characters for new
582
			variation sequences in the future.
583
		*/
584
585
		// Base characters that take variation selectors 1 - 16
586
		$variation_base_chars_low = implode('', array(
587
			// Symbols.
588
			'\p{S}',
589
			// CJK Symbols and Punctuation.
590
			'\x{3000}-\x{303F}',
591
			// CJK Unified Ideographs.
592
			'\x{3400}-\x{4DBF}',
593
			'\x{4E00}-\x{9FFF}',
594
			'\x{20000}-\x{2A6DF}',
595
			// Halfwidth and Fullwidth Forms.
596
			'\x{FF01}-\x{FFEE}',
597
			// Multiple characters in these scripts can have variations.
598
			'\p{Myanmar}',
599
			'\p{Phags_Pa}',
600
			'\p{Manichaean}',
601
		));
602
		$string = mb_ereg_replace('[^' . $variation_base_chars_low . ']\K[\x{FE00}-\x{FE0F}]', $substitute, $string);
603
604
		// For variation selectors 17 - 256, things are simpler.
605
		$string = mb_ereg_replace('\P{Ideographic}\K[\x{E0100}-\x{E01EF}]', $substitute, $string);
606
	}
607
608
	// Join controls are only allowed inside words in special circumstances.
609
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
610
	if (mb_ereg('[\x{200C}\x{200D}]', $string))
611
	{
612
		// Zero Width Non-Joiner (U+200C)
613
		$zwnj = "\xE2\x80\x8C";
614
		// Zero Width Joiner (U+200D)
615
		$zwj = "\xE2\x80\x8D";
616
617
		$placeholders[$zwnj] = "\xEE\x80\x8C";
618
		$placeholders[$zwj] = "\xEE\x80\x8C";
619
620
		// When not in strict mode, allow ZWJ at word boundaries.
621
		if ($level === 0)
622
			$string = mb_ereg_replace('\b\x{200D}|\x{200D}\b', $placeholders[$zwj], $string);
623
624
		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
625
		$script_tests = array(
626
			// For these scripts, use test A1 for allowing ZWNJ
627
			// https://unicode.org/reports/tr31/#A1
628
			// Character class lists compiled from:
629
			// https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt
630
			'Arabic' => array(
631
				'dual_joining' => '\x{0620}\x{0626}\x{0628}\x{062A}-\x{062E}\x{0633}-\x{063F}\x{0641}-\x{0647}\x{0649}-\x{064A}\x{066E}-\x{066F}\x{0678}-\x{0687}\x{069A}-\x{06BF}\x{06C1}-\x{06C2}\x{06CC}\x{06CE}\x{06D0}-\x{06D1}\x{06FA}-\x{06FC}\x{06FF}\x{074E}-\x{0758}\x{075C}-\x{076A}\x{076D}-\x{0770}\x{0772}\x{0775}-\x{0777}\x{077A}-\x{077F}\x{08A0}-\x{08A9}\x{08AF}-\x{08B0}\x{08B3}-\x{08B8}\x{08BA}-\x{08C8}',
632
				'right_joining' => '\x{0622}-\x{0625}\x{0627}\x{0629}\x{062F}-\x{0632}\x{0648}\x{0671}-\x{0673}\x{0675}-\x{0677}\x{0688}-\x{0699}\x{06C0}\x{06C3}-\x{06CB}\x{06CD}\x{06CF}\x{06D2}-\x{06D3}\x{06D5}\x{06EE}-\x{06EF}\x{0759}-\x{075B}\x{076B}-\x{076C}\x{0771}\x{0773}-\x{0774}\x{0778}-\x{0779}\x{08AA}-\x{08AC}\x{08AE}\x{08B1}-\x{08B2}\x{08B9}',
633
				'transparent_joining' => '\x{0610}-\x{061A}\x{061C}\x{064B}-\x{065F}\x{06D6}-\x{06DC}\x{06DF}-\x{06E4}\x{06E7}-\x{06E8}\x{06EA}-\x{06ED}\x{08CA}-\x{08E1}\x{08E3}-\x{0902}',
634
			),
635
			'Syriac' => array(
636
				'dual_joining' => '\x{0712}-\x{0714}\x{071A}-\x{071D}\x{071F}-\x{0727}\x{0729}\x{072B}\x{072D}-\x{072E}\x{0860}\x{0862}-\x{0865}\x{0868}',
637
				'right_joining' => '\x{0710}\x{0715}-\x{0719}\x{071E}\x{0728}\x{072A}\x{072C}\x{072F}\x{074D}\x{0867}\x{0869}-\x{086A}',
638
				'transparent_joining' => '\x{070F}\x{0711}\x{0730}-\x{074A}',
639
			),
640
			'Mongolian' => array(
641
				'dual_joining' => '\x{1807}\x{1820}-\x{1878}\x{1887}-\x{18A8}\x{18AA}',
642
				'transparent_joining' => '\x{180B}-\x{180D}\x{1885}-\x{1886}\x{18A9}',
643
			),
644
			'Nko' => array(
645
				'dual_joining' => '\x{07CA}-\x{07EA}',
646
				'transparent_joining' => '\x{07EB}-\x{07F3}\x{07FD}',
647
			),
648
			'Mandaic' => array(
649
				'dual_joining' => '\x{0841}-\x{0845}\x{0848}\x{084A}-\x{0853}\x{0855}',
650
				'right_joining' => '\x{0840}\x{0846}-\x{0847}\x{0849}\x{0854}\x{0856}-\x{0858}',
651
				'transparent_joining' => '\x{0859}-\x{085B}',
652
			),
653
			'Manichaean' => array(
654
				'dual_joining' => '\x{10AC0}-\x{10AC4}\x{10AD3}-\x{10AD6}\x{10AD8}-\x{10ADC}\x{10ADE}-\x{10AE0}\x{10AEB}-\x{10AEE}',
655
				'right_joining' => '\x{10AC5}\x{10AC7}\x{10AC9}-\x{10ACA}\x{10ACE}-\x{10AD2}\x{10ADD}\x{10AE1}\x{10AE4}\x{10AEF}',
656
				'left_joining' => '\x{10ACD}\x{10AD7}',
657
				'transparent_joining' => '\x{10AE5}-\x{10AE6}',
658
			),
659
			'Psalter_Pahlavi' => array(
660
				'dual_joining' => '\x{10B80}\x{10B82}\x{10B86}-\x{10B88}\x{10B8A}-\x{10B8B}\x{10B8D}\x{10B90}\x{10BAD}-\x{10BAE}',
661
				'right_joining' => '\x{10B81}\x{10B83}-\x{10B85}\x{10B89}\x{10B8C}\x{10B8E}-\x{10B8F}\x{10B91}\x{10BA9}-\x{10BAC}',
662
			),
663
			'Hanifi_Rohingya' => array(
664
				'dual_joining' => '\x{10D01}-\x{10D21}\x{10D23}',
665
				'right_joining' => '\x{10D22}',
666
				'left_joining' => '\x{10D00}',
667
				'transparent_joining' => '\x{10D24}-\x{10D27}',
668
			),
669
			'Sogdian' => array(
670
				'dual_joining' => '\x{10F30}-\x{10F32}\x{10F34}-\x{10F44}\x{10F51}-\x{10F53}',
671
				'right_joining' => '\x{10F33}\x{10F54}',
672
				'transparent_joining' => '\x{10F46}-\x{10F50}',
673
			),
674
			'Chorasmian' => array(
675
				'dual_joining' => '\x{0FB0}\x{0FB2)-\x{10FB3}\x{0FB8}\x{0FBB)-\x{10FBC}\x{0FBE)-\x{10FBF}\x{0FC1}\x{0FC4}\x{0FCA}',
676
				'right_joining' => '\x{0FB4)-\x{10FB6}\x{0FB9)-\x{10FBA}\x{0FBD}\x{0FC2)-\x{10FC3}\x{0FC9}',
677
				'left_joining' => '\x{0FCB}',
678
			),
679
			'Adlam' => array(
680
				'dual_joining' => '\x{1E900}-\x{1E943}',
681
				'transparent_joining' => '\x{1E944}-\x{1E94B}',
682
			),
683
684
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ
685
			// https://unicode.org/reports/tr31/#A2
686
			// https://unicode.org/reports/tr31/#B
687
			// Character class lists compiled from:
688
			// https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
689
			// https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt
690
			'Devanagari' => array(
691
				'viramas' => '\x{094D}',
692
				'vowel_dependents' => '\x{093A}-\x{093B}\x{093E}-\x{094C}\x{094E}-\x{094F}\x{0955}-\x{0957}\x{0962}-\x{0963}\x{A8FF}',
693
			),
694
			'Bengali' => array(
695
				'viramas' => '\x{09CD}',
696
				'vowel_dependents' => '\x{09BE}-\x{09C4}\x{09C7}-\x{09C8}\x{09CB}-\x{09CC}\x{09D7}\x{09E2}-\x{09E3}',
697
			),
698
			'Gurmukhi' => array(
699
				'viramas' => '\x{0A4D}',
700
				'vowel_dependents' => '\x{0A3E}-\x{0A42}\x{0A47}-\x{0A48}\x{0A4B}-\x{0A4C}',
701
			),
702
			'Gujarati' => array(
703
				'viramas' => '\x{0ACD}',
704
				'vowel_dependents' => '\x{0ABE}-\x{0AC5}\x{0AC7}-\x{0AC9}\x{0ACB}-\x{0ACC}\x{0AE2}-\x{0AE3}',
705
			),
706
			'Oriya' => array(
707
				'viramas' => '\x{0B4D}',
708
				'vowel_dependents' => '\x{0B3E}-\x{0B44}\x{0B47}-\x{0B48}\x{0B4B}-\x{0B4C}\x{0B55}-\x{0B57}\x{0B62}-\x{0B63}',
709
			),
710
			'Tamil' => array(
711
				'viramas' => '\x{0BCD}',
712
				'vowel_dependents' => '\x{0BBE}-\x{0BC2}\x{0BC6}-\x{0BC8}\x{0BCA}-\x{0BCC}\x{0BD7}',
713
			),
714
			'Telugu' => array(
715
				'viramas' => '\x{0C4D}',
716
				'vowel_dependents' => '\x{0C3E}-\x{0C44}\x{0C46}-\x{0C48}\x{0C4A}-\x{0C4C}\x{0C55}-\x{0C56}\x{0C62}-\x{0C63}',
717
			),
718
			'Kannada' => array(
719
				'viramas' => '\x{0CCD}',
720
				'vowel_dependents' => '\x{0CBE}-\x{0CC4}\x{0CC6}-\x{0CC8}\x{0CCA}-\x{0CCC}\x{0CD5}-\x{0CD6}\x{0CE2}-\x{0CE3}',
721
			),
722
			'Malayalam' => array(
723
				'viramas' => '\x{0D4D}',
724
				'vowel_dependents' => '\x{0D3E}-\x{0D44}\x{0D46}-\x{0D48}\x{0D4A}-\x{0D4C}\x{0D57}\x{0D62}-\x{0D63}',
725
			),
726
			'Sinhala' => array(
727
				'viramas' => '\x{0DCA}',
728
				'vowel_dependents' => '\x{0DCF}-\x{0DD4}\x{0DD6}\x{0DD8}-\x{0DDF}\x{0DF2}-\x{0DF3}',
729
			),
730
			'Thai' => array(
731
				'viramas' => '\x{0E3A}',
732
				'vowel_dependents' => '\x{0E30}\x{0E40}\x{0E47}',
733
			),
734
			'Lao' => array(
735
				'viramas' => '\x{0EBA}',
736
				'vowel_dependents' => '\x{0EB0}-\x{0EB9}\x{0EBB}\x{0EC0}-\x{0EC4}',
737
			),
738
			'Tibetan' => array(
739
				'viramas' => '\x{0F84}',
740
				'vowel_dependents' => '\x{0F71}-\x{0F7D}\x{0F80}-\x{0F81}',
741
			),
742
			'Myanmar' => array(
743
				'viramas' => '\x{1039}-\x{103A}',
744
				'vowel_dependents' => '\x{102B}-\x{1035}\x{1056}-\x{1059}\x{1062}\x{1067}-\x{1068}\x{1071}-\x{1074}\x{1083}-\x{1086}\x{109C}-\x{109D}\x{A9E5}',
745
			),
746
			'Tagalog' => array(
747
				'viramas' => '\x{1714}-\x{1715}',
748
				'vowel_dependents' => '\x{1712}-\x{1713}',
749
			),
750
			'Hanunoo' => array(
751
				'viramas' => '\x{1734}',
752
				'vowel_dependents' => '\x{1732}-\x{1733}',
753
			),
754
			'Khmer' => array(
755
				'viramas' => '\x{17D2}',
756
				'vowel_dependents' => '\x{17B6}-\x{17C5}\x{17C8}',
757
			),
758
			'Tai_Tham' => array(
759
				'viramas' => '\x{1A60}',
760
				'vowel_dependents' => '\x{1A61}-\x{1A73}',
761
			),
762
			'Balinese' => array(
763
				'viramas' => '\x{1B44}',
764
				'vowel_dependents' => '\x{1B35}-\x{1B43}',
765
			),
766
			'Sundanese' => array(
767
				'viramas' => '\x{1BAA}-\x{1BAB}',
768
				'vowel_dependents' => '\x{1BA4}-\x{1BA9}',
769
			),
770
			'Batak' => array(
771
				'viramas' => '\x{1BF2}-\x{1BF3}',
772
				'vowel_dependents' => '\x{1BE7}-\x{1BEF}',
773
			),
774
			'Tifinagh' => array(
775
				'viramas' => '\x{2D7F}',
776
				'vowel_dependents' => '',
777
			),
778
			'Syloti_Nagri' => array(
779
				'viramas' => '\x{A806}-\x{A82C}',
780
				'vowel_dependents' => '\x{A802}\x{A823}-\x{A827}',
781
			),
782
			'Saurashtra' => array(
783
				'viramas' => '\x{A8C4}',
784
				'vowel_dependents' => '\x{A8B5}-\x{A8C3}',
785
			),
786
			'Rejang' => array(
787
				'viramas' => '\x{A953}',
788
				'vowel_dependents' => '\x{A947}-\x{A94E}',
789
			),
790
			'Javanese' => array(
791
				'viramas' => '\x{A9C0}',
792
				'vowel_dependents' => '\x{A9B4}-\x{A9BC}',
793
			),
794
			'Meetei_Mayek' => array(
795
				'viramas' => '\x{AAF6}-\x{ABED}',
796
				'vowel_dependents' => '\x{AAEB}-\x{AAEF}\x{ABE3}-\x{ABEA}',
797
			),
798
			'Kharoshthi' => array(
799
				'viramas' => '\x{10A3F}',
800
				'vowel_dependents' => '\x{10A01}-\x{10A03}\x{10A05}-\x{10A06}\x{10A0C}-\x{10A0D}',
801
			),
802
			'Brahmi' => array(
803
				'viramas' => '\x{11046}\x{11070}\x{1107F}',
804
				'vowel_dependents' => '\x{11038}-\x{11045}',
805
			),
806
			'Kaithi' => array(
807
				'viramas' => '\x{110B9}',
808
				'vowel_dependents' => '\x{110B0}-\x{110B8}',
809
			),
810
			'Chakma' => array(
811
				'viramas' => '\x{11133}-\x{11134}',
812
				'vowel_dependents' => '\x{11127}-\x{11132}\x{11145}-\x{11146}',
813
			),
814
			'Sharada' => array(
815
				'viramas' => '\x{111C0}',
816
				'vowel_dependents' => '\x{111B3}-\x{111BF}\x{111CB}-\x{111CC}',
817
			),
818
			'Khojki' => array(
819
				'viramas' => '\x{11235}',
820
				'vowel_dependents' => '\x{1122C}-\x{11233}',
821
			),
822
			'Khudawadi' => array(
823
				'viramas' => '\x{112EA}',
824
				'vowel_dependents' => '\x{112E0}-\x{112E8}',
825
			),
826
			'Grantha' => array(
827
				'viramas' => '\x{1134D}',
828
				'vowel_dependents' => '\x{1133E}-\x{11344}\x{11347}-\x{11348}\x{1134B}-\x{1134C}\x{11357}\x{11362}-\x{11363}',
829
			),
830
			'Newa' => array(
831
				'viramas' => '\x{11442}',
832
				'vowel_dependents' => '\x{11435}-\x{11441}',
833
			),
834
			'Tirhuta' => array(
835
				'viramas' => '\x{114C2}',
836
				'vowel_dependents' => '\x{114B0}-\x{114BE}',
837
			),
838
			'Siddham' => array(
839
				'viramas' => '\x{115BF}',
840
				'vowel_dependents' => '\x{115AF}-\x{115B5}\x{115B8}-\x{115BB}\x{115DC}-\x{115DD}',
841
			),
842
			'Modi' => array(
843
				'viramas' => '\x{1163F}',
844
				'vowel_dependents' => '\x{11630}-\x{1163C}\x{11640}',
845
			),
846
			'Takri' => array(
847
				'viramas' => '\x{116B6}',
848
				'vowel_dependents' => '\x{116AD}-\x{116B5}',
849
			),
850
			'Ahom' => array(
851
				'viramas' => '\x{1172B}',
852
				'vowel_dependents' => '\x{11720}-\x{1172A}',
853
			),
854
			'Dogra' => array(
855
				'viramas' => '\x{11839}',
856
				'vowel_dependents' => '\x{1182C}-\x{11836}',
857
			),
858
			'Nandinagari' => array(
859
				'viramas' => '\x{119E0}',
860
				'vowel_dependents' => '\x{119D1}-\x{119D7}\x{119DA}-\x{119DD}\x{119E4}',
861
			),
862
			'Zanabazar_Square' => array(
863
				'viramas' => '\x{11A34}\x{11A47}',
864
				'vowel_dependents' => '\x{11A01}-\x{11A0A}',
865
			),
866
			'Soyombo' => array(
867
				'viramas' => '\x{11A99}',
868
				'vowel_dependents' => '\x{11A51}-\x{11A5B}',
869
			),
870
			'Bhaiksuki' => array(
871
				'viramas' => '\x{11C3F}',
872
				'vowel_dependents' => '\x{11C2F}-\x{11C36}\x{11C38}-\x{11C3B}',
873
			),
874
			'Masaram_Gondi' => array(
875
				'viramas' => '\x{11D44}-\x{11D45}',
876
				'vowel_dependents' => '\x{11D31}-\x{11D36}\x{11D3A}\x{11D3C}-\x{11D3D}\x{11D3F}\x{11D43}',
877
			),
878
			'Gunjala_Gondi' => array(
879
				'viramas' => '\x{11D97}',
880
				'vowel_dependents' => '\x{11D8A}-\x{11D8E}\x{11D90}-\x{11D91}\x{11D93}-\x{11D94}',
881
			),
882
		);
883
884
		$all_combining_marks = '[' . implode('', array_keys(utf8_combining_classes())) . ']';
885
886
		foreach ($script_tests as $script => $chars)
887
		{
888
			// https://unicode.org/reports/tr31/#A1
889
			if (empty($chars['viramas']))
890
			{
891
				$lj = !empty($chars['left_joining']) ? $chars['left_joining'] : '';
892
				$rj = !empty($chars['right_joining']) ? $chars['right_joining'] : '';
893
				$t = !empty($chars['transparent_joining']) ? '[' . $chars['transparent_joining'] . ']*' : '';
894
895
				if (!empty($chars['dual_joining']))
896
				{
897
					$lj .= $chars['dual_joining'];
898
					$rj .= $chars['dual_joining'];
899
				}
900
901
				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
902
			}
903
			// https://unicode.org/reports/tr31/#A2
904
			// https://unicode.org/reports/tr31/#B
905
			else
906
			{
907
				// Characters used in this script.
908
				$used_in_script = '[\p{' . $script . '}\p{Common}\p{Inherited}]';
909
910
				// A letter that is part of this particular script.
911
				$letter = '[\p{L}&&\p{' . $script . '}]';
912
913
				// Zero or more non-spacing marks used in this script.
914
				$nonspacing_marks = '[\p{Mn}&&' . $used_in_script . ']*';
915
916
				// Zero or more non-spacing combining marks used in this script.
917
				$nonspacing_combining_marks = '[\p{Mn}&&' . $used_in_script . '&&' . $all_combining_marks . ']*';
918
919
				// ZWNJ must be followed by another letter in the same script.
920
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';
921
922
				// ZWJ must NOT be followed by a vowel dependent character in this
923
				// script or by any character from a different script.
924
				$zwj_pattern = '\x{200D}(?!' . (!empty($chars['vowel_dependents']) ? '[' . $chars['vowel_dependents'] . ']|' : '') . '\P{' . $script . '}})';
925
926
				// Now build the pattern for this script.
927
				$pattern = $letter . $nonspacing_marks . '[' . $chars['viramas'] . ']' . $nonspacing_combining_marks . '\K' . (!empty($zwj_pattern) ? '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')' : $zwnj_pattern);
928
			}
929
930
			// Apply this script's pattern, translating each match through $placeholders.
931
			$temp = @mb_ereg_replace_callback(
932
				$pattern,
933
				function ($matches) use ($placeholders)
934
				{
935
					return strtr($matches[0], $placeholders);
936
				},
937
				$string
938
			);
939
940
			// False means the installed version of mbstring lacks support for this script.
941
			if ($temp !== false)
942
				$string = $temp;
943
944
			// Stop testing further scripts once no ZWNJ or ZWJ remains in the string.
945
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
946
				break;
947
		}
948
949
		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
950
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
951
	}
952
953
	// Revert placeholders back to original characters.
954
	$string = strtr($string, array_flip($placeholders));
955
956
957
	return $string;
958
}
959
960
?>