Issues (1014)

Sources/Subs-Charset.php (8 issues)

1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2022 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1.0
12
 */
13
14
// This file only makes sense when loaded by SMF itself.
if (defined('SMF') === false)
{
	die('No direct access...');
}
16
17
/**
 * Lowercases a UTF-8 string using the mapping from utf8_strtolower_maps().
 *
 * Works like mb_strtolower($string, 'UTF-8'), but because the mapping data
 * is our own, the output stays consistent across PHP versions and can be
 * kept up to date with the latest version of Unicode.
 *
 * @param string $string The string
 * @return string|false The lowercase version of $string, or false if
 *    $string could not be split into UTF-8 characters.
 */
function utf8_strtolower($string)
{
	global $sourcedir;

	// One array element per UTF-8 character.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseLower.php');

	$lowercase_of = utf8_strtolower_maps();
	$result = '';

	// Characters without a mapping are copied over untouched.
	foreach ($characters as $character)
	{
		if (isset($lowercase_of[$character]))
			$result .= $lowercase_of[$character];
		else
			$result .= $character;
	}

	return $result;
}
46
47
/**
 * Uppercases a UTF-8 string using the mapping from utf8_strtoupper_maps().
 *
 * Works like mb_strtoupper($string, 'UTF-8'), but because the mapping data
 * is our own, the output stays consistent across PHP versions and can be
 * kept up to date with the latest version of Unicode.
 *
 * @param string $string The string
 * @return string|false The uppercase version of $string, or false if
 *    $string could not be split into UTF-8 characters.
 */
function utf8_strtoupper($string)
{
	global $sourcedir;

	// One array element per UTF-8 character.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseUpper.php');

	$uppercase_of = utf8_strtoupper_maps();
	$result = '';

	// Characters without a mapping are copied over untouched.
	foreach ($characters as $character)
	{
		if (isset($uppercase_of[$character]))
			$result .= $uppercase_of[$character];
		else
			$result .= $character;
	}

	return $result;
}
76
77
/**
 * Casefolds a UTF-8 string using the mapping from utf8_casefold_maps().
 *
 * Works like mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), but because
 * the mapping data is our own, the output stays consistent across PHP
 * versions and can be kept up to date with the latest version of Unicode.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if
 *    $string could not be split into UTF-8 characters.
 */
function utf8_casefold($string)
{
	global $sourcedir;

	// One array element per UTF-8 character.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseFold.php');

	$folded_form_of = utf8_casefold_maps();
	$result = '';

	// Characters without a mapping are copied over untouched.
	foreach ($characters as $character)
	{
		if (isset($folded_form_of[$character]))
			$result .= $folded_form_of[$character];
		else
			$result .= $character;
	}

	return $result;
}
106
107
/**
 * Normalizes UTF-8 via Canonical Decomposition.
 *
 * The normalizer_* functions are used when they are callable; otherwise the
 * work is done by utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The decomposed version of $string, or false if
 *    $string could not be processed.
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then there is nothing to do.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_D))
		return $string;

	// Hand the job to the native implementation when there is one.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_D);

	// Otherwise do it ourselves, working on one character per array element.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, false));
}
130
131
/**
 * Normalizes UTF-8 via Compatibility Decomposition.
 *
 * The normalizer_* functions are used when they are callable; otherwise the
 * work is done by utf8_decompose() in compatibility mode.
 *
 * @param string $string A UTF-8 string.
 * @return string|false The decomposed version of $string, or false if
 *    $string could not be processed.
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then there is nothing to do.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_KD))
		return $string;

	// Hand the job to the native implementation when there is one.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KD);

	// Otherwise do it ourselves, working on one character per array element.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, true));
}
154
155
/**
 * Normalizes UTF-8 via Canonical Decomposition then Canonical Composition.
 *
 * The normalizer_* functions are used when they are callable; otherwise the
 * work is done by utf8_decompose() followed by utf8_compose().
 *
 * @param string $string A UTF-8 string
 * @return string|false The composed version of $string, or false if
 *    $string could not be processed.
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then there is nothing to do.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_C))
		return $string;

	// Hand the job to the native implementation when there is one.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_C);

	// Otherwise do it ourselves, working on one character per array element.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, false);

	return implode('', utf8_compose($decomposed));
}
178
179
/**
 * Normalizes UTF-8 via Compatibility Decomposition then Canonical Composition.
 *
 * The normalizer_* functions are used when they are callable; otherwise the
 * work is done by utf8_decompose() in compatibility mode followed by
 * utf8_compose().
 *
 * @param string $string The string
 * @return string|false The composed version of $string, or false if
 *    $string could not be processed.
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	$can_check = is_callable('normalizer_is_normalized');

	// Already in the requested form? Then there is nothing to do.
	if ($can_check && normalizer_is_normalized($string, Normalizer::FORM_KC))
		return $string;

	// Hand the job to the native implementation when there is one.
	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KC);

	// Otherwise do it ourselves, working on one character per array element.
	$characters = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, true);

	return implode('', utf8_compose($decomposed));
}
202
203
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each character is then either
 * casefolded or (if it is a default ignorable) dropped, and the result is
 * recomposed.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if
 *    $string could not be split into UTF-8 characters.
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	// One array element per UTF-8 character.
	$characters = preg_split('/(.)/su', (string) $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$characters = utf8_decompose($characters, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$folded_form_of = utf8_casefold_maps();

	// Flipped so that membership can be tested with isset().
	$is_ignorable = array_flip(utf8_default_ignorables());

	foreach (array_keys($characters) as $position)
	{
		$character = $characters[$position];

		// A casefold mapping takes priority over the ignorable check.
		if (isset($folded_form_of[$character]))
			$characters[$position] = $folded_form_of[$character];

		elseif (isset($is_ignorable[$character]))
			$characters[$position] = '';
	}

	return implode('', utf8_compose($characters));
}
240
241
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form, then sorts adjacent
 * combining characters into canonical order by combining class.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, the compatibility decomposition map
 *    is applied before the canonical one. Default: false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so that $char no longer aliases the last element.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	// Elements are only overwritten in place, so the count is loop invariant.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// The divisions must be truncated to integers explicitly: passing a
			// float with a fractional part to mb_chr() is an implicit lossy
			// conversion, which PHP 8.1+ reports as deprecated.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	// Swapping never changes the number of elements, so count once.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Only two adjacent characters with non-zero combining classes can swap.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			// (The loop increment brings us back to the pair just before this one.)
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
312
313
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Characters that get merged into an earlier character are not removed from
 * the array; their slots are set to null so that the positions of the
 * remaining elements stay stable. Callers are expected to implode() the
 * result.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	// Elements are only overwritten in place, so the count is loop invariant.
	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// The isset() check prevents reading past the end of the array when a
		// leading consonant happens to be the final element.
		if (isset($chars[$c + 1]) && $chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// An optional trailing consonant is folded into the same syllable.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Store the syllable and blank out the parts it was built from.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			// Skips blanked-out slots and marks with a lower combining class.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
391
392
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      Values outside the range 0-2 are clamped to the nearest bound.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Regular expression alternatives for everything that must be replaced.
	$disallowed = array();

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		'\x{17B4}-\x{17B5}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Character.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// no break

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be
			// followed by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Character has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps each protected sequence to the placeholder that stands in for it.
	$placeholders = array();

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?' .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			'(' .
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
				'|' .
				'\x{FE0F}\x{20E3}?' .
				'|' .
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
			')?' .
		')*' .
		'/u',
		function ($matches) use (&$placeholders)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
			return $placeholders[$matches[0]];
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = preg_replace_callback(
			$patterns,
			function ($matches) use (&$placeholders)
			{
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
				return $placeholders[$matches[0]];
			},
			$string
		);

		// Remove any unsanctioned variation selectors.
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		// Swaps the join controls inside an allowed match for their placeholders.
		// $placeholders does not change inside the loop below, so one closure
		// built here serves every iteration.
		$protect_join_controls = function ($matches) use ($placeholders)
		{
			return strtr($matches[0], $placeholders);
		};

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script.
				// ($zwj_pattern is never empty, so both alternatives always apply.)
				$pattern = $letter . $nonspacing_marks . '[' . $classes['viramas'] . ']' . $nonspacing_combining_marks . '\K' . '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')';
			}

			// Do the thing.
			$string = preg_replace_callback('/' . $pattern . '/u', $protect_join_controls, $string);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
708
709
?>