Passed
Push — release-2.1 ( ab1855...12d126 )
by Mathias
07:51 queued 13s
created

Sources/Subs-Charset.php (2 issues)

Labels
Severity
1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2021 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1 RC4
12
 */
13
14
// Refuse to run when this file is requested directly rather than loaded by SMF.
if (!defined('SMF'))
	die('No direct access...');
16
17
/**
 * Converts the given UTF-8 string into lowercase.
 * Equivalent to mb_strtolower($string, 'UTF-8'), except that we can keep the
 * output consistent across PHP versions and up to date with the latest version
 * of Unicode.
 *
 * @param string $string The string
 * @return string|false The lowercase version of $string, or false if $string
 *    could not be split into UTF-8 characters (e.g. it is not valid UTF-8).
 */
function utf8_strtolower($string)
{
	global $sourcedir;

	$string = (string) $string;

	// One array element per UTF-8 character. The 'u' modifier makes
	// preg_split() fail (return false) on malformed UTF-8.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseLower.php');

	$substitutions = utf8_strtolower_maps();

	// Index-based replacement, so no reference is left dangling afterwards.
	foreach ($chars as $key => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$key] = $substitutions[$char];
	}

	return implode('', $chars);
}
46
47
/**
 * Convert the given UTF-8 string to uppercase.
 * Equivalent to mb_strtoupper($string, 'UTF-8'), except that we can keep the
 * output consistent across PHP versions and up to date with the latest version
 * of Unicode.
 *
 * @param string $string The string
 * @return string|false The uppercase version of $string, or false if $string
 *    could not be split into UTF-8 characters (e.g. it is not valid UTF-8).
 */
function utf8_strtoupper($string)
{
	global $sourcedir;

	$string = (string) $string;

	// One array element per UTF-8 character. The 'u' modifier makes
	// preg_split() fail (return false) on malformed UTF-8.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseUpper.php');

	$substitutions = utf8_strtoupper_maps();

	// Index-based replacement, so no reference is left dangling afterwards.
	foreach ($chars as $key => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$key] = $substitutions[$char];
	}

	return implode('', $chars);
}
76
77
/**
 * Casefolds the given UTF-8 string.
 * Equivalent to mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), except that
 * we can keep the output consistent across PHP versions and up to date with
 * the latest version of Unicode.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if $string
 *    could not be split into UTF-8 characters (e.g. it is not valid UTF-8).
 */
function utf8_casefold($string)
{
	global $sourcedir;

	$string = (string) $string;

	// One array element per UTF-8 character. The 'u' modifier makes
	// preg_split() fail (return false) on malformed UTF-8.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	require_once($sourcedir . '/Unicode/CaseFold.php');

	$substitutions = utf8_casefold_maps();

	// Index-based replacement, so no reference is left dangling afterwards.
	foreach ($chars as $key => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$key] = $substitutions[$char];
	}

	return implode('', $chars);
}
106
107
/**
 * Normalizes UTF-8 via Canonical Decomposition.
 *
 * Uses the intl extension's normalizer functions when they are callable and
 * falls back to utf8_decompose() otherwise.
 *
 * @param string $string A UTF-8 string
 * @return string|false The decomposed version of $string, or false if the
 *    string could not be normalized (e.g. it is not valid UTF-8).
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	// Nothing to do if the string is already in the requested form.
	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_D))
		return $string;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_D);

	// No native support available, so do it ourselves, one character at a time.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	return implode('', utf8_decompose($chars, false));
}
130
131
/**
 * Normalizes UTF-8 via Compatibility Decomposition.
 *
 * Uses the intl extension's normalizer functions when they are callable and
 * falls back to utf8_decompose() otherwise.
 *
 * @param string $string A UTF-8 string.
 * @return string|false The decomposed version of $string, or false if the
 *    string could not be normalized (e.g. it is not valid UTF-8).
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	// Nothing to do if the string is already in the requested form.
	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KD))
		return $string;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KD);

	// No native support available, so do it ourselves, one character at a time.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	return implode('', utf8_decompose($chars, true));
}
154
155
/**
 * Normalizes UTF-8 via Canonical Decomposition then Canonical Composition.
 *
 * Uses the intl extension's normalizer functions when they are callable and
 * falls back to utf8_decompose() and utf8_compose() otherwise.
 *
 * @param string $string A UTF-8 string
 * @return string|false The composed version of $string, or false if the
 *    string could not be normalized (e.g. it is not valid UTF-8).
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	// Nothing to do if the string is already in the requested form.
	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_C))
		return $string;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_C);

	// No native support available, so do it ourselves, one character at a time.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	return implode('', utf8_compose(utf8_decompose($chars, false)));
}
178
179
/**
 * Normalizes UTF-8 via Compatibility Decomposition then Canonical Composition.
 *
 * Uses the intl extension's normalizer functions when they are callable and
 * falls back to utf8_decompose() and utf8_compose() otherwise.
 *
 * @param string $string The string
 * @return string|false The composed version of $string, or false if the
 *    string could not be normalized (e.g. it is not valid UTF-8).
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	// Nothing to do if the string is already in the requested form.
	if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KC))
		return $string;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($string, Normalizer::FORM_KC);

	// No native support available, so do it ourselves, one character at a time.
	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	return implode('', utf8_compose(utf8_decompose($chars, true)));
}
202
203
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each character is then either
 * casefolded or (if it is a default ignorable) removed, and the result is
 * recomposed.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if $string
 *    could not be split into UTF-8 characters (e.g. it is not valid UTF-8).
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$string = (string) $string;

	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	$chars = utf8_decompose($chars, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$substitutions = utf8_casefold_maps();

	// Flip so that membership can be tested with isset().
	$ignorables = array_flip(utf8_default_ignorables());

	// Index-based replacement, so that no reference into $chars is still alive
	// when the array is handed to utf8_compose() below.
	foreach ($chars as $key => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$key] = $substitutions[$char];

		elseif (isset($ignorables[$char]))
			$chars[$key] = '';
	}

	return implode('', utf8_compose($chars));
}
240
241
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form and then sorts runs of
 * combining characters into canonical order.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, apply the compatibility decomposition
 *    mappings before the canonical ones. Default: false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as $key => $char)
		{
			if (isset($substitutions[$char]))
				$chars[$key] = $substitutions[$char];
		}
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// The divisions rarely come out even, so truncate explicitly.
			// mb_chr() needs an integer code point, and implicitly converting
			// a fractional float to int is deprecated as of PHP 8.1.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Only two adjacent characters that both have a non-zero combining
		// class can ever be reordered.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
312
313
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Characters that get merged into a preceding character are replaced with
 * null in the returned array rather than removed, so the array keeps its
 * original length. implode() turns those entries into empty strings.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// The isset() check avoids reading past the end of the array when a
		// leading consonant is the last character.
		if (isset($chars[$c + 1]) && $chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// An optional trailing consonant. It is absent whenever the
			// syllable ends the string, hence the isset() check.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Put the syllable where the leading consonant was and blank out
			// the parts that were consumed.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
391
392
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 *      Values outside this range are clamped to it.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Regex alternatives for everything that gets replaced with $substitute.
	$disallowed = array();

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		'\x{17B4}-\x{17B5}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Character.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// no break

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be
			// followed by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Character has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps each protected sequence to the placeholder that stands in for it.
	$placeholders = array();

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?' .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			'(' .
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
				'|' .
				'\x{FE0F}\x{20E3}?' .
				'|' .
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
			')?' .
		')*' .
		'/u',
		function ($matches) use (&$placeholders)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
			return $placeholders[$matches[0]];
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = preg_replace_callback(
			$patterns,
			function ($matches) use (&$placeholders)
			{
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
				return $placeholders[$matches[0]];
			},
			$string
		);

		// Remove any unsanctioned variation selectors.
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script.
				$pattern = $letter . $nonspacing_marks . '[' . $classes['viramas'] . ']' . $nonspacing_combining_marks . '\K' . (!empty($zwj_pattern) ? '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')' : $zwnj_pattern);
			}

			// Do the thing.
			// Join controls inside a match are swapped for their placeholders
			// so that the cleanup after this loop leaves them alone.
			$string = preg_replace_callback(
				'/' . $pattern . '/u',
				function ($matches) use ($placeholders)
				{
					return strtr($matches[0], $placeholders);
				},
				$string
			);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
708
709
?>