Issues (1065)

Sources/Subs-Charset.php (3 issues)

1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2025 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1.5
12
 */
13
14
// This file must only be loaded from within SMF.
defined('SMF') or die('No direct access...');

// When the Unicode metadata file cannot be included, assume we are running
// with the older Unicode version named below.
if (!@include_once($sourcedir . '/Unicode/Metadata.php'))
{
	define('SMF_UNICODE_VERSION', '14.0.0.0');
}
20
21
/**
 * Lowercases a UTF-8 string.
 *
 * Works like mb_strtolower($string, 'UTF-8'), but goes through
 * utf8_convert_case() so that results stay consistent across PHP versions
 * and up to date with the latest version of Unicode.
 *
 * @param string $string The string to convert.
 * @return string The lowercase version of $string.
 */
function utf8_strtolower($string)
{
	$lowercased = utf8_convert_case($string, 'lower');

	return $lowercased;
}
34
35
/**
 * Uppercases a UTF-8 string.
 *
 * Works like mb_strtoupper($string, 'UTF-8'), but goes through
 * utf8_convert_case() so that results stay consistent across PHP versions
 * and up to date with the latest version of Unicode.
 *
 * @param string $string The string to convert.
 * @return string The uppercase version of $string.
 */
function utf8_strtoupper($string)
{
	$uppercased = utf8_convert_case($string, 'upper');

	return $uppercased;
}
48
49
/**
 * Casefolds the given UTF-8 string.
 * Equivalent to mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), except that
 * we can keep the output consistent across PHP versions and up to date with
 * the latest version of Unicode.
 *
 * @param string $string The string
 * @return string|bool The casefolded version of $string, or false if
 *    utf8_convert_case() could not split $string into characters.
 */
function utf8_casefold($string)
{
	return utf8_convert_case($string, 'fold');
}
62
63
/**
 * Converts the case of the given UTF-8 string.
 *
 * 'upper', 'lower' and 'fold' are applied character by character using the
 * maps loaded from $sourcedir/Unicode. 'title', 'ucfirst' and 'ucwords' use
 * a regular expression to find the first letter of each word (or of the
 * string) and replace it using the uppercase/titlecase maps. Afterwards,
 * conditional casing rules are applied; most of them depend on the first two
 * characters of $txt['lang_locale'].
 *
 * @param string $string The string.
 * @param string $case One of 'upper', 'lower', 'fold', 'title', 'ucfirst', or 'ucwords'.
 * @param bool $simple If true, use simple maps instead of full maps. Default: false.
 * @return string|bool A version of $string converted to the specified case,
 *    or false if $string could not be split into characters.
 */
function utf8_convert_case($string, $case, $simple = false)
{
	global $sourcedir, $txt;

	$simple = !empty($simple);

	$lang = empty($txt['lang_locale']) ? '' : substr($txt['lang_locale'], 0, 2);
	$is_turkic = in_array($lang, array('tr', 'az'), true);

	// The main case conversion logic
	if (in_array($case, array('upper', 'lower', 'fold'), true))
	{
		$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		if ($chars === false)
			return false;

		switch ($case)
		{
			case 'upper':
				require_once($sourcedir . '/Unicode/CaseUpper.php');

				$substitutions = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

				// Turkish & Azeri conditional casing, part 1.
				if ($is_turkic)
					$substitutions['i'] = 'İ';

				break;

			case 'lower':
				require_once($sourcedir . '/Unicode/CaseLower.php');

				$substitutions = $simple ? utf8_strtolower_simple_maps() : utf8_strtolower_maps();

				// Turkish & Azeri conditional casing, part 1.
				if ($is_turkic)
				{
					$substitutions['İ'] = 'i';
					$substitutions['I' . "\xCC\x87"] = 'i';
					$substitutions['I'] = 'ı';
				}

				break;

			case 'fold':
				require_once($sourcedir . '/Unicode/CaseFold.php');

				$substitutions = $simple ? utf8_casefold_simple_maps() : utf8_casefold_maps();

				break;
		}

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so later code cannot write through it by accident.
		unset($char);

		$string = implode('', $chars);
	}
	elseif (in_array($case, array('title', 'ucfirst', 'ucwords'), true))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');
		require_once($sourcedir . '/Unicode/CaseUpper.php');
		require_once($sourcedir . '/Unicode/CaseTitle.php');

		$prop_classes = utf8_regex_properties();

		$upper = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

		// Turkish & Azeri conditional casing, part 1.
		if ($is_turkic)
			$upper['i'] = 'İ';

		$title = array_merge($upper, $simple ? utf8_titlecase_simple_maps() : utf8_titlecase_maps());

		switch ($case)
		{
			case 'title':
				$string = utf8_convert_case($string, 'lower', $simple);
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})/u';
				break;

			case 'ucwords':
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;

			case 'ucfirst':
				$regex = '/^[^\w' . $prop_classes['Case_Ignorable'] . ']*\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;
		}

		$string = preg_replace_callback(
			$regex,
			function($matches) use ($upper, $title)
			{
				// If second letter is uppercase, use uppercase for first letter.
				// Otherwise, use titlecase for first letter.
				$map = !empty($matches['upper']) ? $upper : $title;

				return isset($map[$matches[1]]) ? $map[$matches[1]] : $matches[1];
			},
			$string
		);
	}

	// If casefolding, we're done.
	if ($case === 'fold')
		return $string;

	// Handle conditional casing situations...
	$substitutions = array();
	$replacements = array();

	// Greek conditional casing, part 1: Fix lowercase sigma.
	// Note that this rule doesn't depend on $txt['lang_locale'].
	// The parentheses matter: && binds tighter than ||, so without them the
	// "not uppercase" test would only guard the first strpos() check.
	if ($case !== 'upper' && (strpos($string, 'ς') !== false || strpos($string, 'σ') !== false))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');

		$prop_classes = utf8_regex_properties();

		// First, convert all lowercase sigmas to regular form.
		$substitutions['ς'] = 'σ';

		// Then convert any at the end of words to final form.
		$replacements['/\Bσ([' . $prop_classes['Case_Ignorable'] . ']*)(?!\p{L})/u'] = 'ς$1';
	}
	// Greek conditional casing, part 2: No accents on uppercase strings.
	if ($lang === 'el' && $case === 'upper')
	{
		// Composed forms.
		$substitutions += array(
			'Ά' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ὰ' => 'Α', 'Ᾰ' => 'Α',
			'Ᾱ' => 'Α', 'Α' => 'Α', 'Α' => 'Α', 'Ἂ' => 'Α', 'Ἃ' => 'Α',
			'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α', 'Ὰ' => 'Α',
			'Ά' => 'Α', 'Α' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ἂ' => 'Α',
			'Ἃ' => 'Α', 'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α',
			'Έ' => 'Ε', 'Ἐ' => 'Ε', 'Ἑ' => 'Ε', 'Ὲ' => 'Ε', 'Ἒ' => 'Ε',
			'Ἓ' => 'Ε', 'Ἔ' => 'Ε', 'Ἕ' => 'Ε', 'Ή' => 'Η', 'Ἠ' => 'Η',
			'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Η' => 'Η', 'Η' => 'Η', 'Ἢ' => 'Η',
			'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η', 'Ἧ' => 'Η',
			'Ἠ' => 'Η', 'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Ή' => 'Η', 'Η' => 'Η',
			'Ἢ' => 'Η', 'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η',
			'Ἧ' => 'Η', 'Ί' => 'Ι', 'Ἰ' => 'Ι', 'Ἱ' => 'Ι', 'Ὶ' => 'Ι',
			'Ῐ' => 'Ι', 'Ῑ' => 'Ι', 'Ι' => 'Ι', 'Ϊ' => 'Ι', 'Ι' => 'Ι',
			'Ἲ' => 'Ι', 'Ἳ' => 'Ι', 'Ἴ' => 'Ι', 'Ἵ' => 'Ι', 'Ἶ' => 'Ι',
			'Ἷ' => 'Ι', 'Ι' => 'Ι', 'Ι' => 'Ι', 'Ό' => 'Ο', 'Ὀ' => 'Ο',
			'Ὁ' => 'Ο', 'Ὸ' => 'Ο', 'Ὂ' => 'Ο', 'Ὃ' => 'Ο', 'Ὄ' => 'Ο',
			'Ὅ' => 'Ο', 'Ῥ' => 'Ρ', 'Ύ' => 'Υ', 'Υ' => 'Υ', 'Ὑ' => 'Υ',
			'Ὺ' => 'Υ', 'Ῠ' => 'Υ', 'Ῡ' => 'Υ', 'Υ' => 'Υ', 'Ϋ' => 'Υ',
			'Υ' => 'Υ', 'Υ' => 'Υ', 'Ὓ' => 'Υ', 'Υ' => 'Υ', 'Ὕ' => 'Υ',
			'Υ' => 'Υ', 'Ὗ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ',
			'Ώ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ὼ' => 'Ω', 'Ω' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ώ' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω',
		);

		// Individual Greek diacritics.
		$substitutions += array(
			"\xCC\x80" => '', "\xCC\x81" => '', "\xCC\x84" => '',
			"\xCC\x86" => '', "\xCC\x88" => '', "\xCC\x93" => '',
			"\xCC\x94" => '', "\xCD\x82" => '', "\xCD\x83" => '',
			"\xCD\x84" => '', "\xCD\x85" => '', "\xCD\xBA" => '',
			"\xCE\x84" => '', "\xCE\x85" => '',
			"\xE1\xBE\xBD" => '', "\xE1\xBE\xBF" => '', "\xE1\xBF\x80" => '',
			"\xE1\xBF\x81" => '', "\xE1\xBF\x8D" => '', "\xE1\xBF\x8E" => '',
			"\xE1\xBF\x8F" => '', "\xE1\xBF\x9D" => '', "\xE1\xBF\x9E" => '',
			"\xE1\xBF\x9F" => '', "\xE1\xBF\xAD" => '', "\xE1\xBF\xAE" => '',
			"\xE1\xBF\xAF" => '', "\xE1\xBF\xBD" => '', "\xE1\xBF\xBE" => '',
		);
	}

	// Turkish & Azeri conditional casing, part 2.
	if ($case !== 'upper' && $is_turkic)
	{
		// Remove unnecessary "COMBINING DOT ABOVE" after i
		$substitutions['i' . "\xCC\x87"] = 'i';
	}

	// Lithuanian conditional casing.
	if ($lang === 'lt')
	{
		// Force a dot above lowercase i and j with accents by inserting
		// the "COMBINING DOT ABOVE" character.
		// Note: some fonts handle this incorrectly and show two dots,
		// but that's a bug in those fonts and cannot be fixed here.
		if ($case !== 'upper')
			$replacements['/(i\x{328}?|\x{12F}|j)([\x{300}\x{301}\x{303}])/u'] = '$1' . "\xCC\x87" . '$2';

		// Remove "COMBINING DOT ABOVE" after uppercase I and J.
		if ($case !== 'lower')
			$replacements['/(I\x{328}?|\x{12E}|J)\x{307}/u'] = '$1';
	}

	// Dutch has a special titlecase rule.
	if ($lang === 'nl' && $case === 'title')
	{
		$replacements['/\bIj/u'] = 'IJ';
	}

	// Now perform whatever conditional casing fixes we need.
	if (!empty($substitutions))
		$string = strtr($string, $substitutions);

	if (!empty($replacements))
		$string = preg_replace(array_keys($replacements), $replacements, $string);

	return $string;
}
282
283
/**
 * Normalizes a UTF-8 string to Normalization Form D (canonical decomposition).
 *
 * Uses the intl extension when its Unicode version is at least
 * SMF_UNICODE_VERSION; otherwise falls back to SMF's own implementation.
 *
 * @param string $string A UTF-8 string
 * @return string|bool The decomposed version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	// Only trust intl if its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_D))
		{
			return $string;
		}

		if (is_callable('normalizer_normalize'))
		{
			return normalizer_normalize($string, Normalizer::FORM_D);
		}
	}

	// Nothing to do if the string is already in the requested form.
	if (utf8_is_normalized($string, 'd'))
	{
		return $string;
	}

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
	{
		return false;
	}

	$decomposed = utf8_decompose($code_points, false);

	return implode('', $decomposed);
}
312
313
/**
 * Normalizes a UTF-8 string to Normalization Form KD (compatibility
 * decomposition).
 *
 * Uses the intl extension when its Unicode version is at least
 * SMF_UNICODE_VERSION; otherwise falls back to SMF's own implementation.
 *
 * @param string $string A UTF-8 string.
 * @return string|bool The decomposed version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	// Only trust intl if its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KD))
		{
			return $string;
		}

		if (is_callable('normalizer_normalize'))
		{
			return normalizer_normalize($string, Normalizer::FORM_KD);
		}
	}

	// Nothing to do if the string is already in the requested form.
	if (utf8_is_normalized($string, 'kd'))
	{
		return $string;
	}

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
	{
		return false;
	}

	$decomposed = utf8_decompose($code_points, true);

	return implode('', $decomposed);
}
342
343
/**
 * Normalizes a UTF-8 string to Normalization Form C (canonical decomposition
 * followed by canonical composition).
 *
 * Uses the intl extension when its Unicode version is at least
 * SMF_UNICODE_VERSION; otherwise falls back to SMF's own implementation.
 *
 * @param string $string A UTF-8 string
 * @return string|bool The composed version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	// Only trust intl if its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_C))
		{
			return $string;
		}

		if (is_callable('normalizer_normalize'))
		{
			return normalizer_normalize($string, Normalizer::FORM_C);
		}
	}

	// Nothing to do if the string is already in the requested form.
	if (utf8_is_normalized($string, 'c'))
	{
		return $string;
	}

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
	{
		return false;
	}

	// Decompose first, then recompose.
	$decomposed = utf8_decompose($code_points, false);
	$composed = utf8_compose($decomposed);

	return implode('', $composed);
}
372
373
/**
 * Normalizes a UTF-8 string to Normalization Form KC (compatibility
 * decomposition followed by canonical composition).
 *
 * Uses the intl extension when its Unicode version is at least
 * SMF_UNICODE_VERSION; otherwise falls back to SMF's own implementation.
 *
 * @param string $string The string
 * @return string|bool The composed version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	// Only trust intl if its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KC))
		{
			return $string;
		}

		if (is_callable('normalizer_normalize'))
		{
			return normalizer_normalize($string, Normalizer::FORM_KC);
		}
	}

	// Nothing to do if the string is already in the requested form.
	if (utf8_is_normalized($string, 'kc'))
	{
		return $string;
	}

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
	{
		return false;
	}

	// Decompose first, then recompose.
	$decomposed = utf8_decompose($code_points, true);
	$composed = utf8_compose($decomposed);

	return implode('', $composed);
}
402
403
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each character is then either
 * casefolded or (if it is a default ignorable) removed, and finally the
 * result is recomposed.
 *
 * @param string $string The string
 * @return string|bool The casefolded version of $string, or false if the
 *    string could not be split into characters.
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$string = (string) $string;

	if (utf8_is_normalized($string, 'kc_casefold'))
	{
		return $string;
	}

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
	{
		return false;
	}

	$code_points = utf8_decompose($code_points, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$fold_map = utf8_casefold_maps();

	// Flip so that membership can be tested with isset().
	$ignorable = array_flip(utf8_default_ignorables());

	foreach ($code_points as $index => $code_point)
	{
		if (isset($fold_map[$code_point]))
		{
			$code_points[$index] = $fold_map[$code_point];
		}
		elseif (isset($ignorable[$code_point]))
		{
			$code_points[$index] = '';
		}
	}

	return implode('', utf8_compose($code_points));
}
443
444
/**
 * Checks whether a string is already normalized to a given form.
 *
 * Two checks are made: first that the string contains no characters listed
 * in the quick check property for the form, then that consecutive combining
 * marks appear in canonical order.
 *
 * @param string|array $string A string of UTF-8 characters.
 * @param string $form One of 'd', 'c', 'kd', 'kc', or 'kc_casefold'
 * @return bool Whether the string is already normalized to the given form.
 *    Always false for an unrecognized $form.
 */
function utf8_is_normalized($string, $form)
{
	global $sourcedir;

	// Pick the property listing the characters that are disallowed in this form.
	switch ($form)
	{
		case 'd':
			$prop = 'NFD_QC';
			break;

		case 'kd':
			$prop = 'NFKD_QC';
			break;

		case 'c':
			$prop = 'NFC_QC';
			break;

		case 'kc':
			$prop = 'NFKC_QC';
			break;

		case 'kc_casefold':
			$prop = 'Changes_When_NFKC_Casefolded';
			break;

		default:
			return false;
	}

	require_once($sourcedir . '/Unicode/QuickCheck.php');
	$quick_check = utf8_regex_quick_check();

	if (preg_match('/[' . $quick_check[$prop] . ']/u', $string))
	{
		return false;
	}

	// Check whether all combining marks are in canonical order.
	// Note: Because PCRE's Unicode data might be outdated compared to ours,
	// this regex checks for marks and anything PCRE thinks is not a character.
	// That means the more thorough checks will occasionally be performed on
	// strings that don't need them, but building and running a perfect regex
	// would be more expensive in the vast majority of cases, so meh.
	if (!preg_match_all('/([\p{M}\p{Cn}])/u', $string, $matches, PREG_OFFSET_CAPTURE))
	{
		// No candidates to inspect, so the string is normalized.
		return true;
	}

	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$combining_classes = utf8_combining_classes();

	// Byte offset just past the previous match, and that match's combining class.
	$prev_end = 0;
	$prev_ccc = 0;

	foreach ($matches[1] as $match)
	{
		list($mark, $offset) = $match;

		$ccc = isset($combining_classes[$mark]) ? $combining_classes[$mark] : 0;

		// Adjacent to the previous match but with a lower combining class:
		// not in canonical order.
		if ($offset === $prev_end && $ccc > 0 && $prev_ccc > $ccc)
		{
			return false;
		}

		$prev_end = $offset + strlen($mark);
		$prev_ccc = $ccc;
	}

	// If we get here, the string is normalized correctly.
	return true;
}
523
524
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Optionally applies the compatibility decomposition map, then applies the
 * canonical decomposition map (with Hangul syllables decomposed
 * arithmetically), and finally sorts adjacent combining characters by
 * combining class.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, perform compatibility decomposition. Default false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so later writes to $chars cannot alias $char.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	// Elements are replaced in place, so the count is fixed for the whole loop.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters.
		// See "Hangul Syllable Decomposition" in the Unicode standard, ch. 3.12.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;
			$l = (int) (0x1100 + $sindex / (21 * 28));
			$v = (int) (0x1161 + ($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	// Swapping never changes the number of elements, so count once.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
597
598
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Characters that get merged into a preceding character are set to null
 * rather than removed, so the returned array keeps the same length and
 * indexes as the input. Callers are expected to implode() the result.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	// Elements are only overwritten, never added or removed, so count once.
	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		if ($chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && isset($chars[$c + 1]) && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// An optional trailing consonant is folded into the same syllable.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Store the syllable and blank out the parts that were consumed.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
676
677
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * The work is done in stages:
 *  1. Replace everything that is unconditionally disallowed for $level.
 *  2. Swap known emoji sequences for placeholders so later stages ignore them.
 *  3. Swap sanctioned variation sequences for placeholders, then replace any
 *     variation selectors that remain.
 *  4. Swap contextually valid ZWJ/ZWNJ for placeholders, then replace any
 *     join controls that remain.
 *  5. Turn all placeholders back into the original characters.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 *      Values outside this range are clamped to it.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string. If PCRE fails while processing the
 *      input (e.g. because it is not valid UTF-8), an empty string is returned.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Regex alternatives for everything that is to be replaced outright.
	$disallowed = array();

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Joiner.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// no break: level 2 also disallows everything that level 1 does.

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be
			// followed by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Joiner has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	// The preg_replace*() functions return null on failure (e.g. malformed
	// UTF-8 in the subject). Casting to string here and below turns that into
	// an empty string, so we always return a string and never hand null to
	// the string functions that follow.
	$string = (string) preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps each protected sequence to the placeholder standing in for it.
	$placeholders = array();

	// Records a placeholder for the matched text and returns that placeholder.
	// The delimiters are private use code points (U+ECDB and U+ECDC), which
	// were already replaced in the input above, so they should not collide
	// with anything still present in $string.
	$protect = function ($matches) use (&$placeholders)
	{
		$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
		return $placeholders[$matches[0]];
	};

	// Optional suffix of an emoji: a modifier, an emoji presentation selector
	// (possibly with a combining keycap), or a tag sequence.
	$emoji_suffix =
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?';

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = (string) preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		$emoji_suffix .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			$emoji_suffix .
		')*' .
		'/u',
		function ($matches) use ($protect)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			return $protect($matches);
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		// In each pattern, \K drops the base character from the match, so only
		// the variation selector itself is swapped for a placeholder.
		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = (string) preg_replace_callback($patterns, $protect, $string);

		// Remove any unsanctioned variation selectors.
		$string = (string) preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = (string) preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Swaps any protected sequence inside a match for its placeholder.
		// $placeholders does not change inside the loop below, so one closure
		// holding a copy of it serves every iteration.
		$apply_placeholders = function ($matches) use ($placeholders)
		{
			return strtr($matches[0], $placeholders);
		};

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				// Dual joining characters can join on either side.
				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script. Everything before \K
				// is required context; only the join control itself is matched.
				$pattern = $letter . $nonspacing_marks . '[' . $classes['Virama'] . ']' . $nonspacing_combining_marks . '\K' . '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')';
			}

			// Do the thing.
			$string = (string) preg_replace_callback('/' . $pattern . '/u', $apply_placeholders, $string);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
989
990
?>