Passed
Pull Request — release-2.1 (#7392)
by Jon
04:47
created

utf8_convert_case()   F

Complexity

Conditions 38
Paths 17978

Size

Total Lines 210
Code Lines 117

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 38
eloc 117
nc 17978
nop 3
dl 0
loc 210
rs 0
c 1
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2022 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1.0
12
 */
13
14
// Refuse to run when this file is requested directly instead of via SMF.
if (defined('SMF') === false)
{
	die('No direct access...');
}
16
17
/**
 * Lowercases a UTF-8 string.
 *
 * Delegates to utf8_convert_case() in 'lower' mode. Meant as a stand-in for
 * mb_strtolower($string, 'UTF-8') whose output stays the same across PHP
 * versions and tracks the latest version of Unicode.
 *
 * @param string $string The string
 * @return string The lowercase version of $string
 */
function utf8_strtolower($string)
{
	$lowercased = utf8_convert_case($string, 'lower');

	return $lowercased;
}
30
31
/**
 * Uppercases a UTF-8 string.
 *
 * Delegates to utf8_convert_case() in 'upper' mode. Meant as a stand-in for
 * mb_strtoupper($string, 'UTF-8') whose output stays the same across PHP
 * versions and tracks the latest version of Unicode.
 *
 * @param string $string The string
 * @return string The uppercase version of $string
 */
function utf8_strtoupper($string)
{
	$uppercased = utf8_convert_case($string, 'upper');

	return $uppercased;
}
44
45
/**
 * Applies Unicode case folding to a UTF-8 string.
 *
 * Delegates to utf8_convert_case() in 'fold' mode. Meant as a stand-in for
 * mb_convert_case($string, MB_CASE_FOLD, 'UTF-8') whose output stays the same
 * across PHP versions and tracks the latest version of Unicode.
 *
 * @param string $string The string
 * @return string The casefolded version of $string
 */
function utf8_casefold($string)
{
	$folded = utf8_convert_case($string, 'fold');

	return $folded;
}
58
59
/**
 * Converts the case of the given UTF-8 string.
 *
 * The conversion happens in two stages. First the string is mapped using the
 * upper/lower/fold/title tables loaded from $sourcedir/Unicode. Then, unless
 * casefolding was requested, language-specific conditional casing fixes are
 * applied (Greek, Turkish/Azeri, Lithuanian, Dutch). The language is taken
 * from the first two characters of $txt['lang_locale'].
 *
 * A $case value outside the documented list skips the first stage, but the
 * conditional casing fixes are still applied.
 *
 * @param string $string The string.
 * @param string $case One of 'upper', 'lower', 'fold', 'title', 'ucfirst', or 'ucwords'.
 * @param bool $simple If true, use simple maps instead of full maps. Default: false.
 * @return string|bool A version of $string converted to the specified case,
 *    or false if preg_split() could not split $string into characters.
 */
function utf8_convert_case($string, $case, $simple = false)
{
	global $sourcedir, $txt;

	$simple = !empty($simple);

	$lang = empty($txt['lang_locale']) ? '' : substr($txt['lang_locale'], 0, 2);

	// The main case conversion logic
	if (in_array($case, array('upper', 'lower', 'fold')))
	{
		// One array element per character.
		$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		if ($chars === false)
			return false;

		switch ($case)
		{
			case 'upper':
				require_once($sourcedir . '/Unicode/CaseUpper.php');

				$substitutions = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

				// Turkish & Azeri conditional casing, part 1.
				if (in_array($lang, array('tr', 'az')))
					$substitutions['i'] = 'İ';

				break;

			case 'lower':
				require_once($sourcedir . '/Unicode/CaseLower.php');

				$substitutions = $simple ? utf8_strtolower_simple_maps() : utf8_strtolower_maps();

				// Turkish & Azeri conditional casing, part 1.
				if (in_array($lang, array('tr', 'az')))
				{
					$substitutions['İ'] = 'i';
					// NOTE(review): this key is two code points long, so it can
					// never match in the per-character loop below. Kept as-is.
					$substitutions['I' . "\xCC\x87"] = 'i';
					$substitutions['I'] = 'ı';
				}

				break;

			case 'fold':
				require_once($sourcedir . '/Unicode/CaseFold.php');

				$substitutions = $simple ? utf8_casefold_simple_maps() : utf8_casefold_maps();

				break;
		}

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so nothing later can write through it into $chars.
		unset($char);

		$string = implode('', $chars);
	}
	elseif (in_array($case, array('title', 'ucfirst', 'ucwords')))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');
		require_once($sourcedir . '/Unicode/CaseUpper.php');
		require_once($sourcedir . '/Unicode/CaseTitle.php');

		$prop_classes = utf8_regex_properties();

		$upper = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

		// Turkish & Azeri conditional casing, part 1.
		if (in_array($lang, array('tr', 'az')))
			$upper['i'] = 'İ';

		// Titlecase mappings take priority; uppercase mappings are the fallback.
		$title = array_merge($upper, $simple ? utf8_titlecase_simple_maps() : utf8_titlecase_maps());

		// In each pattern, \K drops everything matched so far, so only the
		// captured letter itself is handed to the callback and replaced.
		switch ($case)
		{
			case 'title':
				$string = utf8_convert_case($string, 'lower', $simple);
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})/u';
				break;

			case 'ucwords':
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;

			case 'ucfirst':
				$regex = '/^[^\w' . $prop_classes['Case_Ignorable'] . ']*\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;
		}

		$string = preg_replace_callback(
			$regex,
			function($matches) use ($upper, $title)
			{
				// If second letter is uppercase, use uppercase for first letter.
				// Otherwise, use titlecase for first letter.
				$map = !empty($matches['upper']) ? $upper : $title;

				return isset($map[$matches[1]]) ? $map[$matches[1]] : $matches[1];
			},
			$string
		);
	}

	// If casefolding, we're done.
	if ($case === 'fold')
		return $string;

	// Handle conditional casing situations...
	$substitutions = array();
	$replacements = array();

	// Greek conditional casing, part 1: Fix lowercase sigma.
	// Note that this rule doesn't depend on $txt['lang_locale'].
	// The parentheses matter: && binds tighter than ||, so without them the
	// 'σ' test would run even when $case is 'upper'.
	if ($case !== 'upper' && (strpos($string, 'ς') !== false || strpos($string, 'σ') !== false))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');

		$prop_classes = utf8_regex_properties();

		// First, convert all lowercase sigmas to regular form.
		$substitutions['ς'] = 'σ';

		// Then convert any at the end of words to final form.
		$replacements['/\Bσ([' . $prop_classes['Case_Ignorable'] . ']*)(?!\p{L})/u'] = 'ς$1';
	}
	// Greek conditional casing, part 2: No accents on uppercase strings.
	if ($lang === 'el' && $case === 'upper')
	{
		// Composed forms.
		// NOTE(review): some entries below appear to map a letter to itself or
		// repeat an earlier key, which makes them no-ops with +=. Verify this
		// table against the upstream source before relying on it.
		$substitutions += array(
			'Ά' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ὰ' => 'Α', 'Ᾰ' => 'Α',
			'Ᾱ' => 'Α', 'Α' => 'Α', 'Α' => 'Α', 'Ἂ' => 'Α', 'Ἃ' => 'Α',
			'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α', 'Ὰ' => 'Α',
			'Ά' => 'Α', 'Α' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ἂ' => 'Α',
			'Ἃ' => 'Α', 'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α',
			'Έ' => 'Ε', 'Ἐ' => 'Ε', 'Ἑ' => 'Ε', 'Ὲ' => 'Ε', 'Ἒ' => 'Ε',
			'Ἓ' => 'Ε', 'Ἔ' => 'Ε', 'Ἕ' => 'Ε', 'Ή' => 'Η', 'Ἠ' => 'Η',
			'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Η' => 'Η', 'Η' => 'Η', 'Ἢ' => 'Η',
			'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η', 'Ἧ' => 'Η',
			'Ἠ' => 'Η', 'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Ή' => 'Η', 'Η' => 'Η',
			'Ἢ' => 'Η', 'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η',
			'Ἧ' => 'Η', 'Ί' => 'Ι', 'Ἰ' => 'Ι', 'Ἱ' => 'Ι', 'Ὶ' => 'Ι',
			'Ῐ' => 'Ι', 'Ῑ' => 'Ι', 'Ι' => 'Ι', 'Ϊ' => 'Ι', 'Ι' => 'Ι',
			'Ἲ' => 'Ι', 'Ἳ' => 'Ι', 'Ἴ' => 'Ι', 'Ἵ' => 'Ι', 'Ἶ' => 'Ι',
			'Ἷ' => 'Ι', 'Ι' => 'Ι', 'Ι' => 'Ι', 'Ό' => 'Ο', 'Ὀ' => 'Ο',
			'Ὁ' => 'Ο', 'Ὸ' => 'Ο', 'Ὂ' => 'Ο', 'Ὃ' => 'Ο', 'Ὄ' => 'Ο',
			'Ὅ' => 'Ο', 'Ῥ' => 'Ρ', 'Ύ' => 'Υ', 'Υ' => 'Υ', 'Ὑ' => 'Υ',
			'Ὺ' => 'Υ', 'Ῠ' => 'Υ', 'Ῡ' => 'Υ', 'Υ' => 'Υ', 'Ϋ' => 'Υ',
			'Υ' => 'Υ', 'Υ' => 'Υ', 'Ὓ' => 'Υ', 'Υ' => 'Υ', 'Ὕ' => 'Υ',
			'Υ' => 'Υ', 'Ὗ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ',
			'Ώ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ὼ' => 'Ω', 'Ω' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ώ' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω',
		);

		// Individual Greek diacritics.
		$substitutions += array(
			"\xCC\x80" => '', "\xCC\x81" => '', "\xCC\x84" => '',
			"\xCC\x86" => '', "\xCC\x88" => '', "\xCC\x93" => '',
			"\xCC\x94" => '', "\xCD\x82" => '', "\xCD\x83" => '',
			"\xCD\x84" => '', "\xCD\x85" => '', "\xCD\xBA" => '',
			"\xCE\x84" => '', "\xCE\x85" => '',
			"\xE1\xBE\xBD" => '', "\xE1\xBE\xBF" => '', "\xE1\xBF\x80" => '',
			"\xE1\xBF\x81" => '', "\xE1\xBF\x8D" => '', "\xE1\xBF\x8E" => '',
			"\xE1\xBF\x8F" => '', "\xE1\xBF\x9D" => '', "\xE1\xBF\x9E" => '',
			"\xE1\xBF\x9F" => '', "\xE1\xBF\xAD" => '', "\xE1\xBF\xAE" => '',
			"\xE1\xBF\xAF" => '', "\xE1\xBF\xBD" => '', "\xE1\xBF\xBE" => '',
		);
	}

	// Turkish & Azeri conditional casing, part 2.
	if ($case !== 'upper' && in_array($lang, array('tr', 'az')))
	{
		// Remove unnecessary "COMBINING DOT ABOVE" after i
		$substitutions['i' . "\xCC\x87"] = 'i';
	}

	// Lithuanian conditional casing.
	if ($lang === 'lt')
	{
		// Force a dot above lowercase i and j with accents by inserting
		// the "COMBINING DOT ABOVE" character.
		// Note: some fonts handle this incorrectly and show two dots,
		// but that's a bug in those fonts and cannot be fixed here.
		if ($case !== 'upper')
			$replacements['/(i\x{328}?|\x{12F}|j)([\x{300}\x{301}\x{303}])/u'] = '$1' . "\xCC\x87" . '$2';

		// Remove "COMBINING DOT ABOVE" after uppercase I and J.
		if ($case !== 'lower')
			$replacements['/(I\x{328}?|\x{12E}|J)\x{307}/u'] = '$1';
	}

	// Dutch has a special titlecase rule.
	if ($lang === 'nl' && $case === 'title')
	{
		$replacements['/\bIj/u'] = 'IJ';
	}

	// Now perform whatever conditional casing fixes we need.
	if (!empty($substitutions))
		$string = strtr($string, $substitutions);

	if (!empty($replacements))
		$string = preg_replace(array_keys($replacements), $replacements, $string);

	return $string;
}
278
279
/**
 * Normalizes UTF-8 via Canonical Decomposition.
 *
 * Uses the intl normalizer functions when they are callable, and otherwise
 * falls back to utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string|bool The decomposed version of $string, or false if the
 *    fallback could not split $string into characters. The intl path returns
 *    whatever normalizer_normalize() returns.
 */
function utf8_normalize_d($string)
{
	$text = (string) $string;

	// Nothing to do if intl says the text is already in form D.
	$already_normalized = is_callable('normalizer_is_normalized') && normalizer_is_normalized($text, Normalizer::FORM_D);

	if ($already_normalized)
		return $text;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($text, Normalizer::FORM_D);

	// No intl available: split into characters and decompose them ourselves.
	$characters = preg_split('/(.)/su', $text, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, false));
}
302
303
/**
 * Normalizes UTF-8 via Compatibility Decomposition.
 *
 * Uses the intl normalizer functions when they are callable, and otherwise
 * falls back to utf8_decompose() in compatibility mode.
 *
 * @param string $string A UTF-8 string.
 * @return string|bool The decomposed version of $string, or false if the
 *    fallback could not split $string into characters. The intl path returns
 *    whatever normalizer_normalize() returns.
 */
function utf8_normalize_kd($string)
{
	$text = (string) $string;

	// Nothing to do if intl says the text is already in form KD.
	$already_normalized = is_callable('normalizer_is_normalized') && normalizer_is_normalized($text, Normalizer::FORM_KD);

	if ($already_normalized)
		return $text;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($text, Normalizer::FORM_KD);

	// No intl available: split into characters and decompose them ourselves.
	$characters = preg_split('/(.)/su', $text, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $characters === false ? false : implode('', utf8_decompose($characters, true));
}
326
327
/**
 * Normalizes UTF-8 via Canonical Decomposition then Canonical Composition.
 *
 * Uses the intl normalizer functions when they are callable, and otherwise
 * falls back to utf8_decompose() followed by utf8_compose().
 *
 * @param string $string A UTF-8 string
 * @return string|bool The composed version of $string, or false if the
 *    fallback could not split $string into characters. The intl path returns
 *    whatever normalizer_normalize() returns.
 */
function utf8_normalize_c($string)
{
	$text = (string) $string;

	// Nothing to do if intl says the text is already in form C.
	$already_normalized = is_callable('normalizer_is_normalized') && normalizer_is_normalized($text, Normalizer::FORM_C);

	if ($already_normalized)
		return $text;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($text, Normalizer::FORM_C);

	// No intl available: split, decompose canonically, then recompose.
	$characters = preg_split('/(.)/su', $text, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, false);

	return implode('', utf8_compose($decomposed));
}
350
351
/**
 * Normalizes UTF-8 via Compatibility Decomposition then Canonical Composition.
 *
 * Uses the intl normalizer functions when they are callable, and otherwise
 * falls back to utf8_decompose() in compatibility mode followed by
 * utf8_compose().
 *
 * @param string $string The string
 * @return string|bool The composed version of $string, or false if the
 *    fallback could not split $string into characters. The intl path returns
 *    whatever normalizer_normalize() returns.
 */
function utf8_normalize_kc($string)
{
	$text = (string) $string;

	// Nothing to do if intl says the text is already in form KC.
	$already_normalized = is_callable('normalizer_is_normalized') && normalizer_is_normalized($text, Normalizer::FORM_KC);

	if ($already_normalized)
		return $text;

	if (is_callable('normalizer_normalize'))
		return normalizer_normalize($text, Normalizer::FORM_KC);

	// No intl available: split, decompose for compatibility, then recompose.
	$characters = preg_split('/(.)/su', $text, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($characters === false)
		return false;

	$decomposed = utf8_decompose($characters, true);

	return implode('', utf8_compose($decomposed));
}
374
375
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * Steps: compatibility-decompose the string, casefold each character and drop
 * default ignorable characters, then put the result back into canonical order
 * and compose it.
 *
 * @param string $string The string
 * @return string|bool The casefolded version of $string, or false if $string
 *    could not be split into characters.
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$string = (string) $string;

	$split_flags = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY;

	$chars = preg_split('/(.)/su', $string, 0, $split_flags);

	if ($chars === false)
		return false;

	$chars = utf8_decompose($chars, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$substitutions = utf8_casefold_maps();
	$ignorables = array_flip(utf8_default_ignorables());

	// Index-based writes avoid leaving a dangling reference into $chars.
	foreach ($chars as $i => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$i] = $substitutions[$char];

		elseif (isset($ignorables[$char]))
			$chars[$i] = '';
	}

	// Folding can leave multi-character elements, and removed ignorables leave
	// empty ones. utf8_compose() works one character per element, and an empty
	// element between a base character and its combining mark would stop the
	// two from being composed. So rebuild a clean one-character-per-element
	// array first.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, $split_flags);

	if ($chars === false)
		return false;

	// Removing ignorables may also have brought combining marks together that
	// are no longer in canonical order, so reorder them before composing.
	return implode('', utf8_compose(utf8_decompose($chars, false)));
}
412
413
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form (Hangul syllables are
 * decomposed arithmetically, everything else via lookup tables), then sorts
 * combining characters into canonical order.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, apply the compatibility decomposition
 *    mappings before the canonical ones. Default: false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		foreach ($chars as &$char)
			$char = isset($substitutions[$char]) ? $substitutions[$char] : $char;

		// Break the reference so nothing later can write through it into $chars.
		unset($char);
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	for ($i = 0, $total = count($chars); $i < $total; $i++)
	{
		// Hangul characters.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;

			// The jamo code points must be integers. Plain "/" yields floats,
			// and passing a fractional float where an int is expected is
			// deprecated in PHP 8.1+, so truncate explicitly.
			$l = 0x1100 + (int) ($sindex / (21 * 28));
			$v = 0x1161 + (int) (($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	for ($i = 1, $total = count($chars); $i < $total; $i++)
	{
		// Only adjacent characters that both have a non-zero combining class can swap.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
484
485
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Walks the array once, composing Hangul jamo sequences arithmetically and
 * all other sequences via the composition table. Characters that have been
 * merged into an earlier element are left in place as null.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$compositions = utf8_compose_maps();
	$class_of = utf8_combining_classes();

	// Elements are only overwritten, never added or removed, so the length is fixed.
	$total = count($chars);

	for ($pos = 0; $pos < $total; $pos++)
	{
		// Singleton replacements.
		if (isset($compositions[$chars[$pos]]))
			$chars[$pos] = $compositions[$chars[$pos]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		$is_leading_jamo = $chars[$pos] >= "\xE1\x84\x80" && $chars[$pos] <= "\xE1\x84\x92";
		$has_vowel_jamo = $is_leading_jamo && isset($chars[$pos + 1]) && $chars[$pos + 1] >= "\xE1\x85\xA1" && $chars[$pos + 1] <= "\xE1\x85\xB5";

		if ($has_vowel_jamo)
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$leading_index = mb_ord($chars[$pos]) - 0x1100;
			$vowel_index = mb_ord($chars[$pos + 1]) - 0x1161;

			$syllable = 0xAC00 + $leading_index * 588 + $vowel_index * 28;

			// An optional trailing consonant is folded into the same syllable.
			$has_trailing_jamo = isset($chars[$pos + 2]) && $chars[$pos + 2] >= "\xE1\x86\xA8" && $chars[$pos + 2] <= "\xE1\x87\x82";

			if ($has_trailing_jamo)
				$syllable += mb_ord($chars[$pos + 2]) - 0x11A7;

			$chars[$pos] = mb_chr($syllable);

			// Blank out the jamo that were consumed and skip past them.
			$chars[++$pos] = null;

			if ($has_trailing_jamo)
				$chars[++$pos] = null;

			continue;
		}

		// The first character has nothing before it to combine with.
		if ($pos === 0)
			continue;

		$current_class = isset($class_of[$chars[$pos]]) ? $class_of[$chars[$pos]] : 0;

		// Find the preceding starter character.
		$starter = $pos - 1;

		while ($starter > 0)
		{
			$is_gap = !isset($chars[$starter]);
			$is_lower_mark = !$is_gap && !empty($class_of[$chars[$starter]]) && $class_of[$chars[$starter]] < $current_class;

			if (!$is_gap && !$is_lower_mark)
				break;

			$starter--;
		}

		// Is there a composed form for this combination?
		$pair = $chars[$starter] . $chars[$pos];

		if (isset($compositions[$pair]))
		{
			// Replace the starter character with the composed character.
			$chars[$starter] = $compositions[$pair];

			// Unset the current combining character.
			$chars[$pos] = null;
		}
	}

	return $chars;
}
563
564
/**
565
 * Helper function for sanitize_chars() that deals with invisible characters.
566
 *
567
 * This function deals with control characters, private use characters,
568
 * non-characters, and characters that are invisible by definition in the
569
 * Unicode standard. It does not deal with characters that are supposed to be
570
 * visible according to the Unicode standard, and makes no attempt to compensate
571
 * for possibly incomplete Unicode support in text rendering engines on client
572
 * devices.
573
 *
574
 * @param string $string The string to sanitize.
575
 * @param int $level Controls how invisible formatting characters are handled.
576
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
577
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
578
 *      2: Disallow all formatting characters. Use for internal comparisons
579
 *         only, such as in the word censor, search contexts, etc.
580
 * @param string $substitute Replacement string for the invalid characters.
581
 * @return string The sanitized string.
582
 */
583
function utf8_sanitize_invisibles($string, $level, $substitute)
584
{
585
	global $sourcedir;
586
587
	$string = (string) $string;
588
	$level = min(max((int) $level, 0), 2);
589
	$substitute = (string) $substitute;
590
591
	require_once($sourcedir . '/Unicode/RegularExpressions.php');
592
	$prop_classes = utf8_regex_properties();
593
594
	// We never want non-whitespace control characters
595
	$disallowed[] = '[^\P{Cc}\t\r\n]';
0 ignored issues
show
Comprehensibility Best Practice introduced by
$disallowed was never initialized. Although not strictly required by PHP, it is generally a good practice to add $disallowed = array(); before regardless.
Loading history...
596
597
	// We never want private use characters or non-characters.
598
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
599
	// between our data and whichever version of PCRE happens to be installed
600
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
601
	// of \p{Cn} changes with every new version of Unicode.
602
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';
603
604
	// Several more things we never want:
605
	$disallowed[] = '[' . implode('', array(
606
		// Soft Hyphen.
607
		'\x{AD}',
608
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
609
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
610
		// purpose and should be considered errors in the encoding."
611
		'\x{17B4}-\x{17B5}',
612
		// Invisible math characters.
613
		'\x{2061}-\x{2064}',
614
		// Deprecated formatting characters.
615
		'\x{206A}-\x{206F}',
616
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
617
		'\x{FEFF}',
618
		// Annotation characters and Object Replacement Character.
619
		'\x{FFF9}-\x{FFFC}',
620
	)) . ']';
621
622
	switch ($level)
623
	{
624
		case 2:
625
			$disallowed[] = '[' . implode('', array(
626
				// Combining Grapheme Character.
627
				'\x{34F}',
628
				// Zero Width Non-Joiner.
629
				'\x{200C}',
630
				// Zero Width Joiner.
631
				'\x{200D}',
632
				// All variation selectors.
633
				$prop_classes['Variation_Selector'],
634
				// Tag characters.
635
				'\x{E0000}-\x{E007F}',
636
			)) . ']';
637
638
			// no break
639
640
		case 1:
641
			$disallowed[] = '[' . implode('', array(
642
				// Zero Width Space.
643
				'\x{200B}',
644
				// Word Joiner.
645
				'\x{2060}',
646
				// "Bidi_Control" characters.
647
				// Disallowing means that all characters will behave according
648
				// to their default bidirectional text properties.
649
				$prop_classes['Bidi_Control'],
650
				// Hangul filler characters.
651
				// Used as placeholders in incomplete ideographs.
652
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
653
				// Shorthand formatting characters.
654
				'\x{1BCA0}-\x{1BCA3}',
655
				// Musical formatting characters.
656
				'\x{1D173}-\x{1D17A}',
657
			)) . ']';
658
659
			break;
660
661
		default:
662
			// Zero Width Space only allowed in certain scripts.
663
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';
664
665
			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
666
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';
667
668
			// Hangul Choseong Filler and Hangul Jungseong Filler must be followed
669
			// by more Hangul Jamo characters.
670
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';
671
672
			// Hangul Filler for Hangul compatibility chars.
673
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';
674
675
			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
676
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';
677
678
			// Shorthand formatting characters only with other shorthand chars.
679
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
680
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';
681
682
			// Musical formatting characters only with other musical chars.
683
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
684
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';
685
686
			break;
687
	}
688
689
	if ($level < 2)
690
	{
691
		/*
692
			Combining Grapheme Character has two uses: to override standard
693
			search and collation behaviours, which we never want to allow, and
694
			to ensure correct behaviour of combining marks in a few exceptional
695
			cases, which is legitimate and should be allowed. This means we can
696
			simply test whether it is followed by a combining mark in order to
697
			determine whether to allow it.
698
		*/
699
		$disallowed[] = '\x{34F}(?!\p{M})';
700
701
		// Tag characters not allowed inside words.
702
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
703
	}
704
705
	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);
706
707
	// Are we done yet?
708
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
709
		return $string;
710
711
	// String must be in Normalization Form C for the following checks to work.
712
	$string = utf8_normalize_c($string);
713
714
	$placeholders = array();
715
716
	// Use placeholders to preserve known emoji from further processing.
717
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
718
	$string  = preg_replace_callback(
719
		'/' .
720
		// Flag emojis
721
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
722
		// Or
723
		'|' .
724
		// Emoji characters
725
		'[' . $prop_classes['Emoji'] . ']' .
726
		// Possibly followed by modifiers of various sorts
727
		'(' .
728
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
729
			'|' .
730
			'\x{FE0F}\x{20E3}?' .
731
			'|' .
732
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
733
		')?' .
734
		// Possibly concatenated with Zero Width Joiner and more emojis
735
		// (e.g. the "family" emoji sequences)
736
		'(' .
737
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
738
			'(' .
739
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
740
				'|' .
741
				'\x{FE0F}\x{20E3}?' .
742
				'|' .
743
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
744
			')?' .
745
		')*' .
746
		'/u',
747
		function ($matches) use (&$placeholders)
748
		{
749
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
750
			// This can happen because the digits 0-9 and the '*' and '#' characters are
751
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
752
			if (strlen($matches[0]) === 1)
753
				return $matches[0];
754
755
			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
756
			return $placeholders[$matches[0]];
757
		},
758
		$string
759
	);
760
761
	// Get rid of any unsanctioned variation selectors.
762
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
763
	{
764
		/*
765
			Unicode gives pre-defined lists of sanctioned variation sequences
766
			and says any use of variation selectors outside those sequences is
767
			unsanctioned.
768
		*/
769
770
		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');
771
772
		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
773
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';
774
775
		// Use placeholders for sanctioned variation selectors.
776
		$string = preg_replace_callback(
777
			$patterns,
778
			function ($matches) use (&$placeholders)
779
			{
780
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
781
				return $placeholders[$matches[0]];
782
			},
783
			$string
784
		);
785
786
		// Remove any unsanctioned variation selectors.
787
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
788
	}
789
790
	// Join controls are only allowed inside words in special circumstances.
791
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
792
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
793
	{
794
		// Zero Width Non-Joiner (U+200C)
795
		$zwnj = "\xE2\x80\x8C";
796
		// Zero Width Joiner (U+200D)
797
		$zwj = "\xE2\x80\x8D";
798
799
		$placeholders[$zwnj] = "\xEE\x80\x8C";
800
		$placeholders[$zwj] = "\xEE\x80\x8D";
801
802
		// When not in strict mode, allow ZWJ at word boundaries.
803
		if ($level === 0)
804
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);
805
806
		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
807
		$joining_type_classes = utf8_regex_joining_type();
808
		$indic_classes = utf8_regex_indic();
809
810
		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
811
		{
812
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
813
			// For these scripts, use test A1 for allowing ZWNJ.
814
			// https://unicode.org/reports/tr31/#A1
815
			if (isset($joining_type_classes[$script]))
816
			{
817
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
818
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
819
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';
820
821
				if (!empty($classes['Dual_Joining']))
822
				{
823
					$lj .= $classes['Dual_Joining'];
824
					$rj .= $classes['Dual_Joining'];
825
				}
826
827
				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
828
			}
829
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
830
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
831
			// https://unicode.org/reports/tr31/#A2
832
			// https://unicode.org/reports/tr31/#B
833
			else
834
			{
835
				// A letter that is part of this particular script.
836
				$letter = '[' . $classes['Letter'] . ']';
837
838
				// Zero or more non-spacing marks used in this script.
839
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';
840
841
				// Zero or more non-spacing combining marks used in this script.
842
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';
843
844
				// ZWNJ must be followed by another letter in the same script.
845
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';
846
847
				// ZWJ must NOT be followed by a vowel dependent character in this
848
				// script or by any character from a different script.
849
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';
850
851
				// Now build the pattern for this script.
852
				$pattern = $letter . $nonspacing_marks . '[' . $classes['viramas'] . ']' . $nonspacing_combining_marks . '\K' . (!empty($zwj_pattern) ? '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')' : $zwnj_pattern);
853
			}
854
855
			// Do the thing.
856
			$string = preg_replace_callback(
857
				'/' . $pattern . '/u',
858
				function ($matches) use ($placeholders)
859
				{
860
					return strtr($matches[0], $placeholders);
861
				},
862
				$string
863
			);
864
865
			// Did we catch 'em all?
866
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
867
				break;
868
		}
869
870
		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
871
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
872
	}
873
874
	// Revert placeholders back to original characters.
875
	$string = strtr($string, array_flip($placeholders));
876
877
878
	return $string;
879
}
880
881
?>