Passed
Push — release-2.1 ( 7401ba...5d05c6 )
by Mathias
08:16 queued 14s
created

utf8_is_normalized()   C

Complexity

Conditions 13
Paths 1

Size

Total Lines 71
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 13
eloc 40
c 0
b 0
f 0
nc 1
nop 2
dl 0
loc 71
rs 6.6166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2023 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1.4
12
 */
13
14
if (!defined('SMF'))
15
	die('No direct access...');
16
17
require_once($sourcedir . '/Unicode/Metadata.php');
18
19
/**
 * Lowercases a UTF-8 string.
 *
 * Works like mb_strtolower($string, 'UTF-8'), but the result stays the same
 * across PHP versions and follows the latest version of Unicode, because the
 * conversion is delegated to utf8_convert_case().
 *
 * @param string $string The string
 * @return string The lowercase version of $string
 */
function utf8_strtolower($string)
{
	$lowercased = utf8_convert_case($string, 'lower');

	return $lowercased;
}
32
33
/**
 * Uppercases a UTF-8 string.
 *
 * Works like mb_strtoupper($string, 'UTF-8'), but the result stays the same
 * across PHP versions and follows the latest version of Unicode, because the
 * conversion is delegated to utf8_convert_case().
 *
 * @param string $string The string
 * @return string The uppercase version of $string
 */
function utf8_strtoupper($string)
{
	$uppercased = utf8_convert_case($string, 'upper');

	return $uppercased;
}
46
47
/**
 * Applies Unicode case folding to a UTF-8 string.
 *
 * Works like mb_convert_case($string, MB_CASE_FOLD, 'UTF-8'), but the result
 * stays the same across PHP versions and follows the latest version of
 * Unicode, because the conversion is delegated to utf8_convert_case().
 *
 * @param string $string The string
 * @return string The casefolded version of $string
 */
function utf8_casefold($string)
{
	$folded = utf8_convert_case($string, 'fold');

	return $folded;
}
60
61
/**
 * Converts the case of the given UTF-8 string.
 *
 * The conversion happens in two stages:
 *  1. A map-based conversion ('upper', 'lower', 'fold') or a regex-driven
 *     conversion of word-initial letters ('title', 'ucfirst', 'ucwords').
 *  2. Conditional casing fixes (Greek, Turkish/Azeri, Lithuanian, Dutch). All
 *     but the Greek final sigma rule depend on the first two letters of
 *     $txt['lang_locale'].
 *
 * Any other value of $case skips stage 1 and only applies stage 2.
 *
 * @param string $string The string.
 * @param string $case One of 'upper', 'lower', 'fold', 'title', 'ucfirst', or 'ucwords'.
 * @param bool $simple If true, use simple maps instead of full maps. Default: false.
 * @return string|false A version of $string converted to the specified case,
 *    or false if $string could not be split into characters for an 'upper',
 *    'lower' or 'fold' conversion.
 */
function utf8_convert_case($string, $case, $simple = false)
{
	global $sourcedir, $txt;

	$simple = !empty($simple);

	$lang = empty($txt['lang_locale']) ? '' : substr($txt['lang_locale'], 0, 2);

	// Turkish and Azeri share the same dotted/dotless i rules.
	$is_turkic = in_array($lang, array('tr', 'az'), true);

	// The main case conversion logic
	if (in_array($case, array('upper', 'lower', 'fold'), true))
	{
		// Turkish & Azeri conditional casing, part 1a.
		// "I" followed by COMBINING DOT ABOVE lowercases to a plain "i". This
		// is a two character sequence, so it must be dealt with before the
		// string is split into single characters; the per-character lookup
		// below could never match it.
		if ($case === 'lower' && $is_turkic)
			$string = str_replace('I' . "\xCC\x87", 'i', $string);

		$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

		if ($chars === false)
			return false;

		switch ($case)
		{
			case 'upper':
				require_once($sourcedir . '/Unicode/CaseUpper.php');

				$substitutions = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

				// Turkish & Azeri conditional casing, part 1.
				if ($is_turkic)
					$substitutions['i'] = 'İ';

				break;

			case 'lower':
				require_once($sourcedir . '/Unicode/CaseLower.php');

				$substitutions = $simple ? utf8_strtolower_simple_maps() : utf8_strtolower_maps();

				// Turkish & Azeri conditional casing, part 1.
				if ($is_turkic)
				{
					$substitutions['İ'] = 'i';
					$substitutions['I'] = 'ı';
				}

				break;

			case 'fold':
				require_once($sourcedir . '/Unicode/CaseFold.php');

				$substitutions = $simple ? utf8_casefold_simple_maps() : utf8_casefold_maps();

				break;
		}

		// Assign by key rather than by reference so that no reference to the
		// last element is left behind after the loop.
		foreach ($chars as $key => $char)
		{
			if (isset($substitutions[$char]))
				$chars[$key] = $substitutions[$char];
		}

		$string = implode('', $chars);
	}
	elseif (in_array($case, array('title', 'ucfirst', 'ucwords'), true))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');
		require_once($sourcedir . '/Unicode/CaseUpper.php');
		require_once($sourcedir . '/Unicode/CaseTitle.php');

		$prop_classes = utf8_regex_properties();

		$upper = $simple ? utf8_strtoupper_simple_maps() : utf8_strtoupper_maps();

		// Turkish & Azeri conditional casing, part 1.
		if ($is_turkic)
			$upper['i'] = 'İ';

		// Titlecase mappings take precedence over uppercase ones.
		$title = array_merge($upper, $simple ? utf8_titlecase_simple_maps() : utf8_titlecase_maps());

		switch ($case)
		{
			case 'title':
				// Lowercase everything first, then raise the first letter of each word.
				$string = utf8_convert_case($string, 'lower', $simple);
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})/u';
				break;

			case 'ucwords':
				// First letter of each word; the lookahead records whether the
				// next cased letter is already uppercase.
				$regex = '/(?:^|[^\w' . $prop_classes['Case_Ignorable'] . '])\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;

			case 'ucfirst':
				// Same as 'ucwords', but only for the first letter of the string.
				$regex = '/^[^\w' . $prop_classes['Case_Ignorable'] . ']*\K(\p{L})(?=[' . $prop_classes['Case_Ignorable'] . ']*(?:(?<upper>\p{Lu})|\w?))/u';
				break;
		}

		$string = preg_replace_callback(
			$regex,
			function($matches) use ($upper, $title)
			{
				// If second letter is uppercase, use uppercase for first letter.
				// Otherwise, use titlecase for first letter.
				$map = !empty($matches['upper']) ? $upper : $title;

				return isset($map[$matches[1]]) ? $map[$matches[1]] : $matches[1];
			},
			$string
		);
	}

	// If casefolding, we're done.
	if ($case === 'fold')
		return $string;

	// Handle conditional casing situations...
	// $substitutions are literal replacements applied with strtr();
	// $replacements are regex => replacement pairs applied with preg_replace().
	$substitutions = array();
	$replacements = array();

	// Greek conditional casing, part 1: Fix lowercase sigma.
	// Note that this rule doesn't depend on $txt['lang_locale'].
	// The parentheses matter: without them "&&" binds tighter than "||" and
	// the 'upper' exclusion would only guard the first strpos() test.
	if ($case !== 'upper' && (strpos($string, 'ς') !== false || strpos($string, 'σ') !== false))
	{
		require_once($sourcedir . '/Unicode/RegularExpressions.php');

		$prop_classes = utf8_regex_properties();

		// First, convert all lowercase sigmas to regular form.
		$substitutions['ς'] = 'σ';

		// Then convert any at the end of words to final form.
		$replacements['/\Bσ([' . $prop_classes['Case_Ignorable'] . ']*)(?!\p{L})/u'] = 'ς$1';
	}
	// Greek conditional casing, part 2: No accents on uppercase strings.
	if ($lang === 'el' && $case === 'upper')
	{
		// Composed forms.
		$substitutions += array(
			'Ά' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ὰ' => 'Α', 'Ᾰ' => 'Α',
			'Ᾱ' => 'Α', 'Α' => 'Α', 'Α' => 'Α', 'Ἂ' => 'Α', 'Ἃ' => 'Α',
			'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α', 'Ὰ' => 'Α',
			'Ά' => 'Α', 'Α' => 'Α', 'Ἀ' => 'Α', 'Ἁ' => 'Α', 'Ἂ' => 'Α',
			'Ἃ' => 'Α', 'Ἄ' => 'Α', 'Ἅ' => 'Α', 'Ἆ' => 'Α', 'Ἇ' => 'Α',
			'Έ' => 'Ε', 'Ἐ' => 'Ε', 'Ἑ' => 'Ε', 'Ὲ' => 'Ε', 'Ἒ' => 'Ε',
			'Ἓ' => 'Ε', 'Ἔ' => 'Ε', 'Ἕ' => 'Ε', 'Ή' => 'Η', 'Ἠ' => 'Η',
			'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Η' => 'Η', 'Η' => 'Η', 'Ἢ' => 'Η',
			'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η', 'Ἧ' => 'Η',
			'Ἠ' => 'Η', 'Ἡ' => 'Η', 'Ὴ' => 'Η', 'Ή' => 'Η', 'Η' => 'Η',
			'Ἢ' => 'Η', 'Ἣ' => 'Η', 'Ἤ' => 'Η', 'Ἥ' => 'Η', 'Ἦ' => 'Η',
			'Ἧ' => 'Η', 'Ί' => 'Ι', 'Ἰ' => 'Ι', 'Ἱ' => 'Ι', 'Ὶ' => 'Ι',
			'Ῐ' => 'Ι', 'Ῑ' => 'Ι', 'Ι' => 'Ι', 'Ϊ' => 'Ι', 'Ι' => 'Ι',
			'Ἲ' => 'Ι', 'Ἳ' => 'Ι', 'Ἴ' => 'Ι', 'Ἵ' => 'Ι', 'Ἶ' => 'Ι',
			'Ἷ' => 'Ι', 'Ι' => 'Ι', 'Ι' => 'Ι', 'Ό' => 'Ο', 'Ὀ' => 'Ο',
			'Ὁ' => 'Ο', 'Ὸ' => 'Ο', 'Ὂ' => 'Ο', 'Ὃ' => 'Ο', 'Ὄ' => 'Ο',
			'Ὅ' => 'Ο', 'Ῥ' => 'Ρ', 'Ύ' => 'Υ', 'Υ' => 'Υ', 'Ὑ' => 'Υ',
			'Ὺ' => 'Υ', 'Ῠ' => 'Υ', 'Ῡ' => 'Υ', 'Υ' => 'Υ', 'Ϋ' => 'Υ',
			'Υ' => 'Υ', 'Υ' => 'Υ', 'Ὓ' => 'Υ', 'Υ' => 'Υ', 'Ὕ' => 'Υ',
			'Υ' => 'Υ', 'Ὗ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ', 'Υ' => 'Υ',
			'Ώ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ὼ' => 'Ω', 'Ω' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω', 'Ὠ' => 'Ω', 'Ὡ' => 'Ω', 'Ώ' => 'Ω',
			'Ω' => 'Ω', 'Ὢ' => 'Ω', 'Ὣ' => 'Ω', 'Ὤ' => 'Ω', 'Ὥ' => 'Ω',
			'Ὦ' => 'Ω', 'Ὧ' => 'Ω',
		);

		// Individual Greek diacritics.
		$substitutions += array(
			"\xCC\x80" => '', "\xCC\x81" => '', "\xCC\x84" => '',
			"\xCC\x86" => '', "\xCC\x88" => '', "\xCC\x93" => '',
			"\xCC\x94" => '', "\xCD\x82" => '', "\xCD\x83" => '',
			"\xCD\x84" => '', "\xCD\x85" => '', "\xCD\xBA" => '',
			"\xCE\x84" => '', "\xCE\x85" => '',
			"\xE1\xBE\xBD" => '', "\xE1\xBE\xBF" => '', "\xE1\xBF\x80" => '',
			"\xE1\xBF\x81" => '', "\xE1\xBF\x8D" => '', "\xE1\xBF\x8E" => '',
			"\xE1\xBF\x8F" => '', "\xE1\xBF\x9D" => '', "\xE1\xBF\x9E" => '',
			"\xE1\xBF\x9F" => '', "\xE1\xBF\xAD" => '', "\xE1\xBF\xAE" => '',
			"\xE1\xBF\xAF" => '', "\xE1\xBF\xBD" => '', "\xE1\xBF\xBE" => '',
		);
	}

	// Turkish & Azeri conditional casing, part 2.
	if ($case !== 'upper' && $is_turkic)
	{
		// Remove unnecessary "COMBINING DOT ABOVE" after i
		$substitutions['i' . "\xCC\x87"] = 'i';
	}

	// Lithuanian conditional casing.
	if ($lang === 'lt')
	{
		// Force a dot above lowercase i and j with accents by inserting
		// the "COMBINING DOT ABOVE" character.
		// Note: some fonts handle this incorrectly and show two dots,
		// but that's a bug in those fonts and cannot be fixed here.
		if ($case !== 'upper')
			$replacements['/(i\x{328}?|\x{12F}|j)([\x{300}\x{301}\x{303}])/u'] = '$1' . "\xCC\x87" . '$2';

		// Remove "COMBINING DOT ABOVE" after uppercase I and J.
		if ($case !== 'lower')
			$replacements['/(I\x{328}?|\x{12E}|J)\x{307}/u'] = '$1';
	}

	// Dutch has a special titlecase rule.
	if ($lang === 'nl' && $case === 'title')
	{
		$replacements['/\bIj/u'] = 'IJ';
	}

	// Now perform whatever conditional casing fixes we need.
	if (!empty($substitutions))
		$string = strtr($string, $substitutions);

	if (!empty($replacements))
		$string = preg_replace(array_keys($replacements), $replacements, $string);

	return $string;
}
280
281
/**
 * Puts a UTF-8 string into Normalization Form D (canonical decomposition).
 *
 * The intl extension's normalizer is used when it is callable and IntlChar
 * reports a Unicode version at least as recent as SMF_UNICODE_VERSION.
 * Otherwise the string is decomposed with utf8_decompose().
 *
 * @param string $string A UTF-8 string
 * @return string The decomposed version of $string (false if the string
 *    could not be split into characters)
 */
function utf8_normalize_d($string)
{
	$string = (string) $string;

	// Only trust intl when its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_D))
			return $string;

		if (is_callable('normalizer_normalize'))
			return normalizer_normalize($string, Normalizer::FORM_D);
	}

	// Nothing to do if the string is already in this form.
	if (utf8_is_normalized($string, 'd'))
		return $string;

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $code_points === false ? false : implode('', utf8_decompose($code_points, false));
}
310
311
/**
 * Puts a UTF-8 string into Normalization Form KD (compatibility decomposition).
 *
 * The intl extension's normalizer is used when it is callable and IntlChar
 * reports a Unicode version at least as recent as SMF_UNICODE_VERSION.
 * Otherwise the string is decomposed with utf8_decompose().
 *
 * @param string $string A UTF-8 string.
 * @return string The decomposed version of $string (false if the string
 *    could not be split into characters).
 */
function utf8_normalize_kd($string)
{
	$string = (string) $string;

	// Only trust intl when its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KD))
			return $string;

		if (is_callable('normalizer_normalize'))
			return normalizer_normalize($string, Normalizer::FORM_KD);
	}

	// Nothing to do if the string is already in this form.
	if (utf8_is_normalized($string, 'kd'))
		return $string;

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	return $code_points === false ? false : implode('', utf8_decompose($code_points, true));
}
340
341
/**
 * Puts a UTF-8 string into Normalization Form C (canonical decomposition
 * followed by canonical composition).
 *
 * The intl extension's normalizer is used when it is callable and IntlChar
 * reports a Unicode version at least as recent as SMF_UNICODE_VERSION.
 * Otherwise utf8_decompose() and utf8_compose() do the work.
 *
 * @param string $string A UTF-8 string
 * @return string The composed version of $string (false if the string could
 *    not be split into characters)
 */
function utf8_normalize_c($string)
{
	$string = (string) $string;

	// Only trust intl when its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_C))
			return $string;

		if (is_callable('normalizer_normalize'))
			return normalizer_normalize($string, Normalizer::FORM_C);
	}

	// Nothing to do if the string is already in this form.
	if (utf8_is_normalized($string, 'c'))
		return $string;

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
		return false;

	$decomposed = utf8_decompose($code_points, false);

	return implode('', utf8_compose($decomposed));
}
370
371
/**
 * Puts a UTF-8 string into Normalization Form KC (compatibility decomposition
 * followed by canonical composition).
 *
 * The intl extension's normalizer is used when it is callable and IntlChar
 * reports a Unicode version at least as recent as SMF_UNICODE_VERSION.
 * Otherwise utf8_decompose() and utf8_compose() do the work.
 *
 * @param string $string The string
 * @return string The composed version of $string (false if the string could
 *    not be split into characters)
 */
function utf8_normalize_kc($string)
{
	$string = (string) $string;

	// Only trust intl when its Unicode data is not older than ours.
	$intl_is_current = is_callable('IntlChar::getUnicodeVersion') && version_compare(implode('.', IntlChar::getUnicodeVersion()), SMF_UNICODE_VERSION, '>=');

	if ($intl_is_current)
	{
		if (is_callable('normalizer_is_normalized') && normalizer_is_normalized($string, Normalizer::FORM_KC))
			return $string;

		if (is_callable('normalizer_normalize'))
			return normalizer_normalize($string, Normalizer::FORM_KC);
	}

	// Nothing to do if the string is already in this form.
	if (utf8_is_normalized($string, 'kc'))
		return $string;

	$code_points = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($code_points === false)
		return false;

	$decomposed = utf8_decompose($code_points, true);

	return implode('', utf8_compose($decomposed));
}
400
401
/**
 * Casefolds UTF-8 via Compatibility Composition Casefolding.
 * Used by idn_to_ascii polyfill in Subs-Compat.php
 *
 * The string is compatibility-decomposed, each resulting character is then
 * either replaced by its casefold mapping or, if it is a default ignorable,
 * removed, and finally the characters are recomposed.
 *
 * @param string $string The string
 * @return string|false The casefolded version of $string, or false if
 *    $string could not be split into characters.
 */
function utf8_normalize_kc_casefold($string)
{
	global $sourcedir;

	$string = (string) $string;

	if (utf8_is_normalized($string, 'kc_casefold'))
		return $string;

	$chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	if ($chars === false)
		return false;

	$chars = utf8_decompose($chars, true);

	require_once($sourcedir . '/Unicode/CaseFold.php');
	require_once($sourcedir . '/Unicode/DefaultIgnorables.php');

	$substitutions = utf8_casefold_maps();

	// Flip so that membership can be tested with isset().
	$ignorables = array_flip(utf8_default_ignorables());

	// Assign by key rather than by reference: a by-reference loop would leave
	// the last element of $chars bound to a live reference, and that reference
	// would travel with the array into utf8_compose().
	foreach ($chars as $key => $char)
	{
		if (isset($substitutions[$char]))
			$chars[$key] = $substitutions[$char];

		elseif (isset($ignorables[$char]))
			$chars[$key] = '';
	}

	// NOTE(review): removed ignorables stay in the array as empty strings, and
	// utf8_compose() appears to stop its search for a starter at such an entry.
	// Confirm whether a mark following a removed ignorable should still compose.
	return implode('', utf8_compose($chars));
}
441
442
/**
 * Checks whether a string is already normalized to a given form.
 *
 * Two checks are made: first, that the string contains no character listed
 * in the quick check data for the requested form; second, that adjacent
 * combining marks are in canonical order.
 *
 * @param string|array $string A string of UTF-8 characters. An array is
 *    joined into a single string before it is checked.
 * @param string $form One of 'd', 'c', 'kd', 'kc', or 'kc_casefold'
 * @return bool Whether the string is already normalized to the given form.
 *    Always false for an unrecognized $form.
 */
function utf8_is_normalized($string, $form)
{
	global $sourcedir;

	// The regex functions below only accept strings, so honour the documented
	// array input by joining it first.
	if (is_array($string))
		$string = implode('', $string);

	// Check whether string contains characters that are disallowed in this form.
	switch ($form)
	{
		case 'd':
			$prop = 'NFD_QC';
			break;

		case 'kd':
			$prop = 'NFKD_QC';
			break;

		case 'c':
			$prop = 'NFC_QC';
			break;

		case 'kc':
			$prop = 'NFKC_QC';
			break;

		case 'kc_casefold':
			$prop = 'Changes_When_NFKC_Casefolded';
			break;

		default:
			return false;
	}

	require_once($sourcedir . '/Unicode/QuickCheck.php');
	$qc = utf8_regex_quick_check();

	if (preg_match('/[' . $qc[$prop] . ']/u', $string))
		return false;

	// Check whether all combining marks are in canonical order.
	// Note: Because PCRE's Unicode data might be outdated compared to ours,
	// this regex checks for marks and anything PCRE thinks is not a character.
	// That means the more thorough checks will occasionally be performed on
	// strings that don't need them, but building and running a perfect regex
	// would be more expensive in the vast majority of cases, so meh.
	if (preg_match_all('/([\p{M}\p{Cn}])/u', $string, $matches, PREG_OFFSET_CAPTURE))
	{
		require_once($sourcedir . '/Unicode/CombiningClasses.php');

		$combining_classes = utf8_combining_classes();

		// Byte offset, byte length and combining class of the previous match.
		$last_pos = 0;
		$last_len = 0;
		$last_ccc = 0;
		foreach ($matches[1] as $match)
		{
			$char = $match[0];
			$pos = $match[1];
			$ccc = isset($combining_classes[$char]) ? $combining_classes[$char] : 0;

			// Not in canonical order, so return false.
			// Only matters when this match directly follows the previous one.
			if ($pos === $last_pos + $last_len && $ccc > 0 && $last_ccc > $ccc)
				return false;

			$last_pos = $pos;
			$last_len = strlen($char);
			$last_ccc = $ccc;
		}
	}

	// If we get here, the string is normalized correctly.
	return true;
}
521
522
/**
 * Helper function for utf8_normalize_d and utf8_normalize_kd.
 *
 * Replaces each character with its decomposed form (compatibility mappings
 * first, if requested, then canonical mappings and algorithmic Hangul
 * decomposition), then sorts adjacent combining marks into canonical order.
 *
 * @param array $chars Array of Unicode characters
 * @param bool $compatibility If true, perform compatibility decomposition. Default false.
 * @return array Array of decomposed Unicode characters.
 */
function utf8_decompose($chars, $compatibility = false)
{
	global $sourcedir;

	if (!empty($compatibility))
	{
		require_once($sourcedir . '/Unicode/DecompositionCompatibility.php');

		$substitutions = utf8_normalize_kd_maps();

		// Assign by key rather than by reference so that no reference to the
		// last element outlives the loop.
		foreach ($chars as $key => $char)
		{
			if (isset($substitutions[$char]))
				$chars[$key] = $substitutions[$char];
		}
	}

	require_once($sourcedir . '/Unicode/DecompositionCanonical.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_normalize_d_maps();
	$combining_classes = utf8_combining_classes();

	// Replace characters with decomposed forms.
	// Elements are replaced in place, so the length can be computed once.
	for ($i = 0, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Hangul characters.
		// See "Hangul Syllable Decomposition" in the Unicode standard, ch. 3.12.
		// The bounds are the UTF-8 encodings of U+AC00 and U+D7A3.
		if ($chars[$i] >= "\xEA\xB0\x80" && $chars[$i] <= "\xED\x9E\xA3")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$s = mb_ord($chars[$i]);
			$sindex = $s - 0xAC00;
			$l = (int) (0x1100 + $sindex / (21 * 28));
			$v = (int) (0x1161 + ($sindex % (21 * 28)) / 28);
			$t = $sindex % 28;

			// A trailing consonant is only present when $t is non-zero.
			$chars[$i] = implode('', array(mb_chr($l), mb_chr($v), $t ? mb_chr(0x11A7 + $t) : ''));
		}
		// Everything else.
		elseif (isset($substitutions[$chars[$i]]))
			$chars[$i] = $substitutions[$chars[$i]];
	}

	// Must re-split the string before sorting.
	$chars = preg_split('/(.)/su', implode('', $chars), 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

	// Sort characters into canonical order.
	// Swapping neighbours never changes the length, so compute it once.
	for ($i = 1, $num_chars = count($chars); $i < $num_chars; $i++)
	{
		// Only two adjacent characters that both have a non-zero combining
		// class can be reordered.
		if (empty($combining_classes[$chars[$i]]) || empty($combining_classes[$chars[$i - 1]]))
			continue;

		if ($combining_classes[$chars[$i - 1]] > $combining_classes[$chars[$i]])
		{
			$temp = $chars[$i];
			$chars[$i] = $chars[$i - 1];
			$chars[$i - 1] = $temp;

			// Backtrack and check again.
			if ($i > 1)
				$i -= 2;
		}
	}

	return $chars;
}
595
596
/**
 * Helper function for utf8_normalize_c and utf8_normalize_kc.
 *
 * Walks the array once, combining Hangul jamo algorithmically and combining
 * other characters with a preceding character where the composition map has
 * an entry for the pair. Characters that were merged into an earlier one are
 * left in the array as null.
 *
 * @param array $chars Array of decomposed Unicode characters
 * @return array Array of composed Unicode characters.
 */
function utf8_compose($chars)
{
	global $sourcedir;

	require_once($sourcedir . '/Unicode/Composition.php');
	require_once($sourcedir . '/Unicode/CombiningClasses.php');

	$substitutions = utf8_compose_maps();
	$combining_classes = utf8_combining_classes();

	// Elements are only ever overwritten (with a composed character or null),
	// never added or removed, so the length can be computed once.
	for ($c = 0, $num_chars = count($chars); $c < $num_chars; $c++)
	{
		// Singleton replacements.
		if (isset($substitutions[$chars[$c]]))
			$chars[$c] = $substitutions[$chars[$c]];

		// Hangul characters.
		// See "Hangul Syllable Composition" in the Unicode standard, ch. 3.12.
		// Leading consonant U+1100..U+1112 followed by vowel U+1161..U+1175.
		if ($chars[$c] >= "\xE1\x84\x80" && $chars[$c] <= "\xE1\x84\x92" && isset($chars[$c + 1]) && $chars[$c + 1] >= "\xE1\x85\xA1" && $chars[$c + 1] <= "\xE1\x85\xB5")
		{
			if (!function_exists('mb_ord'))
				require_once($sourcedir . '/Subs-Compat.php');

			$l_part = $chars[$c];
			$v_part = $chars[$c + 1];
			$t_part = null;

			$l_index = mb_ord($l_part) - 0x1100;
			$v_index = mb_ord($v_part) - 0x1161;

			$lv_index = $l_index * 588 + $v_index * 28;
			$s = 0xAC00 + $lv_index;

			// Optional trailing consonant U+11A8..U+11C2.
			if (isset($chars[$c + 2]) && $chars[$c + 2] >= "\xE1\x86\xA8" && $chars[$c + 2] <= "\xE1\x87\x82")
			{
				$t_part = $chars[$c + 2];
				$t_index = mb_ord($t_part) - 0x11A7;
				$s += $t_index;
			}

			// Store the syllable and blank out the jamo it absorbed.
			$chars[$c] = mb_chr($s);
			$chars[++$c] = null;

			if (isset($t_part))
				$chars[++$c] = null;

			continue;
		}

		if ($c > 0)
		{
			$ccc = isset($combining_classes[$chars[$c]]) ? $combining_classes[$chars[$c]] : 0;

			// Find the preceding starter character.
			// Skip entries that were already merged away (null) and marks
			// whose combining class is lower than the current character's.
			$l = $c - 1;
			while ($l > 0 && (!isset($chars[$l]) || (!empty($combining_classes[$chars[$l]]) && $combining_classes[$chars[$l]] < $ccc)))
				$l--;

			// Is there a composed form for this combination?
			if (isset($substitutions[$chars[$l] . $chars[$c]]))
			{
				// Replace the starter character with the composed character.
				$chars[$l] = $substitutions[$chars[$l] . $chars[$c]];

				// Unset the current combining character.
				$chars[$c] = null;
			}
		}
	}

	return $chars;
}
674
675
/**
 * Helper function for sanitize_chars() that deals with invisible characters.
 *
 * This function deals with control characters, private use characters,
 * non-characters, and characters that are invisible by definition in the
 * Unicode standard. It does not deal with characters that are supposed to be
 * visible according to the Unicode standard, and makes no attempt to compensate
 * for possibly incomplete Unicode support in text rendering engines on client
 * devices.
 *
 * Processing happens in stages:
 *  1. A single regular expression replaces every character that is disallowed
 *     outright for the requested level.
 *  2. If no join controls, regional indicators, emoji, or variation selectors
 *     remain, the string is returned immediately.
 *  3. Otherwise the string is normalized to NFC, known emoji sequences are
 *     swapped for placeholders, unsanctioned variation selectors are replaced,
 *     and ZWJ/ZWNJ are kept only in the contexts permitted by UAX #31.
 *  4. Finally, all placeholders are swapped back for the original characters.
 *
 * @param string $string The string to sanitize.
 * @param int $level Controls how invisible formatting characters are handled.
 *      Values outside the range 0-2 are clamped to that range.
 *      0: Allow valid formatting characters. Use for sanitizing text in posts.
 *      1: Allow necessary formatting characters. Use for sanitizing usernames.
 *      2: Disallow all formatting characters. Use for internal comparisons
 *         only, such as in the word censor, search contexts, etc.
 * @param string $substitute Replacement string for the invalid characters.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$level = min(max((int) $level, 0), 2);
	$substitute = (string) $substitute;

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$prop_classes = utf8_regex_properties();

	// Regex alternatives for everything that must be replaced. They are
	// joined with '|' into one pattern further down.
	$disallowed = array();

	// We never want non-whitespace control characters
	$disallowed[] = '[^\P{Cc}\t\r\n]';

	// We never want private use characters or non-characters.
	// Use our own version of \p{Cn} in order to avoid possible inconsistencies
	// between our data and whichever version of PCRE happens to be installed
	// on this server. Unlike \p{Cc} and \p{Co}, which never change, the value
	// of \p{Cn} changes with every new version of Unicode.
	$disallowed[] = '[\p{Co}' . $prop_classes['Cn'] . ']';

	// Several more things we never want:
	$disallowed[] = '[' . implode('', array(
		// Soft Hyphen.
		'\x{AD}',
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		'\x{17B4}-\x{17B5}',
		// Invisible math characters.
		'\x{2061}-\x{2064}',
		// Deprecated formatting characters.
		'\x{206A}-\x{206F}',
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		'\x{FEFF}',
		// Annotation characters and Object Replacement Character.
		'\x{FFF9}-\x{FFFC}',
	)) . ']';

	switch ($level)
	{
		case 2:
			$disallowed[] = '[' . implode('', array(
				// Combining Grapheme Character.
				'\x{34F}',
				// Zero Width Non-Joiner.
				'\x{200C}',
				// Zero Width Joiner.
				'\x{200D}',
				// All variation selectors.
				$prop_classes['Variation_Selector'],
				// Tag characters.
				'\x{E0000}-\x{E007F}',
			)) . ']';

			// no break: level 2 also disallows everything level 1 does.

		case 1:
			$disallowed[] = '[' . implode('', array(
				// Zero Width Space.
				'\x{200B}',
				// Word Joiner.
				'\x{2060}',
				// "Bidi_Control" characters.
				// Disallowing means that all characters will behave according
				// to their default bidirectional text properties.
				$prop_classes['Bidi_Control'],
				// Hangul filler characters.
				// Used as placeholders in incomplete ideographs.
				'\x{115F}\x{1160}\x{3164}\x{FFA0}',
				// Shorthand formatting characters.
				'\x{1BCA0}-\x{1BCA3}',
				// Musical formatting characters.
				'\x{1D173}-\x{1D17A}',
			)) . ']';

			break;

		default:
			// Zero Width Space only allowed in certain scripts.
			$disallowed[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

			// Word Joiner disallowed inside words. (Yes, \w is Unicode safe.)
			$disallowed[] = '(?<=\w)\x{2060}(?=\w)';

			// Hangul Choseong Filler and Hangul Jungseong Filler must be
			// followed by more Hangul Jamo characters.
			$disallowed[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

			// Hangul Filler for Hangul compatibility chars.
			$disallowed[] = '\x{3164}(?![\x{3130}-\x{318F}])';

			// Halfwidth Hangul Filler for halfwidth Hangul compatibility chars.
			$disallowed[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

			// Shorthand formatting characters only with other shorthand chars.
			$disallowed[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
			$disallowed[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

			// Musical formatting characters only with other musical chars.
			$disallowed[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
			$disallowed[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';

			break;
	}

	if ($level < 2)
	{
		/*
			Combining Grapheme Character has two uses: to override standard
			search and collation behaviours, which we never want to allow, and
			to ensure correct behaviour of combining marks in a few exceptional
			cases, which is legitimate and should be allowed. This means we can
			simply test whether it is followed by a combining mark in order to
			determine whether to allow it.
		*/
		$disallowed[] = '\x{34F}(?!\p{M})';

		// Tag characters not allowed inside words.
		$disallowed[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	$string = preg_replace('/' . implode('|', $disallowed) . '/u', $substitute, $string);

	// Are we done yet?
	if (!preg_match('/[' . $prop_classes['Join_Control'] . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Variation_Selector'] . ']/u', $string))
		return $string;

	// String must be in Normalization Form C for the following checks to work.
	$string = utf8_normalize_c($string);

	// Maps original character sequences to the placeholder standing in for
	// them. Emoji and variation selector placeholders are an MD5 hash wrapped
	// in the private use characters U+ECDB and U+ECDC.
	$placeholders = array();

	// Use placeholders to preserve known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$string = preg_replace_callback(
		'/' .
		// Flag emojis
		'[' . $prop_classes['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// Emoji characters
		'[' . $prop_classes['Emoji'] . ']' .
		// Possibly followed by modifiers of various sorts
		'(' .
			'[' . $prop_classes['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?' .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' .
			'\x{200D}[' . $prop_classes['Emoji'] . ']' .
			'(' .
				'[' . $prop_classes['Emoji_Modifier'] . ']' .
				'|' .
				'\x{FE0F}\x{20E3}?' .
				'|' .
				'[\x{E0020}-\x{E007E}]+\x{E007F}' .
			')?' .
		')*' .
		'/u',
		function ($matches) use (&$placeholders)
		{
			// Skip lone ASCII characters that are not actually part of an emoji sequence.
			// This can happen because the digits 0-9 and the '*' and '#' characters are
			// the base characters for the "Emoji_Keycap_Sequence" emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
			return $placeholders[$matches[0]];
		},
		$string
	);

	// Get rid of any unsanctioned variation selectors.
	if (preg_match('/[' . $prop_classes['Variation_Selector'] . ']/u', $string))
	{
		/*
			Unicode gives pre-defined lists of sanctioned variation sequences
			and says any use of variation selectors outside those sequences is
			unsanctioned.
		*/

		// \K drops the base character from the match, so only the selector
		// itself is swapped for a placeholder.
		$patterns = array('/[' . $prop_classes['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $variation_selector => $allowed_base_chars)
			$patterns[] = '/[' . $allowed_base_chars . ']\K[' . $variation_selector . ']/u';

		// Use placeholders for sanctioned variation selectors.
		$string = preg_replace_callback(
			$patterns,
			function ($matches) use (&$placeholders)
			{
				$placeholders[$matches[0]] = "\xEE\xB3\x9B" . md5($matches[0]) . "\xEE\xB3\x9C";
				return $placeholders[$matches[0]];
			},
			$string
		);

		// Remove any unsanctioned variation selectors.
		$string = preg_replace('/[' . $prop_classes['Variation_Selector'] . ']/u', $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $prop_classes['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		// Allowed join controls are parked on the private use characters
		// U+E00C and U+E00D until the disallowed ones have been replaced.
		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// When not in strict mode, allow ZWJ at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			// Cursive scripts like Arabic use ZWNJ in certain contexts.
			// For these scripts, use test A1 for allowing ZWNJ.
			// https://unicode.org/reports/tr31/#A1
			if (isset($joining_type_classes[$script]))
			{
				$lj = !empty($classes['Left_Joining']) ? $classes['Left_Joining'] : '';
				$rj = !empty($classes['Right_Joining']) ? $classes['Right_Joining'] : '';
				$t = !empty($classes['Transparent']) ? '[' . $classes['Transparent'] . ']*' : '';

				// Dual joining characters can sit on either side of the ZWNJ.
				if (!empty($classes['Dual_Joining']))
				{
					$lj .= $classes['Dual_Joining'];
					$rj .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $lj . ']' . $t . $zwnj . $t . '[' . $rj . ']';
			}
			// Indic scripts with viramas use ZWNJ and ZWJ in certain contexts.
			// For these scripts, use tests A2 and B for allowing ZWNJ and ZWJ.
			// https://unicode.org/reports/tr31/#A2
			// https://unicode.org/reports/tr31/#B
			else
			{
				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$nonspacing_marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$nonspacing_combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_pattern = '\x{200C}(?=' . $nonspacing_combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in this
				// script or by any character from a different script.
				$zwj_pattern = '\x{200D}(?!' . (!empty($classes['Vowel_Dependent']) ? '[' . $classes['Vowel_Dependent'] . ']|' : '') . '[^' . $classes['All'] . '])';

				// Now build the pattern for this script. The \K means that only
				// the join control itself ends up in the match.
				$pattern = $letter . $nonspacing_marks . '[' . $classes['Virama'] . ']' . $nonspacing_combining_marks . '\K' . '(?:' . $zwj_pattern . '|' . $zwnj_pattern . ')';
			}

			// Do the thing.
			$string = preg_replace_callback(
				'/' . $pattern . '/u',
				function ($matches) use ($placeholders)
				{
					return strtr($matches[0], $placeholders);
				},
				$string
			);

			// Did we catch 'em all?
			if (strpos($string, $zwnj) === false && strpos($string, $zwj) === false)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	// Revert placeholders back to original characters.
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
991
992
?>