Update_Unicode   F
last analyzed

Complexity

Total Complexity 275

Size/Duplication

Total Lines 2095
Duplicated Lines 0 %

Importance

Changes 11
Bugs 0 Features 0
Metric Value
eloc 1012
c 11
b 0
f 0
dl 0
loc 2095
rs 1.552
wmc 275

21 Methods

Rating   Name   Duplication   Size   Complexity  
F build_script_stats() 0 212 35
B smf_file_header() 0 35 7
B process_main_unicode_data() 0 58 11
A deltree() 0 24 6
A lookup_ucd_version() 0 31 6
A get_function_code_and_regex() 0 54 4
C build_idna() 0 69 16
B fetch_unicode_file() 0 40 6
C process_casing_data() 0 83 13
B build_quick_check() 0 49 9
A make_temp_dir() 0 16 5
F execute() 0 218 34
C finalize_decomposition_forms() 0 70 13
A export_funcs_to_file() 0 25 5
C build_func_array() 0 75 12
B build_regex_properties() 0 64 11
F build_regex_indic() 0 169 31
D build_regex_joining_type() 0 107 19
C process_derived_normalization_props() 0 69 13
A should_update() 0 13 3
F build_regex_variation_selectors() 0 98 16

How to fix   Complexity   

Complex Class

Complex classes like Update_Unicode often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Update_Unicode, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * This file contains code used to update SMF's Unicode data files.
5
 *
6
 * Simple Machines Forum (SMF)
7
 *
8
 * @package SMF
9
 * @author Simple Machines https://www.simplemachines.org
10
 * @copyright 2025 Simple Machines and individual contributors
11
 * @license https://www.simplemachines.org/about/smf/license.php BSD
12
 *
13
 * @version 2.1.5
14
 */
15
16
/**
17
 * Class Update_Unicode
18
 */
19
class Update_Unicode extends SMF_BackgroundTask
20
{
21
	/**
22
	 * URLs where we can fetch the Unicode data files.
23
	 */
24
	const DATA_URL_UCD = 'https://unicode.org/Public/UCD/latest/ucd';
25
	const DATA_URL_IDNA = 'https://www.unicode.org/Public/idna/latest';
26
27
	/**
28
	 * @var string The latest official release of the Unicode Character Database.
29
	 */
30
	public $ucd_version = '';
31
32
	/**
33
	 * @var string Path to temporary working directory.
34
	 */
35
	public $temp_dir = '';
36
37
	/**
38
	 * @var string Convenience alias of $sourcedir . '/Unicode'.
39
	 */
40
	public $unicodedir = '';
41
42
	/**
43
	 * @var int Used to ensure we exit long running tasks cleanly.
44
	 */
45
	private $time_limit = 30;
46
47
	/**
48
	 * @var array Key-value pairs of character decompositions.
49
	 */
50
	private $full_decomposition_maps = array();
51
52
	/**
53
	 * @var array Character properties used during normalization.
54
	 */
55
	private $derived_normalization_props = array();
56
57
	/**
58
	 * @var array Assorted info about Unicode characters.
59
	 */
60
	private $char_data = array();
61
62
	/**
63
	 * @var array Statistical info about character scripts (e.g. Latin, Greek, Cyrillic, etc.)
64
	 */
65
	private $script_stats = array();
66
67
	/**
68
	 * @var array Tracks associations between character scripts' short and long names.
69
	 */
70
	private $script_aliases = array();
71
72
	/**
73
	 * @var array Info about functions to build in SMF's Unicode data files.
74
	 */
75
	private $funcs = array(
76
		array(
77
			'file' => 'Metadata.php',
78
			'regex' => '/if \(!defined\(\'SMF_UNICODE_VERSION\'\)\)\n\tdefine\(\'SMF_UNICODE_VERSION\', \'\d+(\.\d+)*\'\);/',
79
			'data' => array(
80
				// 0.0.0.0 will be replaced with correct value at runtime.
81
				"if (!defined('SMF_UNICODE_VERSION'))\n\tdefine('SMF_UNICODE_VERSION', '0.0.0.0');",
82
			),
83
		),
84
		'utf8_normalize_d_maps' => array(
85
			'file' => 'DecompositionCanonical.php',
86
			'key_type' => 'hexchar',
87
			'val_type' => 'hexchar',
88
			'desc' => array('Helper function for utf8_normalize_d.'),
89
			'return' => array(
90
				'type' => 'array',
91
				'desc' => 'Canonical Decomposition maps for Unicode normalization.',
92
			),
93
			'data' => array(),
94
		),
95
		'utf8_normalize_kd_maps' => array(
96
			'file' => 'DecompositionCompatibility.php',
97
			'key_type' => 'hexchar',
98
			'val_type' => 'hexchar',
99
			'desc' => array('Helper function for utf8_normalize_kd.'),
100
			'return' => array(
101
				'type' => 'array',
102
				'desc' => 'Compatibility Decomposition maps for Unicode normalization.',
103
			),
104
			'data' => array(),
105
		),
106
		'utf8_compose_maps' => array(
107
			'file' => 'Composition.php',
108
			'key_type' => 'hexchar',
109
			'val_type' => 'hexchar',
110
			'desc' => array('Helper function for utf8_compose.'),
111
			'return' => array(
112
				'type' => 'array',
113
				'desc' => 'Composition maps for Unicode normalization.',
114
			),
115
			'data' => array(),
116
		),
117
		'utf8_combining_classes' => array(
118
			'file' => 'CombiningClasses.php',
119
			'key_type' => 'hexchar',
120
			'val_type' => 'int',
121
			'desc' => array('Helper function for utf8_normalize_d.'),
122
			'return' => array(
123
				'type' => 'array',
124
				'desc' => 'Combining Class data for Unicode normalization.',
125
			),
126
			'data' => array(),
127
		),
128
		'utf8_strtolower_simple_maps' => array(
129
			'file' => 'CaseLower.php',
130
			'key_type' => 'hexchar',
131
			'val_type' => 'hexchar',
132
			'desc' => array('Helper function for utf8_strtolower.'),
133
			'return' => array(
134
				'type' => 'array',
135
				'desc' => 'Uppercase to lowercase maps.',
136
			),
137
			'data' => array(),
138
		),
139
		'utf8_strtolower_maps' => array(
140
			'file' => 'CaseLower.php',
141
			'key_type' => 'hexchar',
142
			'val_type' => 'hexchar',
143
			'desc' => array('Helper function for utf8_strtolower.'),
144
			'return' => array(
145
				'type' => 'array',
146
				'desc' => 'Uppercase to lowercase maps.',
147
			),
148
			'data' => array(),
149
		),
150
		'utf8_strtoupper_simple_maps' => array(
151
			'file' => 'CaseUpper.php',
152
			'key_type' => 'hexchar',
153
			'val_type' => 'hexchar',
154
			'desc' => array('Helper function for utf8_strtoupper.'),
155
			'return' => array(
156
				'type' => 'array',
157
				'desc' => 'Lowercase to uppercase maps.',
158
			),
159
			'data' => array(),
160
		),
161
		'utf8_strtoupper_maps' => array(
162
			'file' => 'CaseUpper.php',
163
			'key_type' => 'hexchar',
164
			'val_type' => 'hexchar',
165
			'desc' => array('Helper function for utf8_strtoupper.'),
166
			'return' => array(
167
				'type' => 'array',
168
				'desc' => 'Lowercase to uppercase maps.',
169
			),
170
			'data' => array(),
171
		),
172
		'utf8_titlecase_simple_maps' => array(
173
			'file' => 'CaseTitle.php',
174
			'key_type' => 'hexchar',
175
			'val_type' => 'hexchar',
176
			'desc' => array('Helper function for utf8_convert_case.'),
177
			'return' => array(
178
				'type' => 'array',
179
				'desc' => 'Simple title case maps.',
180
			),
181
			'data' => array(),
182
		),
183
		'utf8_titlecase_maps' => array(
184
			'file' => 'CaseTitle.php',
185
			'key_type' => 'hexchar',
186
			'val_type' => 'hexchar',
187
			'desc' => array('Helper function for utf8_convert_case.'),
188
			'return' => array(
189
				'type' => 'array',
190
				'desc' => 'Full title case maps.',
191
			),
192
			'data' => array(),
193
		),
194
		'utf8_casefold_simple_maps' => array(
195
			'file' => 'CaseFold.php',
196
			'key_type' => 'hexchar',
197
			'val_type' => 'hexchar',
198
			'desc' => array('Helper function for utf8_casefold.'),
199
			'return' => array(
200
				'type' => 'array',
201
				'desc' => 'Casefolding maps.',
202
			),
203
			'data' => array(),
204
		),
205
		'utf8_casefold_maps' => array(
206
			'file' => 'CaseFold.php',
207
			'key_type' => 'hexchar',
208
			'val_type' => 'hexchar',
209
			'desc' => array('Helper function for utf8_casefold.'),
210
			'return' => array(
211
				'type' => 'array',
212
				'desc' => 'Casefolding maps.',
213
			),
214
			'data' => array(),
215
		),
216
		'utf8_default_ignorables' => array(
217
			'file' => 'DefaultIgnorables.php',
218
			'key_type' => 'int',
219
			'val_type' => 'hexchar',
220
			'desc' => array('Helper function for utf8_normalize_kc_casefold.'),
221
			'return' => array(
222
				'type' => 'array',
223
				'desc' => 'Characters with the \'Default_Ignorable_Code_Point\' property.',
224
			),
225
			'data' => array(),
226
		),
227
		'utf8_regex_properties' => array(
228
			'file' => 'RegularExpressions.php',
229
			'key_type' => 'string',
230
			'val_type' => 'string',
231
			'propfiles' => array(
232
				'DerivedCoreProperties.txt',
233
				'PropList.txt',
234
				'emoji/emoji-data.txt',
235
				'extracted/DerivedGeneralCategory.txt',
236
			),
237
			'props' => array(
238
				'Bidi_Control',
239
				'Case_Ignorable',
240
				'Cn',
241
				'Default_Ignorable_Code_Point',
242
				'Emoji',
243
				'Emoji_Modifier',
244
				'Ideographic',
245
				'Join_Control',
246
				'Regional_Indicator',
247
				'Variation_Selector',
248
			),
249
			'desc' => array(
250
				'Helper function for utf8_sanitize_invisibles and utf8_convert_case.',
251
				'',
252
				'Character class lists compiled from:',
253
				'https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt',
254
				'https://unicode.org/Public/UNIDATA/PropList.txt',
255
				'https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt',
256
				'https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt',
257
			),
258
			'return' => array(
259
				'type' => 'array',
260
				'desc' => 'Character classes for various Unicode properties.',
261
			),
262
			'data' => array(),
263
		),
264
		'utf8_regex_variation_selectors' => array(
265
			'file' => 'RegularExpressions.php',
266
			'key_type' => 'string',
267
			'val_type' => 'string',
268
			'desc' => array(
269
				'Helper function for utf8_sanitize_invisibles.',
270
				'',
271
				'Character class lists compiled from:',
272
				'https://unicode.org/Public/UNIDATA/StandardizedVariants.txt',
273
				'https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt',
274
			),
275
			'return' => array(
276
				'type' => 'array',
277
				'desc' => 'Character classes for filtering variation selectors.',
278
			),
279
			'data' => array(),
280
		),
281
		'utf8_regex_joining_type' => array(
282
			'file' => 'RegularExpressions.php',
283
			'key_type' => 'string',
284
			'val_type' => 'string',
285
			'desc' => array(
286
				'Helper function for utf8_sanitize_invisibles.',
287
				'',
288
				'Character class lists compiled from:',
289
				'https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt',
290
			),
291
			'return' => array(
292
				'type' => 'array',
293
				'desc' => 'Character classes for joining characters in certain scripts.',
294
			),
295
			'data' => array(),
296
		),
297
		'utf8_regex_indic' => array(
298
			'file' => 'RegularExpressions.php',
299
			'key_type' => 'string',
300
			'val_type' => 'string',
301
			'desc' => array(
302
				'Helper function for utf8_sanitize_invisibles.',
303
				'',
304
				'Character class lists compiled from:',
305
				'https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt',
306
				'https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt',
307
			),
308
			'return' => array(
309
				'type' => 'array',
310
				'desc' => 'Character classes for Indic scripts that use viramas.',
311
			),
312
			'data' => array(),
313
		),
314
		'utf8_regex_quick_check' => array(
315
			'file' => 'QuickCheck.php',
316
			'key_type' => 'string',
317
			'val_type' => 'string',
318
			'desc' => array(
319
				'Helper function for utf8_is_normalized.',
320
				'',
321
				'Character class lists compiled from:',
322
				'https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt',
323
			),
324
			'return' => array(
325
				'type' => 'array',
326
				'desc' => 'Character classes for disallowed characters in normalization forms.',
327
			),
328
			'data' => array(),
329
		),
330
		'idna_maps' => array(
331
			'file' => 'Idna.php',
332
			'key_type' => 'hexchar',
333
			'val_type' => 'hexchar',
334
			'desc' => array('Helper function for idn_to_* polyfills.'),
335
			'return' => array(
336
				'type' => 'array',
337
				'desc' => 'Character maps for IDNA processing.',
338
			),
339
			'data' => array(),
340
		),
341
		'idna_maps_deviation' => array(
342
			'file' => 'Idna.php',
343
			'key_type' => 'hexchar',
344
			'val_type' => 'hexchar',
345
			'desc' => array('Helper function for idn_to_* polyfills.'),
346
			'return' => array(
347
				'type' => 'array',
348
				'desc' => '"Deviation" character maps for IDNA processing.',
349
			),
350
			'data' => array(),
351
		),
352
		'idna_regex' => array(
353
			'file' => 'Idna.php',
354
			'key_type' => 'string',
355
			'val_type' => 'string',
356
			'desc' => array('Helper function for idn_to_* polyfills.'),
357
			'return' => array(
358
				'type' => 'array',
359
				'desc' => 'Regular expressions useful for IDNA processing.',
360
			),
361
			'data' => array(),
362
		),
363
	);
364
365
	/**
366
	 * @var array Files to fetch from unicode.org.
367
	 */
368
	private $prefetch = array(
369
		self::DATA_URL_UCD => array(
370
			'CaseFolding.txt',
371
			'DerivedAge.txt',
372
			'DerivedCoreProperties.txt',
373
			'DerivedNormalizationProps.txt',
374
			'IndicSyllabicCategory.txt',
375
			'PropertyValueAliases.txt',
376
			'PropList.txt',
377
			'ScriptExtensions.txt',
378
			'Scripts.txt',
379
			'SpecialCasing.txt',
380
			'StandardizedVariants.txt',
381
			'UnicodeData.txt',
382
			'emoji/emoji-data.txt',
383
			'emoji/emoji-variation-sequences.txt',
384
			'extracted/DerivedGeneralCategory.txt',
385
			'extracted/DerivedJoiningType.txt',
386
		),
387
		self::DATA_URL_IDNA => array(
388
			'IdnaMappingTable.txt',
389
		),
390
	);
391
392
	/**
	 * This executes the task.
	 *
	 * Runs in five parts:
	 *  1. Setup: creates the temp directory and lock file, checks whether an
	 *     update is needed, prepares the target and temp files, and prefetches
	 *     the Unicode data files. If prefetching fails or runs short on time,
	 *     a new copy of this task is queued and this run ends.
	 *  2. Builds the normalization and casing data.
	 *  3. Builds the regular expression data.
	 *  4. Builds the IDNA data.
	 *  5. Wrapup: if every step succeeded, copies changed temp files into the
	 *     Unicode directory, reverting to the backups if any write fails.
	 *
	 * @return bool Always returns true
	 */
	public function execute()
	{
		global $sourcedir, $smcFunc, $txt;

		/*****************
		 * Part 1: Setup *
		 *****************/
		$this->unicodedir = $sourcedir . DIRECTORY_SEPARATOR . 'Unicode';

		// We need a temporary directory to hold our files while we work on them.
		$this->make_temp_dir();

		if (empty($this->temp_dir))
			return true;

		// Prevent race conditions.
		if (is_file($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
			return true;

		if (!@touch($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
			return true;

		// Make sure the lock is released however this request ends.
		register_shutdown_function(function () {
			if (file_exists($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
				unlink($this->temp_dir . DIRECTORY_SEPARATOR . 'lock');
		});

		// Do we even need to update?
		if (!$this->should_update())
		{
			$this->deltree($this->temp_dir);
			return true;
		}

		@ini_set('memory_limit', '256M');

		// Use the longer limit if there is no limit or if we managed to raise it.
		// Otherwise stick to the configured one, as an integer.
		$max_execution_time = ini_get('max_execution_time');

		if (empty($max_execution_time) || @set_time_limit(MAX_CLAIM_THRESHOLD) !== false)
			$this->time_limit = MAX_CLAIM_THRESHOLD;
		else
			$this->time_limit = (int) $max_execution_time;

		$file_paths = array();
		$file_contents = array();

		// Iterate by value. Nothing here modifies the entries, and a leftover
		// reference would be clobbered by the loops in the wrapup below.
		foreach ($this->funcs as $func_info)
		{
			$file_paths['final'] = implode(DIRECTORY_SEPARATOR, array($this->unicodedir, $func_info['file']));

			if (!file_exists($file_paths['final']))
				touch($file_paths['final']);

			if (!is_file($file_paths['final']) || !smf_chmod($file_paths['final']))
			{
				loadLanguage('Errors');
				log_error(sprintf($txt['unicode_update_failed'], $this->unicodedir));
				return true;
			}

			$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));

			if (!file_exists($file_paths['temp']))
				touch($file_paths['temp']);

			if (!is_file($file_paths['temp']) || !smf_chmod($file_paths['temp']))
			{
				loadLanguage('Errors');
				log_error(sprintf($txt['unicode_update_failed'], $this->temp_dir));
				return true;
			}

			$file_contents['temp'] = file_get_contents($file_paths['temp']);

			// New temp files get the standard header. Existing ones lose their
			// closing PHP tag so that more code can be appended to them.
			if (empty($file_contents['temp']))
			{
				file_put_contents($file_paths['temp'], $this->smf_file_header());
			}
			elseif (substr($file_contents['temp'], -2) === '?' . '>')
			{
				file_put_contents($file_paths['temp'], substr($file_contents['temp'], 0, -2));
			}
		}

		// Prefetch the files in case the network is slow.
		foreach ($this->prefetch as $data_url => $files)
		{
			$max_fetch_time = 0;

			foreach ($files as $filename)
			{
				$fetch_start = microtime(true);

				$local_file = $this->fetch_unicode_file($filename, $data_url);

				$max_fetch_time = max($max_fetch_time, microtime(true) - $fetch_start);

				// If prefetch is taking a really long time, pause and try again later.
				if ($local_file === false || microtime(true) - TIME_START >= $this->time_limit - $max_fetch_time)
				{
					$smcFunc['db_insert']('',
						'{db_prefix}background_tasks',
						array(
							'task_file' => 'string',
							'task_class' => 'string',
							'task_data' => 'string',
							'claimed_time' => 'int',
						),
						array(
							'$sourcedir/tasks/UpdateUnicode.php',
							'Update_Unicode',
							'',
							time() - MAX_CLAIM_THRESHOLD,
						),
						array('id_task')
					);

					return true;
				}
			}
		}

		// Track whether anything goes wrong along the way.
		// The method call is always the left operand of &&, so every step
		// still runs even after an earlier one has failed.
		$success = true;

		/*********************************************
		 * Part 2: Normalization, case folding, etc. *
		 *********************************************/
		$success = $this->process_derived_normalization_props() && $success;
		$success = $this->process_main_unicode_data() && $success;
		$success = $this->process_casing_data() && $success;
		$success = $this->finalize_decomposition_forms() && $success;

		$this->full_decomposition_maps = array();

		$this->export_funcs_to_file();

		/***********************************
		 * Part 3: Regular expression data *
		 ***********************************/
		$success = $this->build_quick_check() && $success;

		$this->derived_normalization_props = array();

		$success = $this->build_regex_properties() && $success;
		$success = $this->build_regex_variation_selectors() && $success;
		$success = $this->build_script_stats() && $success;
		$success = $this->build_regex_joining_type() && $success;
		$success = $this->build_regex_indic() && $success;

		unset($this->funcs['utf8_combining_classes']['data']);

		$this->export_funcs_to_file();

		/*********************************
		 * Part 4: IDNA maps and regexes *
		 *********************************/
		$success = $this->build_idna() && $success;

		$this->export_funcs_to_file();

		/*******************
		 * Part 5: Wrapup. *
		 *******************/
		if ($success)
		{
			require_once($sourcedir . '/Subs-Admin.php');

			foreach ($this->funcs as $func_info)
			{
				$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];

				// If the temp file went missing, bail out immediately.
				if (!is_readable($file_paths['temp']) || !is_writable($file_paths['temp']))
					return true;

				// Add closing PHP tag to the temp file.
				if (!preg_match('/[?]>$/', file_get_contents($file_paths['temp'])))
					file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND);
			}

			foreach ($this->funcs as $func_info)
			{
				$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
				$file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file'];

				// Only move if the file has changed, discounting the license block.
				foreach (array('temp', 'real') as $f)
				{
					if (file_exists($file_paths[$f]))
					{
						$file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f]));
					}
					else
						$file_contents[$f] = '';
				}

				if ($file_contents['temp'] === '')
				{
					$success = false;
				}
				elseif ($file_contents['temp'] !== $file_contents['real'])
				{
					$success = safe_file_write($file_paths['real'], file_get_contents($file_paths['temp']), $file_paths['real'] . '.bak', time() + 1) && $success;
				}
			}

			// glob() can return false on error, so fall back to an empty list.
			$backups = glob($this->unicodedir . DIRECTORY_SEPARATOR . '*.bak') ?: array();

			// If we wrote all the files successfully, remove the backup files.
			if ($success)
			{
				foreach ($backups as $path)
					unlink($path);
			}
			// If any file failed to write, revert all of them.
			else
			{
				// Strip the trailing '.bak' to get the original file's path.
				foreach ($backups as $path)
					rename($path, substr($path, 0, -4));
			}
		}

		// Clean up after ourselves.
		$this->deltree($this->temp_dir);

		// All done.
		return true;
	}
616
617
	/**
	 * Prepares the working directory used by this task.
	 *
	 * The directory is named 'Unicode' and sits inside the directory
	 * reported by sm_temp_dir(). Its path is stored in $this->temp_dir.
	 * If the directory does not exist after trying to create it, or if
	 * smf_chmod() fails on it, $this->temp_dir is set to null instead.
	 *
	 * Does nothing when $this->temp_dir has already been set.
	 */
	private function make_temp_dir()
	{
		global $sourcedir;

		// Already have one? Then we are done here.
		if (!empty($this->temp_dir))
			return;

		require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php');

		$path = rtrim(sm_temp_dir(), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'Unicode';

		if (!is_dir($path))
			@mkdir($path);

		// Needs to be a writable directory.
		$usable = is_dir($path) && smf_chmod($path);

		$this->temp_dir = $usable ? $path : null;
	}
639
640
	/**
	 * Fetches the contents of a Unicode data file.
	 *
	 * Caches a local copy for subsequent lookups. Files are stored under
	 * $this->temp_dir in an 'idna' subdirectory for DATA_URL_IDNA and in a
	 * 'ucd' subdirectory for anything else.
	 *
	 * @param string $filename Name of a Unicode datafile, relative to $data_url.
	 * @param string $data_url One of this class's DATA_URL_* constants.
	 *
	 * @return string|false Path to locally saved copy of the file, or false
	 *    if the file could not be downloaded or saved.
	 */
	private function fetch_unicode_file($filename, $data_url)
	{
		$filename = ltrim($filename, '\\/');
		$file_url_name = strtr($filename, array('\\' => '/'));
		$file_local_name = strtr($filename, array('\\' => DIRECTORY_SEPARATOR, '/' => DIRECTORY_SEPARATOR));

		$sub_dir = $data_url === self::DATA_URL_IDNA ? 'idna' : 'ucd';

		$local_file = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $sub_dir, $file_local_name));

		// Already fetched? Use the cached copy.
		if (file_exists($local_file))
			return $local_file;

		$local_dir = dirname($local_file);

		if (!file_exists($local_dir))
		{
			@mkdir($local_dir, 0777, true);

			if (!is_dir($local_dir))
				return false;
		}

		$file_contents = fetch_web_data($data_url . '/' . $file_url_name);

		if (empty($file_contents))
			return false;

		// A failed or partial write must not be mistaken for a valid cached
		// copy on the next lookup, so remove whatever was written and fail.
		if (file_put_contents($local_file, $file_contents) !== strlen($file_contents))
		{
			if (file_exists($local_file))
				@unlink($local_file);

			return false;
		}

		return $local_file;
	}
691
692
	/**
	 * Deletes a directory and its contents.
	 *
	 * As a safety measure, nothing is deleted unless $this->temp_dir is set
	 * and the given path starts with it.
	 *
	 * @param string $dir_path Path to directory
	 */
	private function deltree($dir_path)
	{
		// For safety. An empty temp_dir must never match: strpos() with an
		// empty needle returns 0 on PHP 8, which would approve any path.
		if (empty($this->temp_dir) || strpos($dir_path, $this->temp_dir) !== 0)
			return;

		// Nothing to do if it is already gone. This also avoids the exception
		// that DirectoryIterator throws for a path it cannot open.
		if (!is_dir($dir_path))
			return;

		$dir = new DirectoryIterator($dir_path);

		// Collect the files first, then delete them once iteration is over.
		$to_delete = array();
		foreach ($dir as $fileinfo)
		{
			if ($fileinfo->isDot())
				continue;

			if ($fileinfo->isDir())
				$this->deltree($fileinfo->getPathname());
			else
				$to_delete[] = $fileinfo->getPathname();
		}

		foreach ($to_delete as $pathname)
			unlink($pathname);

		rmdir($dir_path);
	}
722
723
	/**
	 * Builds the boilerplate that starts each generated PHP file.
	 *
	 * The header consists of the opening PHP tag, a license block derived
	 * from the text of the first entry returned by get_settings_defs(), and
	 * the usual "No direct access" guard. The result is built once and then
	 * reused on later calls.
	 *
	 * @return string Standard SMF file header.
	 */
	private function smf_file_header()
	{
		global $sourcedir;

		static $header;

		// Reuse what we built earlier, if anything.
		if (!empty($header))
			return $header;

		require_once($sourcedir . '/Subs-Admin.php');
		$defs = get_settings_defs();

		$license = '';
		$keeping = true;

		foreach (explode("\n", $defs[0]['text']) as $line)
		{
			// Lines are dropped after the comment opener until one mentions SMF.
			$mentions_smf = strpos($line, 'SMF') !== false || strpos($line, 'Simple Machines') !== false;
			$keeping = $keeping || $mentions_smf;

			if ($keeping)
				$license .= $line . "\n";

			if ($line === '/**')
				$keeping = false;
		}

		$guard = "if (!defined('SMF'))\n\tdie('No direct access...');";

		// Sections are separated by blank lines, with one more after the guard.
		$header = '<' . '?php' . "\n\n" . trim($license) . "\n\n" . $guard . "\n\n";

		return $header;
	}
764
765
	/**
	 * Updates Unicode data functions in their designated files.
	 *
	 * For each entry in $this->funcs that still has a 'data' element, this
	 * builds the code for it and writes that code to the entry's temp file.
	 * If a copy is already in the file it is replaced; otherwise the code is
	 * appended. Afterward the entry's data is discarded to free memory,
	 * except for 'utf8_combining_classes', whose data is kept.
	 */
	public function export_funcs_to_file()
	{
		foreach ($this->funcs as $func_name => $func_info)
		{
			if (!isset($func_info['data']))
				continue;

			$temp_file_path = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];

			list($func_code, $func_regex) = $this->get_function_code_and_regex($func_name);

			$file_contents = file_get_contents($temp_file_path);

			if (preg_match($func_regex, $file_contents))
			{
				// Use a callback so the generated code is inserted literally.
				// As a plain replacement string, any backslash or dollar sign
				// sequences in it would be treated as escapes or backreferences.
				$new_contents = preg_replace_callback(
					$func_regex,
					function ($matches) use ($func_code)
					{
						return $func_code;
					},
					$file_contents
				);

				file_put_contents($temp_file_path, $new_contents);
			}
			else
			{
				file_put_contents($temp_file_path, $func_code . "\n\n", FILE_APPEND);
			}

			// Free up some memory.
			if ($func_name != 'utf8_combining_classes')
				unset($this->funcs[$func_name]['data']);
		}
	}
795
796
	/**
	 * Generates the code for one element of $this->funcs, ready to be
	 * written into the relevant PHP file, along with a regular expression
	 * that detects whether a copy of that code is already in the file.
	 *
	 * Elements with a string key become a documented function that returns
	 * the element's data as an array. Elements with any other key are
	 * treated as raw code.
	 *
	 * @param string $func_name Key of an element in $this->funcs.
	 *
	 * @return array PHP code and a regular expression.
	 */
	private function get_function_code_and_regex($func_name)
	{
		$spec = $this->funcs[$func_name];

		if (is_string($func_name))
		{
			// Matches an existing copy of the function, with or without its comment.
			$func_regex = "/(\/\*([^*]|\*(?!\/))*\*\/\n)?function $func_name\(\)\n{.+?\n}/s";

			// Lines of the PHPDoc comment. The empty first line puts the
			// description on the line after the comment opener.
			$doc_lines = array_merge(array(''), $spec['desc']);
			$doc_lines[] = '';
			$doc_lines[] = 'Developers: Do not update the data in this function manually. Instead,';
			$doc_lines[] = 'run "php -f other/update_unicode_data.php" on the command line.';

			if (!empty($spec['return']))
			{
				$doc_lines[] = '';
				$doc_lines[] = '@return ' . implode(' ', $spec['return']);
			}

			$func_code = '/**' . implode("\n * ", $doc_lines) . "\n */\n";

			// Opening lines of the function.
			$func_code .= 'function ' . $func_name . '()' . "\n" . '{' . "\n" . "\t" . 'return array(' . "\n";

			// The data itself.
			$this->build_func_array($func_code, $spec['data'], $spec['key_type'], $spec['val_type']);

			// Closing lines of the function.
			$func_code .= "\t" . ');' . "\n" . '}';
		}
		else
		{
			// No function name means data is raw code.
			$func_code = implode("\n\n", $spec['data']);

			if (isset($spec['regex']))
				$func_regex = $spec['regex'];
			else
				$func_regex = '/' . preg_quote($func_code, '/') . '/';
		}

		// Some final tidying.
		$func_code = str_replace('\\\\x', '\x', $func_code);
		$func_code = preg_replace('/\h+$/m', '', $func_code);

		return array($func_code, $func_regex);
	}
861
862
	/**
	 * Helper for get_function_code_and_regex(). Builds the function's data array.
	 *
	 * Appends one entry of PHP array source to $func_code for every element of
	 * $data. Nested arrays are written recursively with one more tab of
	 * indentation, except that a flat array is written as a multi-line string
	 * concatenation when $val_type is 'string'.
	 *
	 * @param string &$func_code The raw string that contains function code.
	 * @param array $data Data to format as an array.
	 * @param string $key_type How to format the array keys. 'hexchar' writes
	 *    the key as a double-quoted string of \x byte escapes, 'string' writes
	 *    non-integer keys using var_export(), anything else writes no key.
	 * @param string $val_type How to format the array values. 'hexchar' and
	 *    'string' work as for keys, anything else writes the value verbatim.
	 */
	private function build_func_array(&$func_code, $data, $key_type, $val_type)
	{
		// Indentation depth. Static so that recursive calls share it.
		static $depth = 2;

		// Decodes a string of numeric entities (spaces are ignored) and spells
		// out each byte of the result as an uppercase \x escape.
		$as_hex_escapes = function ($entities)
		{
			$decoded = mb_decode_numericentity(str_replace(' ', '', $entities), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

			$escapes = '';

			foreach (unpack('C*', $decoded) as $byte)
			{
				$escapes .= '\x' . strtoupper(dechex($byte));
			}

			return $escapes;
		};

		foreach ($data as $key => $value)
		{
			$func_code .= str_repeat("\t", $depth);

			// First the key, if this key type calls for one.
			if ($key_type == 'hexchar')
			{
				$func_code .= '"' . $as_hex_escapes($key) . '" => ';
			}
			elseif ($key_type == 'string' && !is_int($key))
			{
				$func_code .= var_export($key, true) . ' => ';
			}

			// Then the value.
			if (!is_array($value))
			{
				if ($val_type == 'hexchar')
				{
					$func_code .= '"' . $as_hex_escapes($value) . '"';
				}
				elseif ($val_type == 'string')
				{
					$func_code .= var_export($value, true);
				}
				else
				{
					$func_code .= $value;
				}
			}
			elseif ($val_type == 'string' && count($value) === count($value, COUNT_RECURSIVE))
			{
				// A flat array of strings becomes one concatenated string,
				// with each piece on its own, further indented, line.
				$separator = "\n" . str_repeat("\t", $depth + 1);

				$pieces = array();

				foreach ($value as $piece)
				{
					$pieces[] = var_export($piece, true);
				}

				$func_code = rtrim($func_code) . $separator . implode(' .' . $separator, $pieces);
			}
			else
			{
				// Anything else becomes a nested array, one level deeper.
				$func_code .= 'array(' . "\n";

				$depth++;
				$this->build_func_array($func_code, $value, $key_type, $val_type);
				$depth--;

				$func_code .= str_repeat("\t", $depth) . ')';
			}

			$func_code .= ',' . "\n";
		}
	}
947
948
	/**
	 * Compares version of SMF's local Unicode data with the latest release.
	 *
	 * The latest version is looked up via lookup_ucd_version(). The local
	 * version is the SMF_UNICODE_VERSION constant, which is expected to be
	 * defined by the Metadata.php file in $this->unicodedir.
	 *
	 * @return bool Whether SMF should update its local Unicode data or not.
	 *    False if the latest version could not be determined. True if
	 *    Metadata.php cannot be included or does not define the constant.
	 *    Otherwise true when the latest version is greater than or equal to
	 *    the local one.
	 */
	private function should_update()
	{
		$this->lookup_ucd_version();

		// We can't do anything if lookup failed.
		if (empty($this->ucd_version))
			return false;

		// If this file is missing, force an update.
		if (!@include_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php'))
			return true;

		// Likewise if the file exists but never told us its version. Reading
		// an undefined constant would be a fatal error on PHP 8.
		if (!defined('SMF_UNICODE_VERSION'))
			return true;

		return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>=');
	}
967
968
	/**
	 * Sets $this->ucd_version to latest version number of the UCD.
	 *
	 * The version is read from the first "Version x.y.z" string found in the
	 * UCD's ReadMe.txt and padded with zeros to four dot-separated parts.
	 * The placeholder '0.0.0.0' in the first data element of the
	 * Metadata.php entry of $this->funcs is replaced with that version.
	 *
	 * @return bool True if $this->ucd_version is set (now or previously),
	 *    false if the file could not be fetched, read, or parsed.
	 */
	private function lookup_ucd_version()
	{
		// Already looked up.
		if (!empty($this->ucd_version))
			return true;

		$local_file = $this->fetch_unicode_file('ReadMe.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		$readme = file_get_contents($local_file);

		if ($readme === false)
			return false;

		preg_match('/Version\s+(\d+(?:\.\d+)*)/', $readme, $matches);

		if (empty($matches[1]))
			return false;

		$this->ucd_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0'));

		// Update this while we are at it. Written back through the key rather
		// than a by-reference loop variable, so no reference is left dangling.
		foreach ($this->funcs as $func_name => $func_info)
		{
			if ($func_info['file'] === 'Metadata.php')
			{
				$this->funcs[$func_name]['data'][0] = str_replace('0.0.0.0', $this->ucd_version, $func_info['data'][0]);

				break;
			}
		}

		return true;
	}
1003
1004
	/**
	 * Processes DerivedNormalizationProps.txt in order to populate
	 * $this->derived_normalization_props.
	 *
	 * The result is indexed by property name and then by character entity.
	 * Lines without a third field map each entity to itself. For FC_NFKC and
	 * NFKC_CF the third field is converted to a string of entities. For any
	 * other property the third field is stored as is.
	 *
	 * @return bool False if the file could not be fetched, true otherwise.
	 */
	private function process_derived_normalization_props()
	{
		$local_file = $this->fetch_unicode_file('DerivedNormalizationProps.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $raw_line)
		{
			// Drop the trailing comment, then skip lines that hold no data.
			$data_part = substr($raw_line, 0, strcspn($raw_line, '#'));

			if (strpos($data_part, ';') === false)
			{
				continue;
			}

			$fields = array_map('trim', explode(';', $data_part));

			$prop = $fields[1];

			if (!isset($this->derived_normalization_props[$prop]))
			{
				$this->derived_normalization_props[$prop] = array();
			}

			// Which characters does this line talk about?
			if (strpos($fields[0], '..') !== false)
			{
				list($first, $last) = explode('..', $fields[0]);

				$entities = array();

				for ($ord = hexdec($first), $ord_last = hexdec($last); $ord <= $ord_last; $ord++)
				{
					$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
				}
			}
			else
			{
				$entities = array('&#x' . $fields[0] . ';');
			}

			// And what does it say about them?
			if (!isset($fields[2]))
			{
				$value = 'SAME';
			}
			elseif (in_array($prop, array('FC_NFKC', 'NFKC_CF')))
			{
				$mapping = trim($fields[2]);

				$value = $mapping !== '' ? '&#x' . str_replace(' ', '; &#x', $mapping) . ';' : '';
			}
			else
			{
				$value = $fields[2];
			}

			// 'SAME' means that each character is recorded as its own value.
			$maps_to_self = $value === 'SAME';

			foreach ($entities as $entity)
			{
				$this->derived_normalization_props[$prop][$entity] = $maps_to_self ? $entity : $value;
			}
		}

		return true;
	}
1078
1079
	/**
	 * Processes UnicodeData.txt in order to populate $this->char_data,
	 * $this->full_decomposition_maps, and the 'data' element of most elements
	 * of $this->funcs.
	 *
	 * Fields used, by index: 0 = code point, 2 = general category,
	 * 3 = combining class, 5 = decomposition mapping, 12 = simple uppercase,
	 * 13 = simple lowercase, 14 = simple titlecase.
	 *
	 * @return bool False if the file could not be fetched, true otherwise.
	 */
	private function process_main_unicode_data()
	{
		$local_file = $this->fetch_unicode_file('UnicodeData.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = array_map('trim', explode(';', $line));

			// Blank or truncated lines do not have the fields read below.
			if (count($fields) < 15)
			{
				continue;
			}

			$entity = '&#x' . $fields[0] . ';';

			// Combining classes. Class 0 is deliberately left out by empty().
			if (!empty($fields[3]))
			{
				$this->funcs['utf8_combining_classes']['data'][$entity] = $fields[3];
			}

			// Uppercase maps.
			if ($fields[12] !== '')
			{
				$this->funcs['utf8_strtoupper_simple_maps']['data'][$entity] = '&#x' . $fields[12] . ';';
			}

			// Lowercase maps.
			if ($fields[13] !== '')
			{
				$this->funcs['utf8_strtolower_simple_maps']['data'][$entity] = '&#x' . $fields[13] . ';';
			}

			// Titlecase maps, where different from uppercase maps.
			if ($fields[14] !== '' && $fields[14] !== $fields[12])
			{
				$this->funcs['utf8_titlecase_simple_maps']['data'][$entity] = '&#x' . $fields[14] . ';';
			}

			// Remember this character's general category for later.
			$this->char_data[$entity]['General_Category'] = $fields[2];

			if ($fields[5] === '')
			{
				continue;
			}

			// All canonical decompositions AND all compatibility decompositions.
			// strip_tags() removes the leading <tag> of the latter.
			$this->full_decomposition_maps[$entity] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';

			// Just the canonical decompositions, i.e. those without a <tag>.
			if (strpos($fields[5], '<') === false)
			{
				$this->funcs['utf8_normalize_d_maps']['data'][$entity] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';';
			}
		}

		return true;
	}
1143
1144
	/**
	 * Processes SpecialCasing.txt and CaseFolding.txt in order to get
	 * finalized versions of all case conversion data.
	 *
	 * The full case maps start out as copies of the simple ones and are then
	 * overridden by the unconditional entries of SpecialCasing.txt. The case
	 * folding maps are built from CaseFolding.txt entries with status C or F
	 * (full folding) and C or S (simple folding).
	 *
	 * @return bool False if either file could not be fetched, true otherwise.
	 */
	private function process_casing_data()
	{
		// Strips the comment from a line and returns its trimmed fields, or
		// null if the line holds no data.
		$parse_line = function ($line)
		{
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
			{
				return null;
			}

			return array_map('trim', explode(';', $line));
		};

		// Turns space separated hex code points into a string of entities.
		$to_entities = function ($code_points)
		{
			return '&#x' . str_replace(' ', '; &#x', trim($code_points)) . ';';
		};

		// Full case conversion maps are the same as the simple ones, unless they're not.
		foreach (array('strtoupper', 'strtolower', 'titlecase') as $conversion)
		{
			$this->funcs['utf8_' . $conversion . '_maps']['data'] = $this->funcs['utf8_' . $conversion . '_simple_maps']['data'];
		}

		// Deal with the special casing data.
		$local_file = $this->fetch_unicode_file('SpecialCasing.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null)
			{
				continue;
			}

			// Only unconditional mappings are handled here.
			// Note: conditional mappings need to be handled by more complex code.
			if (!empty($fields[4]))
			{
				continue;
			}

			$entity = '&#x' . $fields[0] . ';';

			$this->funcs['utf8_strtolower_maps']['data'][$entity] = $to_entities($fields[1]);

			$this->funcs['utf8_strtoupper_maps']['data'][$entity] = $to_entities($fields[3]);

			// Titlecase only where different from uppercase.
			if ($fields[3] !== $fields[2])
			{
				$this->funcs['utf8_titlecase_maps']['data'][$entity] = $to_entities($fields[2]);
			}
		}

		ksort($this->funcs['utf8_strtolower_maps']['data']);
		ksort($this->funcs['utf8_strtoupper_maps']['data']);
		ksort($this->funcs['utf8_titlecase_maps']['data']);

		// Deal with the case folding data.
		$local_file = $this->fetch_unicode_file('CaseFolding.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null)
			{
				continue;
			}

			$entity = '&#x' . $fields[0] . ';';

			// Full casefolding.
			if (in_array($fields[1], array('C', 'F')))
			{
				$this->funcs['utf8_casefold_maps']['data'][$entity] = $to_entities($fields[2]);
			}

			// Simple casefolding.
			if (in_array($fields[1], array('C', 'S')))
			{
				$this->funcs['utf8_casefold_simple_maps']['data'][$entity] = $to_entities($fields[2]);
			}
		}

		return true;
	}
1232
1233
	/**
	 * Finalizes all the decomposition forms.
	 *
	 * This is necessary because some characters decompose to other characters
	 * that themselves decompose further.
	 *
	 * Also fills the 'utf8_compose_maps' data (the inverse of the canonical
	 * decompositions as they stand before expansion, minus the characters
	 * listed under Full_Composition_Exclusion) and the 'utf8_normalize_kd_maps'
	 * data (the full decompositions that differ from the canonical ones).
	 *
	 * @return bool Always true.
	 */
	private function finalize_decomposition_forms()
	{
		// Repeatedly replaces every part of every decomposition that has a
		// decomposition of its own, until a pass changes nothing.
		$expand_fully = function ($maps)
		{
			do
			{
				$expanded = array();

				foreach ($maps as $composed => $decomposed)
				{
					$parts = explode(' ', $decomposed);

					foreach ($parts as $partnum => $hex)
					{
						if (isset($maps[$hex]))
						{
							$parts[$partnum] = $maps[$hex];
						}
					}

					$expanded[$composed] = implode(' ', $parts);
				}

				$changed = $maps !== $expanded;

				$maps = $expanded;
			}
			while ($changed);

			return $maps;
		};

		// First we do the compatibility decomposition forms.
		$this->full_decomposition_maps = $expand_fully($this->full_decomposition_maps);

		// Flip the exclusion list once so that each check below is a hash
		// lookup rather than a scan of the whole list. If the list was never
		// loaded, nothing is treated as excluded.
		$excluded = array();

		if (!empty($this->derived_normalization_props['Full_Composition_Exclusion']))
		{
			$excluded = array_flip($this->derived_normalization_props['Full_Composition_Exclusion']);
		}

		// Composition uses the canonical decompositions as given in the data
		// file, so build those maps before expanding anything.
		$canonical = $this->funcs['utf8_normalize_d_maps']['data'];

		foreach ($canonical as $composed => $decomposed)
		{
			if (!isset($excluded[$composed]))
			{
				$this->funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
			}
		}

		// Same as above, but using only canonical decompositions.
		$this->funcs['utf8_normalize_d_maps']['data'] = $expand_fully($canonical);

		// Avoid bloat.
		$this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']);

		return true;
	}
1310
1311
	/**
	 * Builds regular expressions for normalization quick check.
	 *
	 * For each quick check property, the characters recorded in
	 * $this->derived_normalization_props are collapsed into runs of
	 * consecutive code points. Each run is stored in the
	 * 'utf8_regex_quick_check' data as "\x{XXXX}" or "\x{XXXX}-\x{YYYY}".
	 *
	 * @return bool Always true.
	 */
	private function build_quick_check()
	{
		// Formats a run of code points for use inside a character class.
		$format_range = function ($start, $end)
		{
			$formatted = '\\x{' . strtoupper(sprintf('%04s', dechex($start))) . '}';

			if ($start != $end)
			{
				$formatted .= '-\\x{' . strtoupper(sprintf('%04s', dechex($end))) . '}';
			}

			return $formatted;
		};

		$props = array('NFC_QC', 'NFKC_QC', 'NFD_QC', 'NFKD_QC', 'Changes_When_NFKC_Casefolded');

		foreach ($props as $prop)
		{
			$start = null;
			$end = null;

			foreach ($this->derived_normalization_props[$prop] as $entity => $nm)
			{
				$ord = hexdec(trim($entity, '&#x;'));

				if (!isset($start))
				{
					$start = $ord;
				}

				// Still in the same run?
				if (!isset($end) || $ord == $end + 1)
				{
					$end = $ord;

					continue;
				}

				// The run ended, so save it and begin a new one here.
				$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $format_range($start, $end);

				$start = $ord;
				$end = $ord;
			}

			// Save whatever run was still open when we ran out of characters.
			if (isset($start))
			{
				$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $format_range($start, $end);
			}
		}

		return true;
	}
1364
1365
	/**
	 * Builds regular expression classes for extended Unicode properties.
	 *
	 * Reads every file listed in the 'propfiles' element of the
	 * 'utf8_regex_properties' entry and stores, for each property named in
	 * its 'props' element, the matching code points as "\x{XXXX}" or
	 * "\x{XXXX}-\x{YYYY}" strings. The individual characters that have the
	 * Default_Ignorable_Code_Point property are also collected, as entities,
	 * in the 'utf8_default_ignorables' data.
	 *
	 * @return bool False if a file could not be fetched, true otherwise.
	 */
	private function build_regex_properties()
	{
		foreach ($this->funcs['utf8_regex_properties']['propfiles'] as $filename)
		{
			$local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD);

			if (empty($local_file))
				return false;

			foreach (file($local_file) as $raw_line)
			{
				// Drop the trailing comment, then skip lines that hold no data.
				$data_part = substr($raw_line, 0, strcspn($raw_line, '#'));

				if (strpos($data_part, ';') === false)
				{
					continue;
				}

				$fields = array_map('trim', explode(';', $data_part));

				$code_points = $fields[0];
				$prop = $fields[1];

				if (in_array($prop, $this->funcs['utf8_regex_properties']['props']))
				{
					if (!isset($this->funcs['utf8_regex_properties']['data'][$prop]))
					{
						$this->funcs['utf8_regex_properties']['data'][$prop] = array();
					}

					$this->funcs['utf8_regex_properties']['data'][$prop][] = '\\x{' . str_replace('..', '}-\\x{', $code_points) . '}';
				}

				// We also track 'Default_Ignorable_Code_Point' property in a separate array.
				if ($prop !== 'Default_Ignorable_Code_Point')
				{
					continue;
				}

				if (strpos($code_points, '..') === false)
				{
					$this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $code_points . ';';

					continue;
				}

				// A range, so list every character in it.
				list($first, $last) = explode('..', $code_points);

				for ($ord = hexdec($first), $ord_last = hexdec($last); $ord <= $ord_last; $ord++)
				{
					$this->funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
				}
			}
		}

		ksort($this->funcs['utf8_regex_properties']['data']);

		return true;
	}
1433
1434
	/**
	 * Builds regular expression classes for filtering variation selectors.
	 *
	 * Collects, for every variation selector found in the data files, the
	 * base characters it is listed with, and turns them into a list of
	 * "\x{XXXX}" or "\x{XXXX}-\x{YYYY}" strings. Selectors that end up with
	 * identical lists are merged under a single key made of both selectors.
	 *
	 * @return bool False if a file could not be fetched, true otherwise.
	 */
	private function build_regex_variation_selectors()
	{
		// Formats a run of code points for use inside a character class.
		$format_range = function ($start, $end)
		{
			$formatted = '\\x{' . strtoupper(sprintf('%04s', dechex($start))) . '}';

			if ($start != $end)
			{
				$formatted .= '-\\x{' . strtoupper(sprintf('%04s', dechex($end))) . '}';
			}

			return $formatted;
		};

		// Step 1: For each selector, list the base characters it is used with.
		foreach (array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt') as $filename)
		{
			$local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD);

			if (empty($local_file))
				return false;

			foreach (file($local_file) as $raw_line)
			{
				$data_part = substr($raw_line, 0, strcspn($raw_line, '#'));

				if (strpos($data_part, ';') === false)
				{
					continue;
				}

				$fields = array_map('trim', explode(';', $data_part));

				list($base_char, $variation_selector) = explode(' ', $fields[0]);

				$this->funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
			}
		}

		// Step 2: Replace each list of base characters with a class string.
		// The loop walks the lists as they were when it started, while the
		// search for identical strings looks at the data as updated so far.
		foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
		{
			$class_string = '';

			$start = null;
			$end = null;

			foreach ($ords as $ord)
			{
				if (!isset($start))
				{
					$start = $ord;
				}

				// Still in the same run?
				if (!isset($end) || $ord == $end + 1)
				{
					$end = $ord;

					continue;
				}

				// The run ended, so write it out and begin a new one here.
				$class_string .= $format_range($start, $end);

				$start = $ord;
				$end = $ord;
			}

			if (isset($start))
			{
				$class_string .= $format_range($start, $end);
			}

			// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
			$identical = array_search($class_string, $this->funcs['utf8_regex_variation_selectors']['data']);

			if ($identical !== false)
			{
				unset(
					$this->funcs['utf8_regex_variation_selectors']['data'][$identical],
					$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
				);

				$compound_selector = array($identical, $variation_selector);
				sort($compound_selector);

				$variation_selector = implode('', $compound_selector);
			}

			$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
		}

		// Step 3: Split each class string back into its individual ranges.
		foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string)
		{
			$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
		}

		krsort($this->funcs['utf8_regex_variation_selectors']['data']);

		return true;
	}
1536
1537
	/**
	 * Helper function for build_regex_joining_type and build_regex_indic.
	 *
	 * Populates:
	 *  - $this->script_aliases, from the 'sc' lines of PropertyValueAliases.txt
	 *    (second field => third field).
	 *  - The 'scripts' element of $this->char_data, from Scripts.txt and
	 *    ScriptExtensions.txt. The Common and Inherited scripts are ignored.
	 *  - $this->script_stats, from DerivedAge.txt. For each script, 'age' is
	 *    the lowest age of any of its characters and 'count' is the number of
	 *    script entries counted for it.
	 *
	 * @return bool False if a file could not be fetched, true otherwise.
	 */
	private function build_script_stats()
	{
		// Strips the comment from a line and returns its trimmed fields, or
		// null if the line holds no data.
		$parse_line = function ($line)
		{
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
			{
				return null;
			}

			return array_map('trim', explode(';', $line));
		};

		// Expands a code point field ("XXXX" or "XXXX..YYYY") into the list of
		// entities that are used as the keys of $this->char_data.
		$entities_for = function ($code_points)
		{
			if (strpos($code_points, '..') === false)
			{
				return array('&#x' . $code_points . ';');
			}

			list($start, $end) = explode('..', $code_points);

			$entities = array();

			for ($ord = hexdec($start), $ord_e = hexdec($end); $ord <= $ord_e; $ord++)
			{
				$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
			}

			return $entities;
		};

		// Step 1: Script aliases.
		$local_file = $this->fetch_unicode_file('PropertyValueAliases.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null || $fields[0] !== 'sc')
			{
				continue;
			}

			$this->script_aliases[$fields[1]] = $fields[2];
		}

		// Step 2: The script of each character.
		$local_file = $this->fetch_unicode_file('Scripts.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null || in_array($fields[1], array('Common', 'Inherited')))
			{
				continue;
			}

			foreach ($entities_for($fields[0]) as $entity)
			{
				$this->char_data[$entity]['scripts'][] = $fields[1];
			}
		}

		// Step 3: The other scripts that each character is used with.
		$local_file = $this->fetch_unicode_file('ScriptExtensions.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null)
			{
				continue;
			}

			$char_scripts = array();

			foreach (explode(' ', $fields[1]) as $alias)
			{
				// Skip aliases that we know nothing about.
				if (!isset($this->script_aliases[$alias]))
				{
					continue;
				}

				if (!in_array($this->script_aliases[$alias], array('Common', 'Inherited')))
				{
					$char_scripts[] = $this->script_aliases[$alias];
				}
			}

			// Every character on this line gets every script on this line.
			// The code point must only advance once per character, not once
			// per script, so the two loops are kept strictly separate.
			foreach ($entities_for($fields[0]) as $entity)
			{
				foreach ($char_scripts as $char_script)
				{
					$this->char_data[$entity]['scripts'][] = $char_script;
				}
			}
		}

		// Step 4: Age and size of each script.
		$local_file = $this->fetch_unicode_file('DerivedAge.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$fields = $parse_line($line);

			if ($fields === null)
			{
				continue;
			}

			$age = (float) $fields[1];

			foreach ($entities_for($fields[0]) as $entity)
			{
				if (empty($this->char_data[$entity]['scripts']))
				{
					continue;
				}

				foreach ($this->char_data[$entity]['scripts'] as $char_script)
				{
					if (!isset($this->script_stats[$char_script]))
					{
						$this->script_stats[$char_script] = array('age' => $age, 'count' => 1);
					}
					else
					{
						$this->script_stats[$char_script]['age'] = min($age, $this->script_stats[$char_script]['age']);
						$this->script_stats[$char_script]['count']++;
					}
				}
			}
		}

		return true;
	}
1753
1754
	/**
	 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
	 * Specifically, for cursive scripts like Arabic.
	 *
	 * The 'utf8_regex_joining_type' data is indexed by script and then by
	 * joining type, and holds "\x{XXXX}" or "\x{XXXX}-\x{YYYY}" strings.
	 * Scripts are looked up in $this->char_data using the first code point
	 * of each line, and ordered using $this->script_stats. Scripts whose only
	 * joining type is Transparent are dropped.
	 *
	 * @return bool False if the file could not be fetched, true otherwise.
	 */
	private function build_regex_joining_type()
	{
		// The joining types that we care about, by their code in the file.
		$joining_type_names = array(
			'C' => 'Join_Causing',
			'D' => 'Dual_Joining',
			'R' => 'Right_Joining',
			'L' => 'Left_Joining',
			'T' => 'Transparent',
		);

		$local_file = $this->fetch_unicode_file('extracted/DerivedJoiningType.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
			{
				continue;
			}

			$fields = array_map('trim', explode(';', $line));

			if (!isset($joining_type_names[$fields[1]]))
			{
				continue;
			}

			$joining_type = $joining_type_names[$fields[1]];

			// For a range, the scripts of its first character are used.
			$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

			if (empty($this->char_data[$entity]['scripts']))
			{
				continue;
			}

			foreach ($this->char_data[$entity]['scripts'] as $char_script)
			{
				// Needed for sorting below. Removed again afterward.
				if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
				{
					$this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $this->script_stats[$char_script];
				}

				$this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
			}
		}

		// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
		uasort($this->funcs['utf8_regex_joining_type']['data'], function ($a, $b)
		{
			if ($a['stats']['age'] == $b['stats']['age'])
			{
				return $b['stats']['count'] - $a['stats']['count'];
			}

			// Ages are floats. A float return value would be truncated to an
			// integer, making ages like 5.1 and 5.2 look equal, so we must
			// not simply return the difference.
			return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
		});

		foreach ($this->funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
		{
			unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);

			// If the only joining type in this script is transparent, we don't care about it.
			if (array_keys($joining_types) === array('Transparent'))
			{
				unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]);
				continue;
			}

			// Sort the stored lists themselves. $joining_types is only a copy.
			foreach (array_keys($joining_types) as $joining_type)
			{
				sort($this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]);
			}
		}

		return true;
	}
1866
1867
	/**
	 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
	 * Specifically, for Indic scripts like Devanagari.
	 *
	 * Reads IndicSyllabicCategory.txt and, for each script that has at least
	 * one Virama, stores the following lists of regex character class
	 * fragments in $this->funcs['utf8_regex_indic']['data'][script]:
	 *
	 *  - 'Virama' and 'Vowel_Dependent' (taken from the data file), and
	 *  - 'All', 'Letter', 'Nonspacing_Mark' and 'Nonspacing_Combining_Mark'
	 *    (derived from $this->char_data).
	 *
	 * @return bool False if the data file could not be fetched or read,
	 *    true otherwise.
	 */
	private function build_regex_indic()
	{
		$local_file = $this->fetch_unicode_file('IndicSyllabicCategory.txt', self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		$lines = file($local_file);

		// file() returns false if the file could not be read.
		if ($lines === false)
			return false;

		foreach ($lines as $line)
		{
			// Discard trailing comments.
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
			{
				continue;
			}

			$fields = explode(';', $line);

			foreach ($fields as $key => $value)
			{
				$fields[$key] = trim($value);
			}

			$insc = $fields[1];

			if (!in_array($insc, array('Virama', 'Vowel_Dependent'), true))
			{
				continue;
			}

			// For a range, the scripts of its first code point are used for the whole range.
			$first_entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

			// empty() also covers code points that have no entry in char_data at all,
			// so no undefined key warnings are raised for them.
			if (empty($this->char_data[$first_entity]['scripts']))
			{
				continue;
			}

			$class_fragment = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';

			foreach ($this->char_data[$first_entity]['scripts'] as $char_script)
			{
				// The stats are only needed temporarily, for sorting the scripts below.
				if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats']))
				{
					$this->funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $this->script_stats[$char_script];
				}

				$this->funcs['utf8_regex_indic']['data'][$char_script][$insc][] = $class_fragment;
			}
		}

		// Again, sort commonly used scripts before rare scripts.
		uasort($this->funcs['utf8_regex_indic']['data'], function ($a, $b)
		{
			if ($a['stats']['age'] == $b['stats']['age'])
			{
				return $b['stats']['count'] - $a['stats']['count'];
			}

			return $a['stats']['age'] - $b['stats']['age'];
		});

		// We only want scripts with viramas. Also drop the temporary stats.
		foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
		{
			if (isset($inscs['Virama']))
			{
				unset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats']);
			}
			else
			{
				unset($this->funcs['utf8_regex_indic']['data'][$char_script]);
			}
		}

		// Now add some more classes that we need for each script.
		foreach ($this->char_data as $entity => $info)
		{
			if (empty($info['scripts']))
			{
				continue;
			}

			$ord = hexdec(trim($entity, '&#x;'));

			foreach ($info['scripts'] as $char_script)
			{
				if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]))
				{
					continue;
				}

				$this->funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;

				if (empty($info['General_Category']))
				{
					continue;
				}

				if ($info['General_Category'] == 'Mn')
				{
					$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;

					if (!empty($this->funcs['utf8_combining_classes']['data'][$entity]))
					{
						$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
					}
				}
				elseif (substr($info['General_Category'], 0, 1) == 'L')
				{
					$this->funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
				}
			}
		}

		// These classes currently hold raw code points rather than regex fragments.
		$ord_classes = array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark');

		// Formats a run of consecutive code points as a regex class fragment,
		// e.g. "\x{0900}-\x{0903}", or just "\x{0950}" for a single code point.
		$format_range = function ($start, $end)
		{
			$fragment = '\\x{' . sprintf('%04X', $start) . '}';

			if ($start != $end)
			{
				$fragment .= '-\\x{' . sprintf('%04X', $end) . '}';
			}

			return $fragment;
		};

		foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
		{
			foreach ($inscs as $insc => $ords)
			{
				// 'Virama' and 'Vowel_Dependent' already contain regex fragments.
				if (!in_array($insc, $ord_classes, true))
				{
					continue;
				}

				sort($ords);

				// Collapse the sorted code points into contiguous ranges.
				$ranges = array();
				$start = null;
				$end = null;

				foreach ($ords as $ord)
				{
					if ($start === null)
					{
						$start = $end = $ord;
					}
					elseif ($ord == $end + 1)
					{
						$end = $ord;
					}
					else
					{
						$ranges[] = $format_range($start, $end);
						$start = $end = $ord;
					}
				}

				// Don't forget the final range.
				if ($start !== null)
				{
					$ranges[] = $format_range($start, $end);
				}

				$this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = $ranges;
			}

			ksort($this->funcs['utf8_regex_indic']['data'][$char_script]);
		}

		return true;
	}
2041
2042
	/**
	 * Builds maps and regex classes for IDNA purposes.
	 *
	 * Reads IdnaMappingTable.txt and fills in:
	 *
	 *  - $this->funcs['idna_maps']['data']: entity => replacement entities,
	 *    for rows with status 'mapped'.
	 *  - $this->funcs['idna_maps_deviation']['data']: the same, for rows with
	 *    status 'deviation'.
	 *  - $this->funcs['idna_regex']['data'][status]: regex character class
	 *    fragments for the 'deviation', 'ignored' and 'disallowed' statuses.
	 *
	 * Rows with any other status are skipped.
	 *
	 * @return bool False if the data file could not be fetched or read,
	 *    true otherwise.
	 */
	private function build_idna()
	{
		$local_file = $this->fetch_unicode_file('IdnaMappingTable.txt', self::DATA_URL_IDNA);

		if (empty($local_file))
			return false;

		$lines = file($local_file);

		// file() returns false if the file could not be read.
		if ($lines === false)
			return false;

		foreach ($lines as $line)
		{
			// Discard trailing comments.
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
			{
				continue;
			}

			$fields = explode(';', $line);

			foreach ($fields as $key => $value)
			{
				// Trim whitespace and strip leading zeros from each number
				// (e.g. "0041" becomes "41", but a lone "0" is kept).
				$fields[$key] = preg_replace('/\b(0(?!\b))+/', '', trim($value));
			}

			$status = $fields[1];
			$is_range = strpos($fields[0], '..') !== false;

			$class_fragment = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';

			if ($status === 'mapped' || $status === 'deviation')
			{
				// Only these rows need one entity per code point. Expanding every
				// row would needlessly build very large arrays for the big
				// ranges that have other statuses.
				// NOTE(review): single code points use the zero-stripped form
				// (e.g. "&#x41;") whereas range members are padded to four digits
				// (e.g. "&#x0041;"). Confirm that consumers normalize both forms.
				if (!$is_range)
				{
					$entities = array('&#x' . $fields[0] . ';');
				}
				else
				{
					$entities = array();

					list($start, $end) = explode('..', $fields[0]);

					$ord_e = hexdec($end);

					for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
					{
						$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
					}
				}

				// An empty (or missing) mapping field means "map to nothing".
				if (!isset($fields[2]) || $fields[2] === '')
				{
					$mapping = '';
				}
				else
				{
					$mapping = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
				}

				$map_name = $status === 'mapped' ? 'idna_maps' : 'idna_maps_deviation';

				foreach ($entities as $entity)
					$this->funcs[$map_name]['data'][$entity] = $mapping;

				if ($status === 'deviation')
				{
					$this->funcs['idna_regex']['data']['deviation'][] = $class_fragment;
				}
			}
			elseif ($status === 'ignored')
			{
				$this->funcs['idna_regex']['data']['ignored'][] = $class_fragment;
			}
			elseif ($status === 'disallowed')
			{
				// Skip any row that covers U+D800 (i.e. the surrogates).
				// Comparing the bounds gives the same answer as searching the
				// expanded list of entities, without having to build that list.
				$bounds = explode('..', $fields[0]);

				$ord_s = hexdec($bounds[0]);
				$ord_e = hexdec(isset($bounds[1]) ? $bounds[1] : $bounds[0]);

				if ($ord_s <= 0xD800 && 0xD800 <= $ord_e)
					continue;

				$this->funcs['idna_regex']['data']['disallowed'][] = $class_fragment;
			}
		}

		return true;
	}
2115
}
2116
2117
?>