build_func_array()   B
last analyzed

Complexity

Conditions 9
Paths 13

Size

Total Lines 58
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 9
eloc 28
c 2
b 0
f 0
nc 13
nop 4
dl 0
loc 58
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

<?php

/**
 * This is an internal development file. It should NOT be included in
 * any SMF distribution packages.
 *
 * This file exists to make it easy for developers to update the
 * Unicode data in Subs-Charset.php whenever a new version of the
 * Unicode Character Database is released. Just run this file from the
 * command line in order to perform the update.
 *
 * Simple Machines Forum (SMF)
 *
 * @package SMF
 * @author Simple Machines https://www.simplemachines.org
 * @copyright 2022 Simple Machines and individual contributors
 * @license https://www.simplemachines.org/about/smf/license.php BSD
 *
 * @version 2.1.2
 */
// Base URL from which the Unicode Character Database files are read.
$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd';

// Directory that holds the data files this script rewrites.
$sourcedir = realpath(dirname(__DIR__) . '/Sources');
$unicodedir = $sourcedir . '/Unicode';

$full_decomposition_maps = array();

// One entry per function that this script regenerates:
//  - 'file' is the file inside $unicodedir that contains the function.
//  - 'key_type' and 'val_type' tell build_func_array() how to write keys and values.
//  - 'data' is filled in below and becomes the array returned by the function.
$funcs = array(
	'utf8_normalize_d_maps' => array(
		'file' => 'DecompositionCanonical.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_normalize_kd_maps' => array(
		'file' => 'DecompositionCompatibility.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_compose_maps' => array(
		'file' => 'Composition.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_combining_classes' => array(
		'file' => 'CombiningClasses.php',
		'key_type' => 'hexchar',
		'val_type' => 'int',
		'data' => array(),
	),
	'utf8_strtolower_maps' => array(
		'file' => 'CaseLower.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_strtoupper_maps' => array(
		'file' => 'CaseUpper.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_casefold_maps' => array(
		'file' => 'CaseFold.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_default_ignorables' => array(
		'file' => 'DefaultIgnorables.php',
		'key_type' => 'int',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_regex_properties' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		// UCD files that are scanned for the properties listed in 'props'.
		'propfiles' => array(
			'DerivedCoreProperties.txt',
			'PropList.txt',
			'emoji/emoji-data.txt',
			'extracted/DerivedGeneralCategory.txt',
		),
		// Only these property names are turned into regex classes.
		'props' => array(
			'Bidi_Control',
			'Cn',
			'Default_Ignorable_Code_Point',
			'Emoji',
			'Emoji_Modifier',
			'Ideographic',
			'Join_Control',
			'Regional_Indicator',
			'Variation_Selector',
		),
		'data' => array(),
	),
	'utf8_regex_variation_selectors' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
	'utf8_regex_joining_type' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
	'utf8_regex_indic' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
);
// Refuse to run unless every target file already exists and can be overwritten.
foreach ($funcs as $func_name => $func_info)
{
	$target_file = $unicodedir . '/' . $func_info['file'];

	if (!is_file($target_file) || !is_writable($target_file))
	{
		die($target_file . ' not found or not writable.');
	}
}

// Ask for more memory; the @ hides the warning if the setting cannot be changed.
@ini_set('memory_limit', '256M');
/*********************************************
 * Part 1: Normalization, case folding, etc. *
 *********************************************/

// We need some of these for further analysis below.
// Result shape: $derived_normalization_props[property][entity] = value, where
// value is the entity itself when the line gives no explicit value.
$derived_normalization_props = array();

$lines = file($unicode_data_url . '/DerivedNormalizationProps.txt');

// Without this file the composition exclusions below cannot be worked out.
if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/DerivedNormalizationProps.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	if (!isset($derived_normalization_props[$fields[1]]))
	{
		$derived_normalization_props[$fields[1]] = array();
	}

	// Expand the code point (or range of code points) into hex entities.
	if (strpos($fields[0], '..') === false)
	{
		$entities = array('&#x' . $fields[0] . ';');
	}
	else
	{
		$entities = array();

		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);

		for ($ord = $ord_s; $ord <= $ord_e; $ord++)
		{
			$entities[] = '&#x' . sprintf('%04X', $ord) . ';';
		}
	}

	if (!isset($fields[2]))
	{
		// No explicit value: each entity maps to itself (see the loop below).
		$value = 'SAME';
	}
	elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF')))
	{
		// These values are space-separated code points; store them as entities.
		$value = $fields[2] !== '' ? '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';' : '';
	}
	else
	{
		$value = $fields[2];
	}

	foreach ($entities as $entity)
	{
		$derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value;
	}
}

unset($lines);
// Go through all the characters in the Unicode database.
$char_data = array();

$lines = file($unicode_data_url . '/UnicodeData.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/UnicodeData.txt');
}

foreach ($lines as $line)
{
	$fields = explode(';', $line);

	// Fields 0 through 13 are read below, so skip anything shorter (e.g. a blank line).
	if (count($fields) < 14)
	{
		continue;
	}

	$entity = '&#x' . $fields[0] . ';';

	// Combining classes. A class of 0 is not recorded.
	if (!empty($fields[3]))
	{
		$funcs['utf8_combining_classes']['data'][$entity] = trim($fields[3]);
	}

	// Uppercase maps.
	if ($fields[12] !== '')
	{
		$funcs['utf8_strtoupper_maps']['data'][$entity] = '&#x' . $fields[12] . ';';
	}

	// Lowercase maps.
	if ($fields[13] !== '')
	{
		$funcs['utf8_strtolower_maps']['data'][$entity] = '&#x' . $fields[13] . ';';
	}

	// Remember this character's general category for later.
	$char_data[$entity]['General_Category'] = trim($fields[2]);

	if ($fields[5] === '')
	{
		continue;
	}

	// All canonical decompositions AND all compatibility decompositions.
	// strip_tags() removes the "<tag>" prefix that marks compatibility mappings.
	$full_decomposition_maps[$entity] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';

	// Just the canonical decompositions.
	if (strpos($fields[5], '<') === false)
	{
		$funcs['utf8_normalize_d_maps']['data'][$entity] = '&#x' . str_replace(' ', '; &#x', trim($fields[5])) . ';';
	}
}

unset($lines);
// Case folding maps.
$lines = file($unicode_data_url . '/CaseFolding.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/CaseFolding.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Full casefolding.
	if (in_array($fields[1], array('C', 'F'), true))
	{
		$funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
	}

	// Simple casefolding. Currently unused.
	// if (in_array($fields[1], array('C', 'S')))
	// 	$funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
}

unset($lines);
// Recursively iterate until we reach the final decomposition forms.
// This is necessary because some characters decompose to other characters that
// themselves decompose further.
$changed = true;
while ($changed)
{
	$temp = array();

	foreach ($full_decomposition_maps as $composed => $decomposed)
	{
		// explode() returns a one-element array when there is no space to split on.
		$parts = explode(' ', $decomposed);

		foreach ($parts as $partnum => $hex)
		{
			if (isset($full_decomposition_maps[$hex]))
			{
				$parts[$partnum] = $full_decomposition_maps[$hex];
			}
		}

		$temp[$composed] = implode(' ', $parts);
	}

	// Stop once a full pass leaves every mapping unchanged.
	$changed = $full_decomposition_maps !== $temp;

	$full_decomposition_maps = $temp;
}
// Build the composition maps from the canonical decompositions as they stand
// before any further resolution, leaving out the characters listed under
// Full_Composition_Exclusion. That list is keyed by entity.
foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
{
	if (!isset($derived_normalization_props['Full_Composition_Exclusion'][$composed]))
	{
		$funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
	}
}

// Same as above, but using only canonical decompositions.
$changed = true;
while ($changed)
{
	$temp = array();

	foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
	{
		// explode() returns a one-element array when there is no space to split on.
		$parts = explode(' ', $decomposed);

		foreach ($parts as $partnum => $hex)
		{
			if (isset($funcs['utf8_normalize_d_maps']['data'][$hex]))
			{
				$parts[$partnum] = $funcs['utf8_normalize_d_maps']['data'][$hex];
			}
		}

		$temp[$composed] = implode(' ', $parts);
	}

	// Stop once a full pass leaves every mapping unchanged.
	$changed = $funcs['utf8_normalize_d_maps']['data'] !== $temp;

	$funcs['utf8_normalize_d_maps']['data'] = $temp;
}

// The compatibility map keeps only the entries that are missing from, or differ
// from, the canonical map.
$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']);
unset($full_decomposition_maps, $derived_normalization_props);
// Now update the files with the data we've got so far.
foreach ($funcs as $func_name => $func_info)
{
	// Entries that are still empty get filled in and exported in Part 2.
	if (empty($func_info['data']))
	{
		continue;
	}

	export_func_to_file($func_name, $func_info);

	// Free up some memory. The combining classes are read again in Part 2.
	if ($func_name !== 'utf8_combining_classes')
	{
		unset($funcs[$func_name]);
	}
}
/***********************************
 * Part 2: Regular expression data *
 ***********************************/

// Build regular expression classes for extended Unicode properties.
foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename)
{
	$lines = file($unicode_data_url . '/' . $filename);

	if ($lines === false)
	{
		die('Could not read ' . $unicode_data_url . '/' . $filename);
	}

	foreach ($lines as $line)
	{
		// Drop the trailing comment, then skip lines that carry no data.
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
		{
			continue;
		}

		$fields = array_map('trim', explode(';', $line));

		if (in_array($fields[1], $funcs['utf8_regex_properties']['props']))
		{
			if (!isset($funcs['utf8_regex_properties']['data'][$fields[1]]))
			{
				$funcs['utf8_regex_properties']['data'][$fields[1]] = '';
			}

			// "AAAA..BBBB" becomes "\x{AAAA}-\x{BBBB}"; a single code point becomes "\x{AAAA}".
			$funcs['utf8_regex_properties']['data'][$fields[1]] .= '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
		}

		// We also track 'Default_Ignorable_Code_Point' property in a separate array.
		if ($fields[1] !== 'Default_Ignorable_Code_Point')
		{
			continue;
		}

		if (strpos($fields[0], '..') === false)
		{
			$funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';';
		}
		else
		{
			list($start, $end) = explode('..', $fields[0]);

			$ord_s = hexdec($start);
			$ord_e = hexdec($end);

			for ($ord = $ord_s; $ord <= $ord_e; $ord++)
			{
				$funcs['utf8_default_ignorables']['data'][] = '&#x' . sprintf('%04X', $ord) . ';';
			}
		}
	}

	unset($lines);
}
ksort($funcs['utf8_regex_properties']['data']);
// Build regular expression classes for filtering variation selectors.
// This first pass collects, for each variation selector, the code points of the
// base characters it is listed with.
$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
foreach ($files as $filename)
{
	$lines = file($unicode_data_url . '/' . $filename);

	if ($lines === false)
	{
		die('Could not read ' . $unicode_data_url . '/' . $filename);
	}

	foreach ($lines as $line)
	{
		// Drop the trailing comment, then skip lines that carry no data.
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
		{
			continue;
		}

		$fields = array_map('trim', explode(';', $line));

		// The first field is "<base character> <variation selector>".
		list($base_char, $variation_selector) = explode(' ', $fields[0]);

		$funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
	}

	unset($lines);
}
// Turn each selector's list of base character code points into a regex class string.
foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
{
	$class_string = '';

	$current_range = array('start' => null, 'end' => null);
	foreach ($ords as $ord)
	{
		if (!isset($current_range['start']))
		{
			$current_range['start'] = $ord;
		}

		// Extend the current run for as long as the code points are consecutive.
		if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
		{
			$current_range['end'] = $ord;
			continue;
		}

		// The run is broken: write it out, then start a new run at this code point.
		$class_string .= '\\x{' . sprintf('%04X', $current_range['start']) . '}';

		if ($current_range['start'] != $current_range['end'])
		{
			$class_string .= '-\\x{' . sprintf('%04X', $current_range['end']) . '}';
		}

		$current_range = array('start' => $ord, 'end' => $ord);
	}

	// Write out whatever run was still open when the list ended.
	if (isset($current_range['start']))
	{
		$class_string .= '\\x{' . sprintf('%04X', $current_range['start']) . '}';

		if ($current_range['start'] != $current_range['end'])
		{
			$class_string .= '-\\x{' . sprintf('%04X', $current_range['end']) . '}';
		}
	}

	// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
	// When an already finished selector has the same class string, merge both
	// selectors under one combined key. The strict search can only match finished
	// (string) entries, never the arrays of code points still waiting to be processed.
	if (($identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data'], true)) !== false)
	{
		unset(
			$funcs['utf8_regex_variation_selectors']['data'][$identical],
			$funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
		);

		$compound_selector = array($identical, $variation_selector);
		sort($compound_selector);

		$variation_selector = implode('', $compound_selector);
	}

	$funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
}
krsort($funcs['utf8_regex_variation_selectors']['data']);
// The regex classes for join control tests require info about language scripts.
$script_stats = array();
$script_aliases = array();

$lines = file($unicode_data_url . '/PropertyValueAliases.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/PropertyValueAliases.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Only the 'sc' lines are of interest here.
	if ($fields[0] !== 'sc')
	{
		continue;
	}

	// Map the value in field 1 to the value in field 2. The ScriptExtensions.txt
	// loop further down uses this map to translate script aliases.
	$script_aliases[$fields[1]] = $fields[2];
}

unset($lines);
// Record the script of every character, ignoring 'Common' and 'Inherited'.
$lines = file($unicode_data_url . '/Scripts.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/Scripts.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	if (in_array($fields[1], array('Common', 'Inherited'), true))
	{
		continue;
	}

	if (strpos($fields[0], '..') === false)
	{
		$char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1];
	}
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);

		for ($ord = $ord_s; $ord <= $ord_e; $ord++)
		{
			$char_data['&#x' . sprintf('%04X', $ord) . ';']['scripts'][] = $fields[1];
		}
	}
}

unset($lines);
// Add the additional scripts that each character is used with.
$lines = file($unicode_data_url . '/ScriptExtensions.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/ScriptExtensions.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Scripts are listed by alias here; translate them via $script_aliases.
	$char_scripts = array();
	foreach (explode(' ', $fields[1]) as $alias)
	{
		if (!in_array($script_aliases[$alias], array('Common', 'Inherited')))
		{
			$char_scripts[] = $script_aliases[$alias];
		}
	}

	if (strpos($fields[0], '..') === false)
	{
		foreach ($char_scripts as $char_script)
		{
			$char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script;
		}
	}
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);

		// Advance once per code point, not once per script, so that every code point
		// in the range receives every script. This also guarantees the loop ends
		// when $char_scripts is empty.
		for ($ord = $ord_s; $ord <= $ord_e; $ord++)
		{
			$entity = '&#x' . sprintf('%04X', $ord) . ';';

			foreach ($char_scripts as $char_script)
			{
				$char_data[$entity]['scripts'][] = $char_script;
			}
		}
	}
}

unset($lines);
// For each script, work out the lowest age value among its characters and how
// many script entries were counted. The sorts further down use both numbers.
$lines = file($unicode_data_url . '/DerivedAge.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/DerivedAge.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	$age = (float) $fields[1];

	// Treat a single code point as a range of length one.
	if (strpos($fields[0], '..') === false)
	{
		$ord_s = $ord_e = hexdec($fields[0]);
	}
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);
	}

	for ($ord = $ord_s; $ord <= $ord_e; $ord++)
	{
		// Same entity format that the ranges in the Scripts.txt loop produce.
		$entity = '&#x' . sprintf('%04X', $ord) . ';';

		if (empty($char_data[$entity]['scripts']))
		{
			continue;
		}

		foreach ($char_data[$entity]['scripts'] as $char_script)
		{
			if (!isset($script_stats[$char_script]))
			{
				$script_stats[$char_script] = array('age' => $age, 'count' => 1);
			}
			else
			{
				$script_stats[$char_script]['age'] = min($age, $script_stats[$char_script]['age']);
				$script_stats[$char_script]['count']++;
			}
		}
	}
}

unset($lines);
// Build regex classes for join control tests in utf8_sanitize_invisibles:
// 1. Cursive scripts like Arabic.

// Joining type codes that we care about; all other codes are skipped.
$joining_type_names = array(
	'C' => 'Join_Causing',
	'D' => 'Dual_Joining',
	'R' => 'Right_Joining',
	'L' => 'Left_Joining',
	'T' => 'Transparent',
);

$lines = file($unicode_data_url . '/extracted/DerivedJoiningType.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/extracted/DerivedJoiningType.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	if (!isset($joining_type_names[$fields[1]]))
	{
		continue;
	}

	$joining_type = $joining_type_names[$fields[1]];

	// For a range, the scripts of its first code point stand in for the whole range.
	$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

	if (empty($char_data[$entity]['scripts']))
	{
		continue;
	}

	foreach ($char_data[$entity]['scripts'] as $char_script)
	{
		// Keep the script's statistics alongside its data so that it can be sorted later.
		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
		{
			$funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script];
		}

		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
		{
			$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();
		}

		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
	}
}

unset($lines);
// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
// Order by age, lowest first, and break ties by character count, highest first.
uasort($funcs['utf8_regex_joining_type']['data'], function ($a, $b)
{
	if ($a['stats']['age'] == $b['stats']['age'])
	{
		return $b['stats']['count'] - $a['stats']['count'];
	}

	// Ages are floats. Returning their difference would be cast to an integer, so
	// two ages less than 1.0 apart would compare as equal. Return -1 or 1 instead.
	return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
});
// Drop the sorting statistics and flatten each list of ranges into one string.
foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
{
	unset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);

	// If the only joining type in this script is transparent, we don't care about it.
	if (array_keys($joining_types) === array('Transparent'))
	{
		unset($funcs['utf8_regex_joining_type']['data'][$char_script]);
		continue;
	}

	foreach ($joining_types as $joining_type => $value)
	{
		sort($value);
		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = implode('', $value);
	}
}
// 2. Indic scripts like Devanagari.
$lines = file($unicode_data_url . '/IndicSyllabicCategory.txt');

if ($lines === false)
{
	die('Could not read ' . $unicode_data_url . '/IndicSyllabicCategory.txt');
}

foreach ($lines as $line)
{
	// Drop the trailing comment, then skip lines that carry no data.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	$insc = $fields[1];

	if (!in_array($insc, array('Virama', 'Vowel_Dependent'), true))
	{
		continue;
	}

	// For a range, the scripts of its first code point stand in for the whole range.
	$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

	// Check before reading, so that characters without script data are skipped
	// without touching an undefined array key.
	if (empty($char_data[$entity]['scripts']))
	{
		continue;
	}

	$char_scripts = $char_data[$entity]['scripts'];

	foreach ($char_scripts as $char_script)
	{
		// Keep the script's statistics alongside its data so that it can be sorted later.
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats']))
		{
			$funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script];
		}

		if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc]))
		{
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();
		}

		$funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
	}
}

unset($lines);
// Again, sort commonly used scripts before rare scripts.
// Order by age, lowest first, and break ties by character count, highest first.
uasort($funcs['utf8_regex_indic']['data'], function ($a, $b)
{
	if ($a['stats']['age'] == $b['stats']['age'])
	{
		return $b['stats']['count'] - $a['stats']['count'];
	}

	// Ages are floats. Returning their difference would be cast to an integer, so
	// two ages less than 1.0 apart would compare as equal. Return -1 or 1 instead.
	return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
});
// We only want scripts with viramas.
foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
{
	// The statistics were only needed for sorting.
	unset($funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']);

	if (!isset($inscs['Virama']))
	{
		unset($funcs['utf8_regex_indic']['data'][$char_script]);
	}
}
// Now add some more classes that we need for each script.
foreach ($char_data as $entity => $info)
{
	if (empty($info['scripts']))
	{
		continue;
	}

	// Strip the "&#x" and ";" wrapper to get back to the code point.
	$ord = hexdec(trim($entity, '&#x;'));

	foreach ($info['scripts'] as $char_script)
	{
		// Only the scripts that survived the virama filter are of interest.
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]))
		{
			continue;
		}

		$funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;

		if (empty($info['General_Category']))
		{
			continue;
		}
		elseif ($info['General_Category'] === 'Mn')
		{
			$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;

			// A nonspacing mark that has an entry in the combining classes data.
			if (!empty($funcs['utf8_combining_classes']['data'][$entity]))
			{
				$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
			}
		}
		elseif (substr($info['General_Category'], 0, 1) === 'L')
		{
			$funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
		}
	}
}
// Convert the collected data for each script into regex class strings.
foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
{
	foreach ($inscs as $insc => $value)
	{
		sort($value);

		// These entries already hold ready-made "\x{...}" strings; just join them.
		if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark')))
		{
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = implode('', $value);
			continue;
		}

		// The remaining entries hold code points, which get collapsed into ranges.
		$class_string = '';

		$current_range = array('start' => null, 'end' => null);
		foreach ($value as $ord)
		{
			if (!isset($current_range['start']))
			{
				$current_range['start'] = $ord;
			}

			// Extend the current run for as long as the code points are consecutive.
			if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
			{
				$current_range['end'] = $ord;
				continue;
			}

			// The run is broken: write it out, then start a new run at this code point.
			$class_string .= '\\x{' . sprintf('%04X', $current_range['start']) . '}';

			if ($current_range['start'] != $current_range['end'])
			{
				$class_string .= '-\\x{' . sprintf('%04X', $current_range['end']) . '}';
			}

			$current_range = array('start' => $ord, 'end' => $ord);
		}

		// Write out whatever run was still open when the list ended.
		if (isset($current_range['start']))
		{
			$class_string .= '\\x{' . sprintf('%04X', $current_range['start']) . '}';

			if ($current_range['start'] != $current_range['end'])
			{
				$class_string .= '-\\x{' . sprintf('%04X', $current_range['end']) . '}';
			}
		}

		$funcs['utf8_regex_indic']['data'][$char_script][$insc] = $class_string;
	}

	ksort($funcs['utf8_regex_indic']['data'][$char_script]);
}

// The combining classes were exported in Part 1 and kept only for the checks above.
unset($funcs['utf8_combining_classes']);
// Write everything that is still left in $funcs to its file.
foreach ($funcs as $func_name => $func_info)
{
	export_func_to_file($func_name, $func_info);
}
/**
 * Updates a Unicode data function in its designated file.
 *
 * Finds the existing definition of the function in the file and replaces it
 * with a newly generated one that returns the supplied data. Stops the script
 * if the file cannot be read or written, or if the replacement fails, so that
 * a data file is never overwritten with empty content.
 *
 * @param string $func_name The name of the function.
 * @param array $func_info Info about the function, including its data.
 */
function export_func_to_file($func_name, $func_info)
{
	global $unicodedir;

	$file_path = $unicodedir . '/' . $func_info['file'];

	$file_contents = file_get_contents($file_path);

	if ($file_contents === false)
	{
		die('Could not read ' . $file_path);
	}

	$func_text = 'function ' . $func_name . '()' . "\n" . '{';

	// Matches from the function's opening brace up to the first "}" that starts a line.
	$func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s';

	$func_text .= "\n\t" . 'return array(' . "\n";

	build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']);

	$func_text .= "\t" . ');' . "\n" . '}';

	// NOTE: preg_replace() interprets backslashes in the replacement text, so the
	// doubled backslashes written by var_export() end up as single ones in the file.
	$updated_contents = preg_replace($func_regex, $func_text, $file_contents, -1, $count);

	// preg_replace() returns null on error; writing that out would empty the file.
	if ($updated_contents === null)
	{
		die('Regular expression error while updating ' . $func_name . '() in ' . $file_path);
	}

	// Nothing matched, so there is nothing to write.
	if ($count === 0)
	{
		trigger_error($func_name . '() not found in ' . $file_path, E_USER_WARNING);
		return;
	}

	if (file_put_contents($file_path, $updated_contents) === false)
	{
		die('Could not write ' . $file_path);
	}
}
/**
 * Helper for export_func_to_file(). Builds the function's data array.
 *
 * Appends one line of PHP array source per element of $data to $func_text.
 * Nested arrays are handled recursively with one extra level of indentation.
 *
 * Supported types:
 *  - 'hexchar': a string of hexadecimal entities (e.g. '&#x0041; &#x0300;'),
 *    written as a double-quoted string of "\xNN" byte escapes.
 *  - 'string': written using var_export().
 *  - anything else: as a key, no key is written at all; as a value, the value
 *    is written as is.
 *
 * @param string &$func_text The raw string that contains function code.
 * @param array $data Data to format as an array.
 * @param string $key_type How to format the array keys.
 * @param string $val_type How to format the array values.
 */
function build_func_array(&$func_text, $data, $key_type, $val_type)
{
	// Current indentation depth. Static so that recursive calls share it.
	static $indent = 2;

	// Converts a string of hex entities into "\xNN" escapes, one per UTF-8 byte.
	$to_byte_escapes = function ($entities)
	{
		$chars = mb_decode_numericentity(str_replace(' ', '', $entities), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

		$escaped = '';

		foreach (unpack('C*', $chars) as $byte_value)
		{
			// Always two hex digits, so that every escape has the same width.
			$escaped .= sprintf('\\x%02X', $byte_value);
		}

		return $escaped;
	};

	foreach ($data as $key => $value)
	{
		$func_text .= str_repeat("\t", $indent);

		if ($key_type == 'hexchar')
		{
			$func_text .= '"' . $to_byte_escapes($key) . '" => ';
		}
		elseif ($key_type == 'string')
		{
			$func_text .= var_export($key, true) . ' => ';
		}

		if (is_array($value))
		{
			$func_text .= 'array(' . "\n";

			$indent++;
			build_func_array($func_text, $value, $key_type, $val_type);
			$indent--;

			$func_text .= str_repeat("\t", $indent) . ')';
		}
		elseif ($val_type == 'hexchar')
		{
			$func_text .= '"' . $to_byte_escapes($value) . '"';
		}
		elseif ($val_type == 'string')
		{
			$func_text .= var_export($value, true);
		}
		else
		{
			$func_text .= $value;
		}

		$func_text .= ',' . "\n";
	}
}
?>