Passed
Pull Request — release-2.1 (#7134)
by Jon
06:05
created

export_func_to_file()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 19
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 9
c 2
b 0
f 0
nc 1
nop 2
dl 0
loc 19
rs 9.9666
1
<?php
2
3
/**
4
 * This is an internal development file. It should NOT be included in
5
 * any SMF distribution packages.
6
 *
7
 * This file exists to make it easy for developers to update the
8
 * Unicode data in Subs-Charset.php whenever a new version of the
9
 * Unicode Character Database is released. Just run this file from the
10
 * command line in order to perform the update.
11
 *
12
 * Simple Machines Forum (SMF)
13
 *
14
 * @package SMF
15
 * @author Simple Machines https://www.simplemachines.org
16
 * @copyright 2021 Simple Machines and individual contributors
17
 * @license https://www.simplemachines.org/about/smf/license.php BSD
18
 *
19
 * @version 2.1 RC4
20
 */
21
22
$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd';
23
24
$sourcedir = realpath(dirname(__DIR__) . "/Sources");
25
$unicodedir = $sourcedir . "/Unicode";
26
27
$full_decomposition_maps = array();
28
$funcs = array(
29
	'utf8_normalize_d_maps' => array(
30
		'file' => 'DecompositionCanonical.php',
31
		'key_type' => 'hexchar',
32
		'val_type' => 'hexchar',
33
		'data' => array(),
34
	),
35
	'utf8_normalize_kd_maps' => array(
36
		'file' => 'DecompositionCompatibility.php',
37
		'key_type' => 'hexchar',
38
		'val_type' => 'hexchar',
39
		'data' => array(),
40
	),
41
	'utf8_compose_maps' => array(
42
		'file' => 'Composition.php',
43
		'key_type' => 'hexchar',
44
		'val_type' => 'hexchar',
45
		'data' => array(),
46
	),
47
	'utf8_combining_classes' => array(
48
		'file' => 'CombiningClasses.php',
49
		'key_type' => 'hexchar',
50
		'val_type' => 'int',
51
		'data' => array(),
52
	),
53
	'utf8_strtolower_maps' => array(
54
		'file' => 'CaseLower.php',
55
		'key_type' => 'hexchar',
56
		'val_type' => 'hexchar',
57
		'data' => array(),
58
	),
59
	'utf8_strtoupper_maps' => array(
60
		'file' => 'CaseUpper.php',
61
		'key_type' => 'hexchar',
62
		'val_type' => 'hexchar',
63
		'data' => array(),
64
	),
65
	'utf8_casefold_maps' => array(
66
		'file' => 'CaseFold.php',
67
		'key_type' => 'hexchar',
68
		'val_type' => 'hexchar',
69
		'data' => array(),
70
	),
71
	'utf8_default_ignorables' => array(
72
		'file' => 'DefaultIgnorables.php',
73
		'key_type' => 'int',
74
		'val_type' => 'hexchar',
75
		'data' => array(),
76
	),
77
	'utf8_regex_properties' => array(
78
		'file' => 'RegularExpressions.php',
79
		'key_type' => 'string',
80
		'val_type' => 'string',
81
		'propfiles' => array(
82
			'DerivedCoreProperties.txt',
83
			'PropList.txt',
84
			'emoji/emoji-data.txt',
85
			'extracted/DerivedGeneralCategory.txt'
86
		),
87
		'props' => array(
88
			'Bidi_Control',
89
			'Cn',
90
			'Default_Ignorable_Code_Point',
91
			'Emoji',
92
			'Emoji_Modifier',
93
			'Ideographic',
94
			'Join_Control',
95
			'Regional_Indicator',
96
			'Variation_Selector',
97
		),
98
		'data' => array(),
99
	),
100
	'utf8_regex_variation_selectors' => array(
101
		'file' => 'RegularExpressions.php',
102
		'key_type' => 'string',
103
		'val_type' => 'string',
104
		'data' => array(),
105
	),
106
	'utf8_regex_joining_type' => array(
107
		'file' => 'RegularExpressions.php',
108
		'key_type' => 'string',
109
		'val_type' => 'string',
110
		'data' => array(),
111
	),
112
	'utf8_regex_indic' => array(
113
		'file' => 'RegularExpressions.php',
114
		'key_type' => 'string',
115
		'val_type' => 'string',
116
		'data' => array(),
117
	),
118
);
119
120
foreach ($funcs as $func_name => $func_info)
121
{
122
	if (!is_file($unicodedir . '/' . $func_info['file']) || !is_writable($unicodedir . '/' . $func_info['file']))
123
		die($unicodedir . '/' . $func_info['file'] . ' not found or not writable.');
124
}
125
126
@ini_set('memory_limit', '256M');
127
128
/*********************************************
129
 * Part 1: Normalization, case folding, etc. *
130
 *********************************************/
131
132
// We need some of these for further analysis below.
133
$derived_normalization_props = array();
134
foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line)
135
{
136
	$line = substr($line, 0, strcspn($line, '#'));
137
138
	if (strpos($line, ';') === false)
139
		continue;
140
141
	$fields = explode(';', $line);
142
143
	foreach ($fields as $key => $value)
144
		$fields[$key] = trim($value);
145
146
	if (!isset($derived_normalization_props[$fields[1]]))
147
		$derived_normalization_props[$fields[1]] = array();
148
149
	if (strpos($fields[0], '..') === false)
150
		$entities = array('&#x' . $fields[0] . ';');
151
	else
152
	{
153
		$entities = array();
154
155
		list($start, $end) = explode('..', $fields[0]);
156
157
		$ord_s = hexdec($start);
158
		$ord_e = hexdec($end);
159
160
		$ord = $ord_s;
161
		while ($ord <= $ord_e)
162
			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
163
	}
164
165
	$value = '';
166
	if (!isset($fields[2]))
167
		$value = 'SAME';
168
	elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF')))
169
		$value = trim($fields[2]) !== '' ? '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';' : '';
170
	else
171
		$value = $fields[2];
172
173
	foreach ($entities as $entity)
174
		$derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value;
175
}
176
177
// Go through all the characters in the Unicode database.
178
$char_data = array();
179
foreach (file($unicode_data_url . '/UnicodeData.txt') as $line)
180
{
181
	$fields = explode(';', $line);
182
183
	if (!empty($fields[3]))
184
		$funcs['utf8_combining_classes']['data']['&#x' . $fields[0] . ';'] = trim($fields[3]);
185
186
	// Uppercase maps.
187
	if ($fields[12] !== '')
188
		$funcs['utf8_strtoupper_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[12] . ';';
189
190
	// Lowercase maps.
191
	if ($fields[13] !== '')
192
		$funcs['utf8_strtolower_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[13] . ';';
193
194
	// Remember this character's general category for later.
195
	$char_data['&#x' . $fields[0] . ';']['General_Category'] = trim($fields[2]);
196
197
	if ($fields[5] === '')
198
		continue;
199
200
	// All canonical decompositions AND all compatibility decompositions.
201
	$full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';
202
203
	// Just the canonical decompositions.
204
	if (strpos($fields[5], '<') === false)
205
		$funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[5])) . ';';
206
207
}
208
209
foreach (file($unicode_data_url . '/CaseFolding.txt') as $line)
210
{
211
	$line = substr($line, 0, strcspn($line, '#'));
212
213
	if (strpos($line, ';') === false)
214
		continue;
215
216
	$fields = explode(';', $line);
217
218
	foreach ($fields as $key => $value)
219
		$fields[$key] = trim($value);
220
221
	// Full casefolding.
222
	if (in_array($fields[1], array('C', 'F')))
223
		$funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
224
225
	// Simple casefolding. Currently unused.
226
	// if (in_array($fields[1], array('C', 'S')))
227
	// 	$funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
228
}
229
230
// Recursively iterate until we reach the final decomposition forms.
231
// This is necessary because some characters decompose to other characters that
232
// themselves decompose further.
233
$changed = true;
234
while ($changed)
235
{
236
	$temp = array();
237
	foreach ($full_decomposition_maps as $composed => $decomposed)
238
	{
239
		$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
240
241
		foreach ($parts as $partnum => $hex)
242
		{
243
			if (isset($full_decomposition_maps[$hex]))
244
				$parts[$partnum] = $full_decomposition_maps[$hex];
245
		}
246
247
		$decomposed = implode(' ', $parts);
248
		unset($parts);
249
250
		$temp[$composed] = $decomposed;
251
	}
252
253
	$changed = $full_decomposition_maps !== $temp;
254
255
	$full_decomposition_maps = $temp;
256
}
257
258
// Same as above, but using only canonical decompositions.
259
$changed = true;
260
$iteration = 0;
261
while ($changed)
262
{
263
	$temp = array();
264
	foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
265
	{
266
		if ($iteration === 0 && !in_array($composed, $derived_normalization_props['Full_Composition_Exclusion']))
267
			$funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
268
269
		$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
270
271
		foreach ($parts as $partnum => $hex)
272
		{
273
			if (isset($funcs['utf8_normalize_d_maps']['data'][$hex]))
274
				$parts[$partnum] = $funcs['utf8_normalize_d_maps']['data'][$hex];
275
		}
276
277
		$decomposed = implode(' ', $parts);
278
		unset($parts);
279
280
		$temp[$composed] = $decomposed;
281
	}
282
283
	$changed = $funcs['utf8_normalize_d_maps']['data'] !== $temp;
284
285
	$funcs['utf8_normalize_d_maps']['data'] = $temp;
286
	$iteration++;
287
}
288
289
$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']);
290
unset($full_decomposition_maps, $derived_normalization_props);
291
292
// Now update the files with the data we've got so far.
293
foreach ($funcs as $func_name => $func_info)
294
{
295
	if (empty($func_info['data']))
296
		continue;
297
298
	export_func_to_file($func_name, $func_info);
299
300
	// Free up some memory.
301
	if ($func_name != 'utf8_combining_classes')
302
		unset($funcs[$func_name]);
303
}
304
305
/***********************************
306
 * Part 2: Regular expression data *
307
 ***********************************/
308
309
// Build regular expression classes for extended Unicode properties.
310
foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename)
311
{
312
	foreach (file($unicode_data_url . '/' . $filename) as $line)
313
	{
314
		$line = substr($line, 0, strcspn($line, '#'));
315
316
		if (strpos($line, ';') === false)
317
			continue;
318
319
		$fields = explode(';', $line);
320
321
		foreach ($fields as $key => $value)
322
			$fields[$key] = trim($value);
323
324
		if (in_array($fields[1], $funcs['utf8_regex_properties']['props']))
325
		{
326
			if (!isset($funcs['utf8_regex_properties']['data'][$fields[1]]))
327
				$funcs['utf8_regex_properties']['data'][$fields[1]] = '';
328
329
			$funcs['utf8_regex_properties']['data'][$fields[1]] .= '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
330
		}
331
332
		// We also track 'Default_Ignorable_Code_Point' property in a separate array.
333
		if ($fields[1] !== 'Default_Ignorable_Code_Point')
334
			continue;
335
336
		if (strpos($fields[0], '..') === false)
337
			$funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';';
338
		else
339
		{
340
			list($start, $end) = explode('..', $fields[0]);
341
342
			$ord_s = hexdec($start);
343
			$ord_e = hexdec($end);
344
345
			$ord = $ord_s;
346
			while ($ord <= $ord_e)
347
				$funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
348
		}
349
	}
350
}
351
ksort($funcs['utf8_regex_properties']['data']);
352
353
// Build regular expression classes for filtering variation selectors.
354
$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
355
foreach ($files as $filename)
356
{
357
	foreach (file($unicode_data_url . '/' . $filename) as $line)
358
	{
359
		$line = substr($line, 0, strcspn($line, '#'));
360
361
		if (strpos($line, ';') === false)
362
			continue;
363
364
		$fields = explode(';', $line);
365
366
		foreach ($fields as $key => $value)
367
			$fields[$key] = trim($value);
368
369
		list($base_char, $variation_selector) = explode(' ', $fields[0]);
370
371
		$funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
372
	}
373
374
}
375
foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
376
{
377
	$class_string = '';
378
379
	$current_range = array('start' => null, 'end' => null);
380
	foreach($ords as $ord)
381
	{
382
		if (!isset($current_range['start']))
383
			$current_range['start'] = $ord;
384
385
		if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
386
		{
387
			$current_range['end'] = $ord;
388
			continue;
389
		}
390
		else
391
		{
392
			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
0 ignored issues
show
Bug introduced by
$current_range['start'] of type null is incompatible with the type integer expected by parameter $num of dechex(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

392
			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex(/** @scrutinizer ignore-type */ $current_range['start']))) . '}';
Loading history...
393
394
			if ($current_range['start'] != $current_range['end'])
395
				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
0 ignored issues
show
Bug introduced by
$current_range['end'] of type void is incompatible with the type integer expected by parameter $num of dechex(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

395
				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex(/** @scrutinizer ignore-type */ $current_range['end']))) . '}';
Loading history...
396
397
			$current_range = array('start' => $ord, 'end' => $ord);
398
		}
399
	}
400
401
	if (isset($current_range['start']))
402
	{
403
		$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
404
405
		if ($current_range['start'] != $current_range['end'])
406
			$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
407
	}
408
409
	// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
410
	if (($identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data'])) !== false)
411
	{
412
		unset(
413
			$funcs['utf8_regex_variation_selectors']['data'][$identical],
414
			$funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
415
		);
416
417
		$compound_selector = array($identical, $variation_selector);
418
		sort($compound_selector);
419
420
		$variation_selector = implode('', $compound_selector);
421
	}
422
423
	$funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
424
}
425
krsort($funcs['utf8_regex_variation_selectors']['data']);
426
427
// The regex classes for join control tests require info about language scripts.
428
$script_stats = array();
429
$script_aliases = array();
430
foreach (file($unicode_data_url . '/PropertyValueAliases.txt') as $line)
431
{
432
	$line = substr($line, 0, strcspn($line, '#'));
433
434
	if (strpos($line, ';') === false)
435
		continue;
436
437
	$fields = explode(';', $line);
438
439
	foreach ($fields as $key => $value)
440
		$fields[$key] = trim($value);
441
442
	if ($fields[0] !== 'sc')
443
		continue;
444
445
	$script_aliases[$fields[1]] = $fields[2];
446
}
447
foreach (file($unicode_data_url . '/Scripts.txt') as $line)
448
{
449
	$line = substr($line, 0, strcspn($line, '#'));
450
451
	if (strpos($line, ';') === false)
452
		continue;
453
454
	$fields = explode(';', $line);
455
456
	foreach ($fields as $key => $value)
457
		$fields[$key] = trim($value);
458
459
	if (in_array($fields[1], array('Common', 'Inherited')))
460
		continue;
461
462
	if (strpos($fields[0], '..') === false)
463
		$char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1];
464
	else
465
	{
466
		list($start, $end) = explode('..', $fields[0]);
467
468
		$ord_s = hexdec($start);
469
		$ord_e = hexdec($end);
470
471
		$ord = $ord_s;
472
		while ($ord <= $ord_e)
473
			$char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $fields[1];
474
	}
475
}
476
// Add the scripts listed in ScriptExtensions.txt to each character's 'scripts' list.
foreach (file($unicode_data_url . '/ScriptExtensions.txt') as $line)
{
	// Drop any trailing comment, then skip lines that hold no data fields.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	// The second field is a space-separated list of script aliases.
	// Translate each one via $script_aliases, ignoring Common and Inherited.
	$char_scripts = array();
	foreach (explode(' ', $fields[1]) as $alias)
	{
		// An alias missing from $script_aliases cannot be translated, so skip it.
		if (!isset($script_aliases[$alias]))
			continue;

		if (!in_array($script_aliases[$alias], array('Common', 'Inherited')))
			$char_scripts[] = $script_aliases[$alias];
	}

	// Nothing to record for this line.
	if (empty($char_scripts))
		continue;

	// Work out which characters this line applies to.
	if (strpos($fields[0], '..') === false)
		$entities = array('&#x' . $fields[0] . ';');
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_e = hexdec($end);

		$entities = array();
		for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
	}

	// Every character in the range gets every script. The code point must only
	// advance once per character, never once per script.
	foreach ($entities as $entity)
	{
		foreach ($char_scripts as $char_script)
			$char_data[$entity]['scripts'][] = $char_script;
	}
}
515
// Collect per-script statistics from DerivedAge.txt: 'age' is the smallest value
// found in the file's second field (cast to float) among the script's characters,
// and 'count' is how many times a character of that script was encountered.
foreach (file($unicode_data_url . '/DerivedAge.txt') as $line)
{
	// Drop any trailing comment, then skip lines that hold no data fields.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	$age = (float) $fields[1];

	// Treat a single code point as a range of one so both cases share one code path.
	if (strpos($fields[0], '..') === false)
		$ord_s = $ord_e = hexdec($fields[0]);
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);
	}

	for ($ord = $ord_s; $ord <= $ord_e; $ord++)
	{
		// Same key format that the rest of this script uses for $char_data.
		$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';

		// Many characters have no 'scripts' entry at all. Test with empty()
		// rather than reading the key directly, which would raise a notice.
		if (empty($char_data[$entity]['scripts']))
			continue;

		foreach ($char_data[$entity]['scripts'] as $char_script)
		{
			if (!isset($script_stats[$char_script]))
				$script_stats[$char_script] = array('age' => $age, 'count' => 1);
			else
			{
				$script_stats[$char_script]['age'] = min($age, $script_stats[$char_script]['age']);
				$script_stats[$char_script]['count']++;
			}
		}
	}
}
581
582
// Build regex classes for join control tests in utf8_sanitize_invisibles:
583
// 1. Cursive scripts like Arabic.
584
foreach (file($unicode_data_url . '/extracted/DerivedJoiningType.txt') as $line)
585
{
586
	$line = substr($line, 0, strcspn($line, '#'));
587
588
	if (strpos($line, ';') === false)
589
		continue;
590
591
	$fields = explode(';', $line);
592
593
	foreach ($fields as $key => $value)
594
		$fields[$key] = trim($value);
595
596
	switch ($fields[1])
597
	{
598
		case 'C':
599
			$joining_type = 'Join_Causing';
600
			break;
601
602
		case 'D':
603
			$joining_type = 'Dual_Joining';
604
			break;
605
606
		case 'R':
607
			$joining_type = 'Right_Joining';
608
			break;
609
610
		case 'L':
611
			$joining_type = 'Left_Joining';
612
			break;
613
614
		case 'T':
615
			$joining_type = 'Transparent';
616
			break;
617
618
		default:
619
			$joining_type = null;
620
			break;
621
	}
622
623
	if (!isset($joining_type))
624
		continue;
625
626
	$char_scripts = $char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';']['scripts'];
627
628
	if (empty($char_scripts))
629
		continue;
630
631
	foreach ($char_scripts as $char_script)
632
	{
633
		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
634
			$funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script];
635
636
		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
637
			$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();
638
639
		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
640
	}
641
}
642
// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
// Scripts with a lower age come first; scripts of equal age are ordered by descending count.
uasort($funcs['utf8_regex_joining_type']['data'], function($a, $b)
{
	// The ages are floats (e.g. 5.1 and 5.2). A comparison callback's return value
	// is cast to int, so returning the raw difference would make ages that are
	// less than 1.0 apart compare as equal. Return an explicit -1 or 1 instead.
	if ($a['stats']['age'] != $b['stats']['age'])
		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;

	return $b['stats']['count'] - $a['stats']['count'];
});
650
foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
651
{
652
	unset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);
653
654
	// If the only joining type in this script is transparent, we don't care about it.
655
	if (array_keys($joining_types) === array('Transparent'))
656
	{
657
		unset($funcs['utf8_regex_joining_type']['data'][$char_script]);
658
		continue;
659
	}
660
661
	foreach ($joining_types as $joining_type => $value)
662
	{
663
		sort($value);
664
		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = implode('', $value);
665
	}
666
}
667
668
// 2. Indic scripts like Devanagari.
669
foreach (file($unicode_data_url . '/IndicSyllabicCategory.txt') as $line)
670
{
671
	$line = substr($line, 0, strcspn($line, '#'));
672
673
	if (strpos($line, ';') === false)
674
		continue;
675
676
	$fields = explode(';', $line);
677
678
	foreach ($fields as $key => $value)
679
		$fields[$key] = trim($value);
680
681
	$insc = $fields[1];
682
683
	if (!in_array($insc, array('Virama', 'Vowel_Dependent')))
684
		continue;
685
686
	$char_scripts = $char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';']['scripts'];
687
688
	if (empty($char_scripts))
689
		continue;
690
691
	foreach ($char_scripts as $char_script)
692
	{
693
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats']))
694
			$funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script];
695
696
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc]))
697
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();
698
699
		$funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
700
	}
701
}
702
// Again, sort commonly used scripts before rare scripts.
// Scripts with a lower age come first; scripts of equal age are ordered by descending count.
uasort($funcs['utf8_regex_indic']['data'], function($a, $b)
{
	// The ages are floats (e.g. 5.1 and 5.2). A comparison callback's return value
	// is cast to int, so returning the raw difference would make ages that are
	// less than 1.0 apart compare as equal. Return an explicit -1 or 1 instead.
	if ($a['stats']['age'] != $b['stats']['age'])
		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;

	return $b['stats']['count'] - $a['stats']['count'];
});
710
// We only want scripts with viramas.
711
foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
712
{
713
	unset($funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']);
714
715
	if (!isset($inscs['Virama']))
716
	{
717
		unset($funcs['utf8_regex_indic']['data'][$char_script]);
718
		continue;
719
	}
720
}
721
// Now add some more classes that we need for each script.
722
foreach ($char_data as $entity => $info)
723
{
724
	if (empty($info['scripts']))
725
		continue;
726
727
	$ord = hexdec(trim($entity, '&#x;'));
728
729
	foreach ($info['scripts'] as $char_script)
730
	{
731
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]))
732
			continue;
733
734
		$funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;
735
736
		if ($info['General_Category'] == 'Mn')
737
		{
738
			$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;
739
740
			if (!empty($funcs['utf8_combining_classes']['data'][$entity]))
741
				$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
742
		}
743
		elseif (substr($info['General_Category'], 0, 1) == 'L')
744
			$funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
745
	}
746
}
747
foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
748
{
749
	foreach ($inscs as $insc => $value)
750
	{
751
		sort($value);
752
753
		if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark')))
754
		{
755
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = implode('', $value);
756
			continue;
757
		}
758
759
		$class_string = '';
760
761
		$current_range = array('start' => null, 'end' => null);
762
		foreach($value as $ord)
763
		{
764
			if (!isset($current_range['start']))
765
				$current_range['start'] = $ord;
766
767
			if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
768
			{
769
				$current_range['end'] = $ord;
770
				continue;
771
			}
772
			else
773
			{
774
				$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
775
776
				if ($current_range['start'] != $current_range['end'])
777
					$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
778
779
				$current_range = array('start' => $ord, 'end' => $ord);
780
			}
781
		}
782
783
		if (isset($current_range['start']))
784
		{
785
			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
786
787
			if ($current_range['start'] != $current_range['end'])
788
				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
789
		}
790
791
		$funcs['utf8_regex_indic']['data'][$char_script][$insc] = $class_string;
792
	}
793
794
	ksort($funcs['utf8_regex_indic']['data'][$char_script]);
795
}
796
unset($funcs['utf8_combining_classes']);
797
798
foreach ($funcs as $func_name => $func_info)
799
	export_func_to_file($func_name, $func_info);
800
801
802
/**
 * Updates a Unicode data function in its designated file.
 *
 * Reads $func_info['file'] from $unicodedir, replaces the existing definition
 * of $func_name with a freshly generated one that returns $func_info['data']
 * as an array literal, and writes the result back to the same file.
 *
 * If the file cannot be read, the regular expression fails, or no definition
 * of the function is found, a warning is raised and the file is left untouched.
 *
 * @param string $func_name The name of the function.
 * @param array $func_info Info about the function, including its data.
 */
function export_func_to_file($func_name, $func_info)
{
	global $unicodedir;

	$file_path = $unicodedir . '/' . $func_info['file'];

	$file_contents = file_get_contents($file_path);

	if ($file_contents === false)
	{
		trigger_error('Could not read ' . $file_path, E_USER_WARNING);
		return;
	}

	// The opening of the function, exactly as it must appear in the file.
	$func_text = 'function ' . $func_name . '()' . "\n" . '{';

	// Match from that opening up to the first closing brace at the start of a line.
	$func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s';

	$func_text .= "\n\t" . 'return array(' . "\n";

	build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']);

	$func_text .= "\t" . ');' . "\n" . '}';

	// NOTE(review): $func_text is passed as a preg_replace() replacement string,
	// so it is subject to that function's backslash/dollar reference rules.
	$new_contents = preg_replace($func_regex, $func_text, $file_contents, -1, $count);

	// preg_replace() returns null on error. Writing that out would empty the file.
	if ($new_contents === null)
	{
		trigger_error('Regular expression error while updating ' . $func_name . '() in ' . $file_path, E_USER_WARNING);
		return;
	}

	if ($count === 0)
	{
		trigger_error('Could not find ' . $func_name . '() in ' . $file_path, E_USER_WARNING);
		return;
	}

	file_put_contents($file_path, $new_contents);
}
828
829
830
/**
 * Helper for export_func_to_file(). Builds the function's data array.
 *
 * Appends one line of PHP array-literal source to $func_text for each element
 * of $data. Nested arrays are written recursively with one more tab of
 * indentation, using the same key and value types.
 *
 * Formatting rules for keys and values:
 *  - 'hexchar': the item is a string of numeric entities (spaces between them
 *    are ignored). It is decoded to UTF-8 and written as a double-quoted string
 *    of \x byte escapes.
 *  - 'string': the item is written using var_export().
 *  - anything else: keys are not written at all; values are written verbatim.
 *
 * @param string &$func_text The raw string that contains function code.
 * @param array $data Data to format as an array.
 * @param string $key_type How to format the array keys.
 * @param string $val_type How to format the array values.
 * @param int $indent How many tabs to put in front of each element. Used by
 *    the recursion; outside callers can leave it at its default.
 */
function build_func_array(&$func_text, $data, $key_type, $val_type, $indent = 2)
{
	$tabs = str_repeat("\t", $indent);

	foreach ($data as $key => $value)
	{
		$func_text .= $tabs;

		if ($key_type === 'hexchar')
			$func_text .= build_func_hexchar($key) . ' => ';
		elseif ($key_type === 'string')
			$func_text .= var_export($key, true) . ' => ';

		if (is_array($value))
		{
			$func_text .= 'array(' . "\n";

			// Passing the depth explicitly keeps the recursion free of shared state.
			build_func_array($func_text, $value, $key_type, $val_type, $indent + 1);

			$func_text .= $tabs . ')';
		}
		elseif ($val_type === 'hexchar')
			$func_text .= build_func_hexchar($value);
		elseif ($val_type === 'string')
			$func_text .= var_export($value, true);
		else
			$func_text .= $value;

		$func_text .= ',' . "\n";
	}
}

/**
 * Helper for build_func_array(). Converts a string of numeric entities into
 * the source code of a double-quoted PHP string made up of \x byte escapes.
 *
 * @param string $entities One or more numeric entities, e.g. '&#x41; &#x300;'.
 * @return string The quoted string literal, e.g. "\x41\xCC\x80" with its quotes.
 */
function build_func_hexchar($entities)
{
	$bytes = mb_decode_numericentity(str_replace(' ', '', $entities), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

	$literal = '"';

	foreach (unpack('C*', $bytes) as $byte_value)
		$literal .= '\\x' . strtoupper(dechex($byte_value));

	return $literal . '"';
}
888
889
?>