Passed
Pull Request — release-2.1 (#7134)
by Jon
04:40
created

build_func_array()   B

Complexity

Conditions 9
Paths 13

Size

Total Lines 45
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 9
eloc 28
nc 13
nop 4
dl 0
loc 45
rs 8.0555
c 1
b 0
f 0
1
<?php
2
3
/**
4
 * This is an internal development file. It should NOT be included in
5
 * any SMF distribution packages.
6
 *
7
 * This file exists to make it easy for developers to update the
8
 * Unicode data in Subs-Charset.php whenever a new version of the
9
 * Unicode Character Database is released. Just run this file from the
10
 * command line in order to perform the update.
11
 *
12
 * Simple Machines Forum (SMF)
13
 *
14
 * @package SMF
15
 * @author Simple Machines https://www.simplemachines.org
16
 * @copyright 2021 Simple Machines and individual contributors
17
 * @license https://www.simplemachines.org/about/smf/license.php BSD
18
 *
19
 * @version 2.1 RC4
20
 */
21
22
// Base URL of the Unicode Character Database files that this script reads.
$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd';

// Directory holding the generated Unicode data files that this script rewrites.
$sourcedir = realpath(dirname(__DIR__) . '/Sources');
$unicodedir = $sourcedir . '/Unicode';

$full_decomposition_maps = array();

// Builds one $funcs entry. Any extra keys are placed between 'val_type' and
// 'data', and 'data' always starts out as an empty array.
$describe_func = function ($file, $key_type, $val_type, array $extra = array()) {
	$entry = array(
		'file' => $file,
		'key_type' => $key_type,
		'val_type' => $val_type,
	);

	return $entry + $extra + array('data' => array());
};

// One entry per generated function: the file it lives in, how its array keys
// and values are to be formatted, and the data collected for it.
$funcs = array(
	'utf8_normalize_d_maps' => $describe_func('DecompositionCanonical.php', 'hexchar', 'hexchar'),
	'utf8_normalize_kd_maps' => $describe_func('DecompositionCompatibility.php', 'hexchar', 'hexchar'),
	'utf8_compose_maps' => $describe_func('Composition.php', 'hexchar', 'hexchar'),
	'utf8_combining_classes' => $describe_func('CombiningClasses.php', 'hexchar', 'int'),
	'utf8_strtolower_maps' => $describe_func('CaseLower.php', 'hexchar', 'hexchar'),
	'utf8_strtoupper_maps' => $describe_func('CaseUpper.php', 'hexchar', 'hexchar'),
	'utf8_casefold_maps' => $describe_func('CaseFold.php', 'hexchar', 'hexchar'),
	'utf8_default_ignorables' => $describe_func('DefaultIgnorables.php', 'int', 'hexchar'),
	'utf8_regex_properties' => $describe_func('RegularExpressions.php', 'string', 'string', array(
		// Unicode data files to scan for the properties listed below.
		'propfiles' => array(
			'DerivedCoreProperties.txt',
			'PropList.txt',
			'emoji/emoji-data.txt',
			'extracted/DerivedGeneralCategory.txt'
		),
		// Properties for which a regular expression class is built.
		'props' => array(
			'Bidi_Control',
			'Cn',
			'Default_Ignorable_Code_Point',
			'Emoji',
			'Emoji_Modifier',
			'Ideographic',
			'Join_Control',
			'Regional_Indicator',
			'Variation_Selector',
		),
	)),
	'utf8_regex_variation_selectors' => $describe_func('RegularExpressions.php', 'string', 'string'),
	'utf8_regex_joining_type' => $describe_func('RegularExpressions.php', 'string', 'string'),
	'utf8_regex_indic' => $describe_func('RegularExpressions.php', 'string', 'string'),
);

unset($describe_func);
119
120
// Stop immediately unless every target file already exists and can be overwritten.
foreach ($funcs as $func_name => $func_info) {
	$target_file = $unicodedir . '/' . $func_info['file'];

	if (is_file($target_file) && is_writable($target_file))
		continue;

	die($target_file . ' not found or not writable.');
}

// Ask for more memory; a failure to change the setting is deliberately ignored.
@ini_set('memory_limit', '256M');
126
127
/*********************************************
128
 * Part 1: Normalization, case folding, etc. *
129
 *********************************************/
130
131
// Collect the derived normalization properties, indexed by property name and
// then by character entity. Some of them are needed further below.
$derived_normalization_props = array();
foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	// Lines without a field separator carry no data.
	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));
	$property = $fields[1];

	if (!isset($derived_normalization_props[$property]))
		$derived_normalization_props[$property] = array();

	// The first field is either one code point or a 'START..END' range.
	$entities = array();
	if (strpos($fields[0], '..') !== false) {
		list($start, $end) = explode('..', $fields[0]);

		for ($ord = hexdec($start), $ord_e = hexdec($end); $ord <= $ord_e; $ord++)
			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
	} else {
		$entities[] = '&#x' . $fields[0] . ';';
	}

	// With no third field the character maps to itself ('SAME'). For FC_NFKC
	// and NFKC_CF the third field is a list of code points, converted here to
	// a list of entities. For anything else the third field is kept verbatim.
	if (!isset($fields[2])) {
		$value = 'SAME';
	} elseif ($property === 'FC_NFKC' || $property === 'NFKC_CF') {
		$mapping = trim($fields[2]);
		$value = $mapping === '' ? '' : '&#x' . str_replace(' ', '; &#x', $mapping) . ';';
	} else {
		$value = $fields[2];
	}

	foreach ($entities as $entity)
		$derived_normalization_props[$property][$entity] = $value === 'SAME' ? $entity : $value;
}
178
179
// Walk through every line of the main Unicode character table.
$char_data = array();
foreach (file($unicode_data_url . '/UnicodeData.txt') as $line) {
	$fields = explode(';', $line);
	$entity = '&#x' . $fields[0] . ';';

	// Combining class; empty or zero values are not recorded.
	if (!empty($fields[3]))
		$funcs['utf8_combining_classes']['data'][$entity] = trim($fields[3]);

	// Uppercase mapping, when there is one.
	if ($fields[12] !== '')
		$funcs['utf8_strtoupper_maps']['data'][$entity] = '&#x' . $fields[12] . ';';

	// Lowercase mapping, when there is one.
	if ($fields[13] !== '')
		$funcs['utf8_strtolower_maps']['data'][$entity] = '&#x' . $fields[13] . ';';

	// The general category is needed later for the regular expression data.
	$char_data[$entity]['General_Category'] = trim($fields[2]);

	// Everything below concerns decompositions only.
	$decomposition = $fields[5];
	if ($decomposition === '')
		continue;

	// Canonical and compatibility decompositions alike, with any <tag> removed.
	$full_decomposition_maps[$entity] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($decomposition))) . ';';

	// A decomposition containing '<' is not canonical, so it is left out of the
	// canonical map.
	if (strpos($decomposition, '<') !== false)
		continue;

	$funcs['utf8_normalize_d_maps']['data'][$entity] = '&#x' . str_replace(' ', '; &#x', trim($decomposition)) . ';';
}
209
210
// Collect the case folding maps.
foreach (file($unicode_data_url . '/CaseFolding.txt') as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	// Full case folding uses the entries with status C or F.
	// (Simple case folding would use C and S instead; it is currently unused.)
	if ($fields[1] !== 'C' && $fields[1] !== 'F')
		continue;

	$funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
}
232
233
// Keep expanding the decompositions until a pass changes nothing. Repeated
// passes are needed because some characters decompose to characters that
// themselves decompose further.
do {
	$expanded = array();

	foreach ($full_decomposition_maps as $composed => $decomposed) {
		$pieces = explode(' ', $decomposed);

		// Swap each piece that has a decomposition of its own for that decomposition.
		foreach ($pieces as $index => $piece) {
			if (isset($full_decomposition_maps[$piece]))
				$pieces[$index] = $full_decomposition_maps[$piece];
		}

		$expanded[$composed] = implode(' ', $pieces);
	}

	$changed = $expanded !== $full_decomposition_maps;
	$full_decomposition_maps = $expanded;
} while ($changed);
258
259
// Same as above, but using only canonical decompositions.

// First, while the canonical map still holds the unexpanded decompositions,
// record the reverse mapping for every character that is not listed under
// Full_Composition_Exclusion.
foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed) {
	if (!in_array($composed, $derived_normalization_props['Full_Composition_Exclusion']))
		$funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
}

// Then expand the canonical decompositions until a pass changes nothing.
do {
	$expanded = array();

	foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed) {
		$pieces = explode(' ', $decomposed);

		foreach ($pieces as $index => $piece) {
			if (isset($funcs['utf8_normalize_d_maps']['data'][$piece]))
				$pieces[$index] = $funcs['utf8_normalize_d_maps']['data'][$piece];
		}

		$expanded[$composed] = implode(' ', $pieces);
	}

	$changed = $expanded !== $funcs['utf8_normalize_d_maps']['data'];
	$funcs['utf8_normalize_d_maps']['data'] = $expanded;
} while ($changed);

// The compatibility map keeps only the entries that are absent from, or
// different in, the canonical map.
$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']);

unset($full_decomposition_maps, $derived_normalization_props);
291
292
// Write out everything collected so far. Functions whose data is still empty
// are left for later.
foreach ($funcs as $func_name => $func_info) {
	if (!empty($func_info['data'])) {
		export_func_to_file($func_name, $func_info);

		// Release the memory, except for the combining classes, which are
		// needed again when the regular expression data is built.
		if ($func_name != 'utf8_combining_classes')
			unset($funcs[$func_name]);
	}
}
303
304
/***********************************
305
 * Part 2: Regular expression data *
306
 ***********************************/
307
308
// Build regular expression classes for extended Unicode properties.
foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename) {
	foreach (file($unicode_data_url . '/' . $filename) as $line) {
		// Drop everything from the first '#' onward.
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));
		$code_points = $fields[0];
		$property = $fields[1];

		// Append this code point (or range) to the class of each wanted property.
		if (in_array($property, $funcs['utf8_regex_properties']['props'])) {
			if (!isset($funcs['utf8_regex_properties']['data'][$property]))
				$funcs['utf8_regex_properties']['data'][$property] = '';

			$funcs['utf8_regex_properties']['data'][$property] .= '\\x{' . str_replace('..', '}-\\x{', $code_points) . '}';
		}

		// Default_Ignorable_Code_Point is additionally tracked character by
		// character in a separate array.
		if ($property !== 'Default_Ignorable_Code_Point')
			continue;

		if (strpos($code_points, '..') === false) {
			$funcs['utf8_default_ignorables']['data'][] = '&#x' . $code_points . ';';
			continue;
		}

		list($start, $end) = explode('..', $code_points);

		for ($ord = hexdec($start), $ord_e = hexdec($end); $ord <= $ord_e; $ord++)
			$funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
	}
}
ksort($funcs['utf8_regex_properties']['data']);
349
350
// Build regular expression classes for filtering variation selectors.
// Step one: for each variation selector, list the base characters it is used with.
$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
foreach ($files as $filename) {
	foreach (file($unicode_data_url . '/' . $filename) as $line) {
		// Drop everything from the first '#' onward.
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		// The first field is the base character followed by the variation selector.
		list($base_char, $variation_selector) = explode(' ', $fields[0]);

		$funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
	}
}

// Formats one run of consecutive code points as a regular expression class range.
$format_range = function ($first, $last) {
	$range = '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

	if ($first != $last)
		$range .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';

	return $range;
};

// Step two: turn each list of base characters into a class string.
foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords) {
	$class_string = '';
	$first = null;
	$last = null;

	foreach ($ords as $ord) {
		// The very first code point opens the first run.
		if ($first === null) {
			$first = $last = $ord;
			continue;
		}

		// A code point directly after the current run extends it.
		if ($ord == $last + 1) {
			$last = $ord;
			continue;
		}

		// Anything else closes the current run and opens a new one.
		$class_string .= $format_range($first, $last);
		$first = $last = $ord;
	}

	// Close the final run, if there was any input at all.
	if ($first !== null)
		$class_string .= $format_range($first, $last);

	$selector_key = $variation_selector;

	// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
	// When an already finished selector has the same class string, merge the two
	// selectors under one combined key.
	$identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data']);
	if ($identical !== false) {
		unset(
			$funcs['utf8_regex_variation_selectors']['data'][$identical],
			$funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
		);

		$compound_selector = array($identical, $variation_selector);
		sort($compound_selector);

		$selector_key = implode('', $compound_selector);
	}

	$funcs['utf8_regex_variation_selectors']['data'][$selector_key] = $class_string;
}
unset($format_range);

krsort($funcs['utf8_regex_variation_selectors']['data']);
418
419
// The regex classes for join control tests require info about language scripts.
$script_stats = array();

// Map each script's alias to its other name, taken from the 'sc' lines of the
// property value alias list.
$script_aliases = array();
foreach (file($unicode_data_url . '/PropertyValueAliases.txt') as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	if ($fields[0] === 'sc')
		$script_aliases[$fields[1]] = $fields[2];
}
439
// Record the script of every character, skipping the Common and Inherited
// pseudo-scripts.
foreach (file($unicode_data_url . '/Scripts.txt') as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));
	$script = $fields[1];

	if ($script === 'Common' || $script === 'Inherited')
		continue;

	// The first field is either one code point or a 'START..END' range.
	if (strpos($fields[0], '..') !== false) {
		list($start, $end) = explode('..', $fields[0]);

		for ($ord = hexdec($start), $ord_e = hexdec($end); $ord <= $ord_e; $ord++)
			$char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';']['scripts'][] = $script;
	} else {
		$char_data['&#x' . $fields[0] . ';']['scripts'][] = $script;
	}
}
468
// Add the scripts listed in ScriptExtensions.txt to each character's script list.
$script_extensions_url = $unicode_data_url . '/ScriptExtensions.txt';
$script_extensions_lines = file($script_extensions_url);

// Do not carry on with partial data if the file could not be fetched.
if ($script_extensions_lines === false)
	die('Could not read ' . $script_extensions_url . '.');

foreach ($script_extensions_lines as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	// The second field is a space separated list of script aliases. Translate
	// them via $script_aliases, leaving out unknown aliases and the Common and
	// Inherited pseudo-scripts.
	$char_scripts = array();
	foreach (explode(' ', $fields[1]) as $alias) {
		if (!isset($script_aliases[$alias]))
			continue;

		if (!in_array($script_aliases[$alias], array('Common', 'Inherited'), true))
			$char_scripts[] = $script_aliases[$alias];
	}

	if (empty($char_scripts))
		continue;

	// The first field is either one code point or a 'START..END' range.
	if (strpos($fields[0], '..') === false) {
		foreach ($char_scripts as $char_script)
			$char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script;

		continue;
	}

	list($start, $end) = explode('..', $fields[0]);
	$ord_e = hexdec($end);

	// Every code point in the range receives every listed script. The code
	// point advances once per character, never once per script.
	for ($ord = hexdec($start); $ord <= $ord_e; $ord++) {
		$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';

		foreach ($char_scripts as $char_script)
			$char_data[$entity]['scripts'][] = $char_script;
	}
}
unset($script_extensions_lines);
505
// For each script, work out the lowest age value among its characters and
// count how many entries were seen for it. Both are used later to rank scripts.
$derived_age_url = $unicode_data_url . '/DerivedAge.txt';
$derived_age_lines = file($derived_age_url);

// Do not carry on with partial data if the file could not be fetched.
if ($derived_age_lines === false)
	die('Could not read ' . $derived_age_url . '.');

foreach ($derived_age_lines as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	$age = (float) $fields[1];

	// Treat a single code point as a range of one, so both cases share one loop.
	if (strpos($fields[0], '..') === false)
		$start = $end = $fields[0];
	else
		list($start, $end) = explode('..', $fields[0]);

	$ord_e = hexdec($end);

	for ($ord = hexdec($start); $ord <= $ord_e; $ord++) {
		// Same key format as is used for ranges everywhere else in this file.
		$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';

		// Many code points listed here have no script recorded for them;
		// empty() skips those without raising an undefined index notice.
		if (empty($char_data[$entity]['scripts']))
			continue;

		foreach ($char_data[$entity]['scripts'] as $char_script) {
			if (!isset($script_stats[$char_script])) {
				$script_stats[$char_script] = array('age' => $age, 'count' => 1);
			} else {
				$script_stats[$char_script]['age'] = min($age, $script_stats[$char_script]['age']);
				$script_stats[$char_script]['count']++;
			}
		}
	}
}
unset($derived_age_lines);
559
560
// Build regex classes for join control tests in utf8_sanitize_invisibles:
// 1. Cursive scripts like Arabic.

// Joining type codes that are of interest, and the names they are stored under.
// Lines with any other code are skipped.
$joining_type_names = array(
	'C' => 'Join_Causing',
	'D' => 'Dual_Joining',
	'R' => 'Right_Joining',
	'L' => 'Left_Joining',
	'T' => 'Transparent',
);

$joining_type_url = $unicode_data_url . '/extracted/DerivedJoiningType.txt';
$joining_type_lines = file($joining_type_url);

// Do not carry on with partial data if the file could not be fetched.
if ($joining_type_lines === false)
	die('Could not read ' . $joining_type_url . '.');

foreach ($joining_type_lines as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	if (!isset($joining_type_names[$fields[1]]))
		continue;

	$joining_type = $joining_type_names[$fields[1]];

	// For a range, the scripts of its first code point stand for the whole range.
	$first_entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

	// Characters without any recorded script are skipped; empty() avoids an
	// undefined index notice for them.
	if (empty($char_data[$first_entity]['scripts']))
		continue;

	foreach ($char_data[$first_entity]['scripts'] as $char_script) {
		// Keep the script's stats alongside its data; they are used for sorting.
		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
			$funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script];

		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
			$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();

		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
	}
}
unset($joining_type_lines);
618
// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
// Scripts with a lower age come first; for equal ages, the higher count comes first.
uasort($funcs['utf8_regex_joining_type']['data'], function($a, $b) {
	// The ages are floats, so they must be compared explicitly. Returning their
	// difference would be truncated to an integer, which makes ages less than
	// 1.0 apart look equal.
	if ($a['stats']['age'] != $b['stats']['age'])
		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;

	return $b['stats']['count'] - $a['stats']['count'];
});
626
// Turn the collected ranges of each script into class strings.
foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types) {
	// The stats were only needed for sorting.
	unset($joining_types['stats'], $funcs['utf8_regex_joining_type']['data'][$char_script]['stats']);

	// If the only joining type in this script is transparent, we don't care about it.
	if (array_keys($joining_types) === array('Transparent')) {
		unset($funcs['utf8_regex_joining_type']['data'][$char_script]);
		continue;
	}

	foreach ($joining_types as $joining_type => $ranges) {
		sort($ranges);
		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = implode('', $ranges);
	}
}
640
641
// 2. Indic scripts like Devanagari.
$indic_category_url = $unicode_data_url . '/IndicSyllabicCategory.txt';
$indic_category_lines = file($indic_category_url);

// Do not carry on with partial data if the file could not be fetched.
if ($indic_category_lines === false)
	die('Could not read ' . $indic_category_url . '.');

foreach ($indic_category_lines as $line) {
	// Drop everything from the first '#' onward.
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
		continue;

	$fields = array_map('trim', explode(';', $line));

	// Only these two categories are collected.
	$insc = $fields[1];
	if ($insc !== 'Virama' && $insc !== 'Vowel_Dependent')
		continue;

	// For a range, the scripts of its first code point stand for the whole range.
	$first_entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

	// Characters without any recorded script are skipped; empty() avoids an
	// undefined index notice for them.
	if (empty($char_data[$first_entity]['scripts']))
		continue;

	foreach ($char_data[$first_entity]['scripts'] as $char_script) {
		// Keep the script's stats alongside its data; they are used for sorting.
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats']))
			$funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script];

		if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc]))
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();

		$funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
	}
}
unset($indic_category_lines);
674
// Again, sort commonly used scripts before rare scripts.
// Scripts with a lower age come first; for equal ages, the higher count comes first.
uasort($funcs['utf8_regex_indic']['data'], function($a, $b) {
	// The ages are floats, so they must be compared explicitly. Returning their
	// difference would be truncated to an integer, which makes ages less than
	// 1.0 apart look equal.
	if ($a['stats']['age'] != $b['stats']['age'])
		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;

	return $b['stats']['count'] - $a['stats']['count'];
});
682
// We only want scripts with viramas. The stats were only needed for sorting,
// so they are removed from the scripts that are kept.
foreach (array_keys($funcs['utf8_regex_indic']['data']) as $char_script) {
	if (isset($funcs['utf8_regex_indic']['data'][$char_script]['Virama']))
		unset($funcs['utf8_regex_indic']['data'][$char_script]['stats']);
	else
		unset($funcs['utf8_regex_indic']['data'][$char_script]);
}
691
// Now add some more classes that we need for each script: all of its
// characters, its letters, its nonspacing marks, and those nonspacing marks
// that have a combining class recorded.
foreach ($char_data as $entity => $info) {
	if (empty($info['scripts']))
		continue;

	$ord = hexdec(trim($entity, '&#x;'));

	foreach ($info['scripts'] as $char_script) {
		// Only the scripts that survived the virama filter are of interest.
		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]))
			continue;

		$funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;

		if ($info['General_Category'] == 'Mn') {
			$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;

			if (!empty($funcs['utf8_combining_classes']['data'][$entity]))
				$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
		} elseif (substr($info['General_Category'], 0, 1) == 'L') {
			// Any general category starting with 'L' counts as a letter.
			$funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
		}
	}
}
715
// Formats one run of consecutive code points as a regular expression class range.
$format_range = function ($first, $last) {
	$range = '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

	if ($first != $last)
		$range .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';

	return $range;
};

// Turn everything collected for each script into class strings.
foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs) {
	foreach ($inscs as $insc => $items) {
		sort($items);

		// These four classes hold code point numbers. All other classes already
		// hold ready-made range strings that only need to be joined.
		if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark'))) {
			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = implode('', $items);
			continue;
		}

		$class_string = '';
		$first = null;
		$last = null;

		foreach ($items as $ord) {
			// The very first code point opens the first run.
			if ($first === null) {
				$first = $last = $ord;
				continue;
			}

			// A code point directly after the current run extends it.
			if ($ord == $last + 1) {
				$last = $ord;
				continue;
			}

			// Anything else closes the current run and opens a new one.
			$class_string .= $format_range($first, $last);
			$first = $last = $ord;
		}

		// Close the final run, if there was any input at all.
		if ($first !== null)
			$class_string .= $format_range($first, $last);

		$funcs['utf8_regex_indic']['data'][$char_script][$insc] = $class_string;
	}

	ksort($funcs['utf8_regex_indic']['data'][$char_script]);
}
unset($format_range);
759
// The combining classes were exported earlier and were kept only as a lookup.
unset($funcs['utf8_combining_classes']);

// Write out everything that is left.
foreach (array_keys($funcs) as $func_name)
	export_func_to_file($func_name, $funcs[$func_name]);
764
765
766
/**
 * Updates a Unicode data function in its designated file.
 *
 * The file named by $func_info['file'] (inside the global $unicodedir) is read,
 * the existing definition of the function is located, and that definition is
 * replaced with a freshly generated one that returns $func_info['data'] as an
 * array. Nothing is written unless the replacement succeeded.
 *
 * @param string $func_name The name of the function.
 * @param array $func_info Info about the function, including its data.
 * @throws RuntimeException If the file cannot be read or written, if the
 *    regular expression fails, or if the function is not found in the file.
 */
function export_func_to_file($func_name, $func_info)
{
	global $unicodedir;

	$file_path = $unicodedir . '/' . $func_info['file'];

	$file_contents = file_get_contents($file_path);

	if ($file_contents === false)
		throw new RuntimeException('Could not read ' . $file_path);

	$func_text = 'function ' . $func_name . '()' . "\n" . '{';

	// Matches from the function's opening brace to the first line consisting
	// of a closing brace.
	$func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s';

	$func_text .= "\n\t" . 'return array(' . "\n";

	build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']);

	$func_text .= "\t" . ');' . "\n" . '}';

	// A callback inserts the generated code verbatim. Given as a plain
	// replacement string, its backslashes and dollar signs would be open to
	// interpretation as escapes and backreferences.
	$updated_contents = preg_replace_callback(
		$func_regex,
		function ($matches) use ($func_text) {
			return $func_text;
		},
		$file_contents,
		-1,
		$num_replaced
	);

	// A null result signals a regular expression error. Writing it out would
	// wipe the file.
	if ($updated_contents === null)
		throw new RuntimeException('Regular expression error while updating ' . $func_name . '() in ' . $file_path);

	if ($num_replaced === 0)
		throw new RuntimeException('Could not find ' . $func_name . '() in ' . $file_path);

	if (file_put_contents($file_path, $updated_contents) === false)
		throw new RuntimeException('Could not write ' . $file_path);
}
792
793
794
/**
 * Helper for export_func_to_file(). Builds the function's data array.
 *
 * Appends one line of PHP array source code per element of $data to
 * $func_text, calling itself for nested arrays. Keys and values of type
 * 'hexchar' are strings of numeric character entities and are written as
 * double-quoted strings of \x byte escapes. Those of type 'string' are written
 * via var_export(). Keys of any other type are omitted, and values of any
 * other type are written as they are.
 *
 * @param string &$func_text The raw string that contains function code.
 * @param array $data Data to format as an array.
 * @param string $key_type How to format the array keys.
 * @param string $val_type How to format the array values.
 */
function build_func_array(&$func_text, $data, $key_type, $val_type)
{
	// Number of tabs in front of each line. It is shared by all nested calls.
	static $indent = 2;

	// Decodes a string of numeric entities to UTF-8 and renders its bytes as a
	// double-quoted PHP string literal of \x escapes.
	$to_byte_literal = function ($entities) {
		$bytes = mb_decode_numericentity(str_replace(' ', '', $entities), array(0,0x10FFFF,0,0xFFFFFF), 'UTF-8');

		$literal = '"';

		foreach (unpack('C*', $bytes) as $byte_value)
			$literal .= '\\x' . strtoupper(dechex($byte_value));

		return $literal . '"';
	};

	foreach ($data as $key => $value) {
		$func_text .= str_repeat("\t", $indent);

		// The key, if keys of this type are written at all.
		switch ($key_type) {
			case 'hexchar':
				$func_text .= $to_byte_literal($key) . ' => ';
				break;

			case 'string':
				$func_text .= var_export($key, true) . ' => ';
				break;
		}

		// The value. Nested arrays are written one level deeper.
		if (is_array($value)) {
			$func_text .= 'array(' . "\n";

			$indent++;
			build_func_array($func_text, $value, $key_type, $val_type);
			$indent--;

			$func_text .= str_repeat("\t", $indent) . ')';
		} else {
			switch ($val_type) {
				case 'hexchar':
					$func_text .= $to_byte_literal($value);
					break;

				case 'string':
					$func_text .= var_export($value, true);
					break;

				default:
					$func_text .= $value;
					break;
			}
		}

		$func_text .= ',' . "\n";
	}
}
849
850
?>