1 | <?php |
||||
2 | |||||
3 | /** |
||||
4 | * This is an internal development file. It should NOT be included in |
||||
5 | * any SMF distribution packages. |
||||
6 | * |
||||
7 | * This file exists to make it easy for developers to update the |
||||
8 | * Unicode data in Subs-Charset.php whenever a new version of the |
||||
9 | * Unicode Character Database is released. Just run this file from the |
||||
10 | * command line in order to perform the update. |
||||
11 | * |
||||
12 | * Simple Machines Forum (SMF) |
||||
13 | * |
||||
14 | * @package SMF |
||||
15 | * @author Simple Machines https://www.simplemachines.org |
||||
16 | * @copyright 2022 Simple Machines and individual contributors |
||||
17 | * @license https://www.simplemachines.org/about/smf/license.php BSD |
||||
18 | * |
||||
19 | * @version 2.1.2 |
||||
20 | */ |
||||
21 | |||||
// Base URL of the Unicode Character Database (UCD) files that are read below.
$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd';

// The generated data files live in the Unicode subdirectory of ../Sources.
$sourcedir = realpath(dirname(__DIR__) . '/Sources');
$unicodedir = $sourcedir . '/Unicode';

// Collects every decomposition mapping, canonical and compatibility alike.
$full_decomposition_maps = array();

/*
 * Describes each function that this script regenerates, keyed by function name:
 *  - 'file': name of the file inside $unicodedir that holds the function.
 *  - 'key_type', 'val_type': format hints for the exported keys and values
 *    (presumably interpreted by export_func_to_file(); confirm there).
 *  - 'data': the array to export; it starts empty and is filled in below.
 *  - 'propfiles', 'props' (utf8_regex_properties only): the UCD files to scan
 *    and the property names to extract from them.
 */
$funcs = array(
	'utf8_normalize_d_maps' => array(
		'file' => 'DecompositionCanonical.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_normalize_kd_maps' => array(
		'file' => 'DecompositionCompatibility.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_compose_maps' => array(
		'file' => 'Composition.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_combining_classes' => array(
		'file' => 'CombiningClasses.php',
		'key_type' => 'hexchar',
		'val_type' => 'int',
		'data' => array(),
	),
	'utf8_strtolower_maps' => array(
		'file' => 'CaseLower.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_strtoupper_maps' => array(
		'file' => 'CaseUpper.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_casefold_maps' => array(
		'file' => 'CaseFold.php',
		'key_type' => 'hexchar',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_default_ignorables' => array(
		'file' => 'DefaultIgnorables.php',
		'key_type' => 'int',
		'val_type' => 'hexchar',
		'data' => array(),
	),
	'utf8_regex_properties' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		// UCD files (relative to $unicode_data_url) that list the properties.
		'propfiles' => array(
			'DerivedCoreProperties.txt',
			'PropList.txt',
			'emoji/emoji-data.txt',
			'extracted/DerivedGeneralCategory.txt',
		),
		// Only these property names are turned into regex classes.
		'props' => array(
			'Bidi_Control',
			'Cn',
			'Default_Ignorable_Code_Point',
			'Emoji',
			'Emoji_Modifier',
			'Ideographic',
			'Join_Control',
			'Regional_Indicator',
			'Variation_Selector',
		),
		'data' => array(),
	),
	'utf8_regex_variation_selectors' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
	'utf8_regex_joining_type' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
	'utf8_regex_indic' => array(
		'file' => 'RegularExpressions.php',
		'key_type' => 'string',
		'val_type' => 'string',
		'data' => array(),
	),
);
||||
119 | |||||
// Refuse to start unless every target file already exists and can be overwritten.
foreach ($funcs as $func_name => $func_info)
{
	$target_file = $unicodedir . '/' . $func_info['file'];

	if (is_file($target_file) && is_writable($target_file))
	{
		continue;
	}

	die($target_file . ' not found or not writable.');
}

// The Unicode tables are large, so ask for more memory (failure is ignored).
@ini_set('memory_limit', '256M');
||||
129 | |||||
130 | /********************************************* |
||||
131 | * Part 1: Normalization, case folding, etc. * |
||||
132 | *********************************************/ |
||||
133 | |||||
// We need some of these for further analysis below.
// Result shape: $derived_normalization_props[property][entity] = value.
$derived_normalization_props = array();
foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line)
{
	// Drop the trailing comment; lines without a field separator hold no data.
	list($line) = explode('#', $line, 2);

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));
	$prop = $fields[1];

	if (!isset($derived_normalization_props[$prop]))
	{
		$derived_normalization_props[$prop] = array();
	}

	// Field 0 is either a single code point or a "start..end" range.
	$entities = array();
	if (strpos($fields[0], '..') !== false)
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_e = hexdec($end);
		for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
		{
			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
		}
	}
	else
	{
		$entities[] = '&#x' . $fields[0] . ';';
	}

	// With no third field, each entity maps to itself ('SAME' marks that case).
	if (!isset($fields[2]))
	{
		$value = 'SAME';
	}
	elseif ($prop !== 'FC_NFKC' && $prop !== 'NFKC_CF')
	{
		$value = $fields[2];
	}
	elseif ($fields[2] === '')
	{
		$value = '';
	}
	else
	{
		// These two properties list code points, which we store as entities.
		$value = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
	}

	foreach ($entities as $entity)
	{
		$derived_normalization_props[$prop][$entity] = ($value === 'SAME') ? $entity : $value;
	}
}
||||
196 | |||||
// Go through all the characters in the Unicode database.
$char_data = array();
foreach (file($unicode_data_url . '/UnicodeData.txt') as $line)
{
	$fields = explode(';', $line);
	$entity = '&#x' . $fields[0] . ';';

	// Combining classes; empty and zero values are not recorded.
	if (!empty($fields[3]))
	{
		$funcs['utf8_combining_classes']['data'][$entity] = trim($fields[3]);
	}

	// Uppercase maps.
	if ($fields[12] !== '')
	{
		$funcs['utf8_strtoupper_maps']['data'][$entity] = '&#x' . $fields[12] . ';';
	}

	// Lowercase maps.
	if ($fields[13] !== '')
	{
		$funcs['utf8_strtolower_maps']['data'][$entity] = '&#x' . $fields[13] . ';';
	}

	// Remember this character's general category for later.
	$char_data[$entity]['General_Category'] = trim($fields[2]);

	// Everything below only concerns characters that have a decomposition.
	if ($fields[5] !== '')
	{
		// All canonical decompositions AND all compatibility decompositions.
		// strip_tags() removes the "<tag>" prefix of compatibility mappings.
		$full_decomposition_maps[$entity] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';

		// Just the canonical decompositions, i.e. those without a "<tag>".
		if (strpos($fields[5], '<') === false)
		{
			$funcs['utf8_normalize_d_maps']['data'][$entity] = '&#x' . str_replace(' ', '; &#x', trim($fields[5])) . ';';
		}
	}
}
||||
237 | |||||
// Collect the case folding maps.
foreach (file($unicode_data_url . '/CaseFolding.txt') as $line)
{
	// Drop the trailing comment; lines without a field separator hold no data.
	list($line) = explode('#', $line, 2);

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Full casefolding: entries whose status is C or F.
	if ($fields[1] === 'C' || $fields[1] === 'F')
	{
		$funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
	}

	// Simple casefolding. Currently unused.
	// if (in_array($fields[1], array('C', 'S')))
	// 	$funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
}
||||
264 | |||||
// Some characters decompose to characters that themselves decompose further,
// so keep substituting until a whole pass leaves the map unchanged.
do
{
	$expanded = array();
	foreach ($full_decomposition_maps as $composed => $decomposed)
	{
		$parts = array();
		foreach (explode(' ', $decomposed) as $hex)
		{
			// Swap in the decomposition of any part that has one.
			$parts[] = isset($full_decomposition_maps[$hex]) ? $full_decomposition_maps[$hex] : $hex;
		}

		$expanded[$composed] = implode(' ', $parts);
	}

	$changed = $full_decomposition_maps !== $expanded;

	$full_decomposition_maps = $expanded;
}
while ($changed);
||||
294 | |||||
// Before expanding the canonical decompositions, record the reverse mapping
// (decomposition => composed character) for every character that is not
// listed under Full_Composition_Exclusion.
foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
{
	if (!in_array($composed, $derived_normalization_props['Full_Composition_Exclusion']))
	{
		$funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
	}
}

// Same fix-point expansion as above, but using only canonical decompositions.
do
{
	$expanded = array();
	foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
	{
		$parts = array();
		foreach (explode(' ', $decomposed) as $hex)
		{
			$parts[] = isset($funcs['utf8_normalize_d_maps']['data'][$hex]) ? $funcs['utf8_normalize_d_maps']['data'][$hex] : $hex;
		}

		$expanded[$composed] = implode(' ', $parts);
	}

	$changed = $funcs['utf8_normalize_d_maps']['data'] !== $expanded;

	$funcs['utf8_normalize_d_maps']['data'] = $expanded;
}
while ($changed);

// The compatibility map keeps only entries that differ from the canonical map.
$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']);
unset($full_decomposition_maps, $derived_normalization_props);
||||
332 | |||||
// Now update the files with the data we've got so far.
// Entries whose data is still empty are left for part 2.
foreach ($funcs as $func_name => $func_info)
{
	if (!empty($func_info['data']))
	{
		export_func_to_file($func_name, $func_info);

		// Free up some memory. The combining classes are read again in part 2.
		if ($func_name !== 'utf8_combining_classes')
		{
			unset($funcs[$func_name]);
		}
	}
}
||||
349 | |||||
350 | /*********************************** |
||||
351 | * Part 2: Regular expression data * |
||||
352 | ***********************************/ |
||||
353 | |||||
// Build regular expression classes for extended Unicode properties.
foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename)
{
	foreach (file($unicode_data_url . '/' . $filename) as $line)
	{
		// Drop the trailing comment; lines without a field separator hold no data.
		list($line) = explode('#', $line, 2);

		if (strpos($line, ';') === false)
		{
			continue;
		}

		$fields = array_map('trim', explode(';', $line));
		list($code_points, $prop) = $fields;

		// Append this code point (or range) to the class of a wanted property.
		if (in_array($prop, $funcs['utf8_regex_properties']['props']))
		{
			if (!isset($funcs['utf8_regex_properties']['data'][$prop]))
			{
				$funcs['utf8_regex_properties']['data'][$prop] = '';
			}

			$funcs['utf8_regex_properties']['data'][$prop] .= '\\x{' . str_replace('..', '}-\\x{', $code_points) . '}';
		}

		// We also track 'Default_Ignorable_Code_Point' property in a separate array.
		if ($prop !== 'Default_Ignorable_Code_Point')
		{
			continue;
		}

		if (strpos($code_points, '..') !== false)
		{
			list($start, $end) = explode('..', $code_points);

			$ord_e = hexdec($end);
			for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
			{
				$funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
			}
		}
		else
		{
			$funcs['utf8_default_ignorables']['data'][] = '&#x' . $code_points . ';';
		}
	}
}
ksort($funcs['utf8_regex_properties']['data']);
||||
409 | |||||
// Build regular expression classes for filtering variation selectors.
// Step 1: for each variation selector, collect the ordinals of the base
// characters it is defined for.
$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
foreach ($files as $filename)
{
	foreach (file($unicode_data_url . '/' . $filename) as $line)
	{
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
		{
			continue;
		}

		$fields = array_map('trim', explode(';', $line));

		// Field 0 is expected to be "<base char> <variation selector>".
		$sequence = explode(' ', $fields[0]);

		if (count($sequence) < 2)
		{
			continue;
		}

		list($base_char, $variation_selector) = $sequence;

		$funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
	}
}

// Turns a list of ordinals into a regex character class body, collapsing runs
// of consecutive ordinals into "\x{start}-\x{end}" ranges.
$ords_to_class = function ($ords)
{
	// The ordinals come from more than one file, so they are neither
	// guaranteed to be unique nor in ascending order. Collapsing needs both.
	$ords = array_unique($ords);
	sort($ords);

	$ranges = array();
	foreach ($ords as $ord)
	{
		$last = count($ranges) - 1;

		if ($last >= 0 && $ord == $ranges[$last][1] + 1)
		{
			$ranges[$last][1] = $ord;
		}
		else
		{
			$ranges[] = array($ord, $ord);
		}
	}

	$class_string = '';
	foreach ($ranges as $range)
	{
		list($start, $end) = $range;

		$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($start))) . '}';

		if ($start != $end)
		{
			$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($end))) . '}';
		}
	}

	return $class_string;
};

// Step 2: replace each list of ordinals with its class string.
foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
{
	$class_string = $ords_to_class($ords);

	// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
	// Selectors that share a class are merged under one compound key. The
	// search is strict because unprocessed entries are still arrays.
	$identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data'], true);

	if ($identical !== false)
	{
		unset(
			$funcs['utf8_regex_variation_selectors']['data'][$identical],
			$funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
		);

		$compound_selector = array($identical, $variation_selector);
		sort($compound_selector);

		$variation_selector = implode('', $compound_selector);
	}

	$funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
}
krsort($funcs['utf8_regex_variation_selectors']['data']);
||||
492 | |||||
// The regex classes for join control tests require info about language scripts.
$script_stats = array();
$script_aliases = array();
foreach (file($unicode_data_url . '/PropertyValueAliases.txt') as $line)
{
	// Drop the trailing comment; lines without a field separator hold no data.
	list($line) = explode('#', $line, 2);

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Only lines for the "sc" property are of interest: they map the value
	// in field 1 to the value in field 2.
	if ($fields[0] === 'sc')
	{
		$script_aliases[$fields[1]] = $fields[2];
	}
}
||||
// Record the script of every character that has a specific one.
foreach (file($unicode_data_url . '/Scripts.txt') as $line)
{
	// Drop the trailing comment; lines without a field separator hold no data.
	list($line) = explode('#', $line, 2);

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));
	$script = $fields[1];

	// 'Common' and 'Inherited' are not recorded.
	if ($script === 'Common' || $script === 'Inherited')
	{
		continue;
	}

	if (strpos($fields[0], '..') !== false)
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_e = hexdec($end);
		for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
		{
			$char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';']['scripts'][] = $script;
		}
	}
	else
	{
		$char_data['&#x' . $fields[0] . ';']['scripts'][] = $script;
	}
}
||||
// Add the scripts listed in ScriptExtensions.txt to each affected character.
foreach (file($unicode_data_url . '/ScriptExtensions.txt') as $line)
{
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	// Field 1 is a space separated list of script aliases. Translate them to
	// the names stored in $script_aliases, dropping 'Common' and 'Inherited'.
	$char_scripts = array();
	foreach (explode(' ', $fields[1]) as $alias)
	{
		if (!isset($script_aliases[$alias]))
		{
			continue;
		}

		if (!in_array($script_aliases[$alias], array('Common', 'Inherited'), true))
		{
			$char_scripts[] = $script_aliases[$alias];
		}
	}

	if (empty($char_scripts))
	{
		continue;
	}

	// Field 0 is either a single code point or a "start..end" range.
	$entities = array();
	if (strpos($fields[0], '..') === false)
	{
		$entities[] = '&#x' . $fields[0] . ';';
	}
	else
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_e = hexdec($end);
		for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
		{
			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
		}
	}

	// Every character in the range gets every listed script. The code point
	// must only advance once all scripts have been assigned to it, otherwise
	// the scripts are spread over consecutive code points and the loop runs
	// past the end of the range.
	foreach ($entities as $entity)
	{
		foreach ($char_scripts as $char_script)
		{
			// Don't record a script the character already has, or the
			// per-script statistics gathered later count it twice.
			if (!isset($char_data[$entity]['scripts']) || !in_array($char_script, $char_data[$entity]['scripts'], true))
			{
				$char_data[$entity]['scripts'][] = $char_script;
			}
		}
	}
}
||||
// For each script, find the lowest age value among its characters and count
// how many characters it has.
foreach (file($unicode_data_url . '/DerivedAge.txt') as $line)
{
	// Drop the trailing comment; lines without a field separator hold no data.
	list($line) = explode('#', $line, 2);

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));
	$fields[1] = (float) $fields[1];

	$is_range = strpos($fields[0], '..') !== false;

	if ($is_range)
	{
		list($start, $end) = explode('..', $fields[0]);

		$ord_s = hexdec($start);
		$ord_e = hexdec($end);
	}
	else
	{
		// A single code point: make the loop below run exactly once.
		$ord_s = 0;
		$ord_e = 0;
	}

	for ($ord = $ord_s; $ord <= $ord_e; $ord++)
	{
		$entity = $is_range ? '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';' : '&#x' . $fields[0] . ';';

		if (empty($char_data[$entity]['scripts']))
		{
			continue;
		}

		foreach ($char_data[$entity]['scripts'] as $char_script)
		{
			if (isset($script_stats[$char_script]))
			{
				$script_stats[$char_script]['age'] = min($fields[1], $script_stats[$char_script]['age']);
				$script_stats[$char_script]['count']++;
			}
			else
			{
				$script_stats[$char_script]['age'] = $fields[1];
				$script_stats[$char_script]['count'] = 1;
			}
		}
	}
}
||||
681 | |||||
// Build regex classes for join control tests in utf8_sanitize_invisibles:
// 1. Cursive scripts like Arabic.
// Maps the single letter codes used in the data file to the full names that
// are used as keys in the exported array. Other codes are ignored.
$joining_type_names = array(
	'C' => 'Join_Causing',
	'D' => 'Dual_Joining',
	'R' => 'Right_Joining',
	'L' => 'Left_Joining',
	'T' => 'Transparent',
);
foreach (file($unicode_data_url . '/extracted/DerivedJoiningType.txt') as $line)
{
	$line = substr($line, 0, strcspn($line, '#'));

	if (strpos($line, ';') === false)
	{
		continue;
	}

	$fields = array_map('trim', explode(';', $line));

	if (!isset($joining_type_names[$fields[1]]))
	{
		continue;
	}

	$joining_type = $joining_type_names[$fields[1]];

	// For a range, the scripts of its first code point stand for all of it.
	$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

	if (empty($char_data[$entity]['scripts']))
	{
		continue;
	}

	foreach ($char_data[$entity]['scripts'] as $char_script)
	{
		// The stats are only attached temporarily, for the sort below.
		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
		{
			$funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script];
		}

		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
		{
			$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();
		}

		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
	}
}
// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
// Lower age first, then higher character count first. The comparator must
// return an integer: sort callbacks have their result cast to int, so
// returning the (float) age difference would make ages that differ by less
// than one compare as equal.
uasort($funcs['utf8_regex_joining_type']['data'], function ($a, $b)
{
	if ($a['stats']['age'] != $b['stats']['age'])
	{
		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
	}

	if ($a['stats']['count'] != $b['stats']['count'])
	{
		return $a['stats']['count'] > $b['stats']['count'] ? -1 : 1;
	}

	return 0;
});
foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
{
	// The stats have served their purpose and must not be exported.
	unset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);

	// If the only joining type in this script is transparent, we don't care about it.
	if (array_keys($joining_types) === array('Transparent'))
	{
		unset($funcs['utf8_regex_joining_type']['data'][$char_script]);
		continue;
	}

	// Flatten each list of class fragments into a single class string.
	foreach ($joining_types as $joining_type => $value)
	{
		sort($value);
		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = implode('', $value);
	}
}
||||
783 | |||||
784 | // 2. Indic scripts like Devanagari. |
||||
785 | foreach (file($unicode_data_url . '/IndicSyllabicCategory.txt') as $line) |
||||
786 | { |
||||
787 | $line = substr($line, 0, strcspn($line, '#')); |
||||
788 | |||||
789 | if (strpos($line, ';') === false) |
||||
790 | { |
||||
791 | continue; |
||||
792 | } |
||||
793 | |||||
794 | $fields = explode(';', $line); |
||||
795 | |||||
796 | foreach ($fields as $key => $value) |
||||
797 | { |
||||
798 | $fields[$key] = trim($value); |
||||
799 | } |
||||
800 | |||||
801 | $insc = $fields[1]; |
||||
802 | |||||
803 | if (!in_array($insc, array('Virama', 'Vowel_Dependent'))) |
||||
804 | { |
||||
805 | continue; |
||||
806 | } |
||||
807 | |||||
808 | $char_scripts = $char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';']['scripts']; |
||||
809 | |||||
810 | if (empty($char_scripts)) |
||||
811 | { |
||||
812 | continue; |
||||
813 | } |
||||
814 | |||||
815 | foreach ($char_scripts as $char_script) |
||||
816 | { |
||||
817 | if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats'])) |
||||
818 | { |
||||
819 | $funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script]; |
||||
820 | } |
||||
821 | |||||
822 | if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc])) |
||||
823 | { |
||||
824 | $funcs['utf8_regex_indic']['data'][$char_script][$insc] = array(); |
||||
825 | } |
||||
826 | |||||
827 | $funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; |
||||
828 | } |
||||
829 | } |
||||
830 | // Again, sort commonly used scripts before rare scripts. |
||||
831 | uasort($funcs['utf8_regex_indic']['data'], function ($a, $b) |
||||
832 | { |
||||
833 | if ($a['stats']['age'] == $b['stats']['age']) |
||||
834 | { |
||||
835 | return $b['stats']['count'] - $a['stats']['count']; |
||||
836 | } |
||||
837 | else |
||||
838 | { |
||||
839 | return $a['stats']['age'] - $b['stats']['age']; |
||||
840 | } |
||||
841 | }); |
||||
842 | // We only want scripts with viramas. |
||||
843 | foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs) |
||||
844 | { |
||||
845 | unset($funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']); |
||||
846 | |||||
847 | if (!isset($inscs['Virama'])) |
||||
848 | { |
||||
849 | unset($funcs['utf8_regex_indic']['data'][$char_script]); |
||||
850 | continue; |
||||
851 | } |
||||
852 | } |
||||
853 | // Now add some more classes that we need for each script. |
||||
854 | foreach ($char_data as $entity => $info) |
||||
855 | { |
||||
856 | if (empty($info['scripts'])) |
||||
857 | { |
||||
858 | continue; |
||||
859 | } |
||||
860 | |||||
861 | $ord = hexdec(trim($entity, '&#x;')); |
||||
862 | |||||
863 | foreach ($info['scripts'] as $char_script) |
||||
864 | { |
||||
865 | if (!isset($funcs['utf8_regex_indic']['data'][$char_script])) |
||||
866 | { |
||||
867 | continue; |
||||
868 | } |
||||
869 | |||||
870 | $funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord; |
||||
871 | |||||
872 | if (empty($info['General_Category'])) |
||||
873 | { |
||||
874 | continue; |
||||
875 | } |
||||
876 | elseif ($info['General_Category'] == 'Mn') |
||||
877 | { |
||||
878 | $funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord; |
||||
879 | |||||
880 | if (!empty($funcs['utf8_combining_classes']['data'][$entity])) |
||||
881 | { |
||||
882 | $funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord; |
||||
883 | } |
||||
884 | } |
||||
885 | elseif (substr($info['General_Category'], 0, 1) == 'L') |
||||
886 | { |
||||
887 | $funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord; |
||||
888 | } |
||||
889 | } |
||||
890 | } |
||||
// Flatten every entry of every script into a single string.
// The classes listed here hold code point numbers, which get turned into
// regex character class syntax. All other entries are simply concatenated.
$range_classes = array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark');

foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
{
	foreach ($inscs as $insc => $value)
	{
		sort($value);

		if (in_array($insc, $range_classes))
		{
			// First pass: merge runs of consecutive code points into ranges.
			$ranges = array();

			foreach ($value as $ord)
			{
				$last = count($ranges) - 1;

				if ($last >= 0 && $ord == $ranges[$last]['end'] + 1)
				{
					// Directly follows the previous code point, so extend that range.
					$ranges[$last]['end'] = $ord;
				}
				else
				{
					$ranges[] = array('start' => $ord, 'end' => $ord);
				}
			}

			// Second pass: write each range as \x{HHHH} or \x{HHHH}-\x{HHHH},
			// using uppercase hex that is zero padded to at least four digits.
			$value = array();

			foreach ($ranges as $range)
			{
				$piece = '\\x{' . strtoupper(str_pad(dechex($range['start']), 4, '0', STR_PAD_LEFT)) . '}';

				if ($range['start'] != $range['end'])
				{
					$piece .= '-\\x{' . strtoupper(str_pad(dechex($range['end']), 4, '0', STR_PAD_LEFT)) . '}';
				}

				$value[] = $piece;
			}
		}

		$funcs['utf8_regex_indic']['data'][$char_script][$insc] = implode('', $value);
	}

	// Keep each script's entries in key order.
	ksort($funcs['utf8_regex_indic']['data'][$char_script]);
}
||||
// The combining class data is removed from $funcs here, so the loop below
// does not export it.
unset($funcs['utf8_combining_classes']);

// Write every remaining function's data out to its designated file.
foreach (array_keys($funcs) as $func_name)
{
	export_func_to_file($func_name, $funcs[$func_name]);
}
||||
952 | |||||
/**
 * Updates a Unicode data function in its designated file.
 *
 * Reads the file named by $func_info['file'] inside $unicodedir, rebuilds the
 * named function so that it returns an array generated from
 * $func_info['data'], replaces the existing definition of that function with
 * the rebuilt one, and writes the file back.
 *
 * If the function cannot be found in the file, a warning is raised and the
 * file is left untouched.
 *
 * @param string $func_name The name of the function.
 * @param array $func_info Info about the function, including its data.
 *    The 'file', 'data', 'key_type', and 'val_type' elements are used.
 * @throws RuntimeException If the file cannot be read or written, or if the
 *    regular expression replacement fails.
 */
function export_func_to_file($func_name, $func_info)
{
	global $unicodedir;

	$file_path = $unicodedir . '/' . $func_info['file'];

	$file_contents = file_get_contents($file_path);

	// Never carry on with a failed read; doing so would overwrite the file with nothing.
	if ($file_contents === false)
	{
		throw new RuntimeException('Unable to read ' . $file_path);
	}

	$func_text = 'function ' . $func_name . '()' . "\n" . '{';

	// Matches from the function's opening brace up to the first subsequent
	// line that begins with a closing brace.
	$func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s';

	$func_text .= "\n\t" . 'return array(' . "\n";

	build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']);

	$func_text .= "\t" . ');' . "\n" . '}';

	// Insert the generated code via a callback so that it is used verbatim.
	// A plain preg_replace() replacement string would interpret sequences
	// such as "\\", "\1", and "$1" inside the generated code.
	$file_contents = preg_replace_callback(
		$func_regex,
		function ($matches) use ($func_text)
		{
			return $func_text;
		},
		$file_contents,
		-1,
		$count
	);

	// A null result means PCRE gave up (e.g. a backtrack limit was hit).
	if ($file_contents === null)
	{
		throw new RuntimeException('Regular expression error (code ' . preg_last_error() . ') while updating ' . $func_name . '() in ' . $file_path);
	}

	// Nothing to write if the function's definition was not found.
	if ($count === 0)
	{
		trigger_error('Could not find ' . $func_name . '() in ' . $file_path, E_USER_WARNING);
		return;
	}

	if (file_put_contents($file_path, $file_contents) === false)
	{
		throw new RuntimeException('Unable to write ' . $file_path);
	}
}
||||
979 | |||||
/**
 * Helper for export_func_to_file(). Builds the function's data array.
 *
 * Appends one line of PHP array source code to $func_text for each element
 * of $data. Nested arrays are written out recursively as nested array(...)
 * expressions, indented one tab deeper than their parent.
 *
 * Supported types:
 *  - 'hexchar': the key or value is a string of numeric entities (spaces are
 *    ignored). It is decoded to UTF-8 and written as a double quoted string
 *    of \xHH byte escapes.
 *  - 'string': the key or value is written using var_export().
 *  - anything else: keys are omitted entirely, and values are appended as is.
 *
 * @param string &$func_text The raw string that contains function code.
 * @param array $data Data to format as an array.
 * @param string $key_type How to format the array keys.
 * @param string $val_type How to format the array values.
 * @param int $indent How many tabs to indent each line by. Default: 2.
 */
function build_func_array(&$func_text, $data, $key_type, $val_type, $indent = 2)
{
	// Decodes numeric entities to UTF-8 and returns the bytes as \xHH escapes.
	$to_byte_escapes = function ($entities)
	{
		$chars = mb_decode_numericentity(str_replace(' ', '', $entities), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

		$escaped = '';

		foreach (unpack('C*', $chars) as $byte_value)
		{
			$escaped .= '\\x' . strtoupper(dechex($byte_value));
		}

		return $escaped;
	};

	foreach ($data as $key => $value)
	{
		$func_text .= str_repeat("\t", $indent);

		// The key, if this type of array has explicit keys.
		if ($key_type === 'hexchar')
		{
			$func_text .= '"' . $to_byte_escapes($key) . '" => ';
		}
		elseif ($key_type === 'string')
		{
			$func_text .= var_export($key, true) . ' => ';
		}

		// The value.
		if (is_array($value))
		{
			$func_text .= 'array(' . "\n";

			// The nesting depth is passed along explicitly, so no state is
			// shared between separate calls to this function.
			build_func_array($func_text, $value, $key_type, $val_type, $indent + 1);

			$func_text .= str_repeat("\t", $indent) . ')';
		}
		elseif ($val_type === 'hexchar')
		{
			$func_text .= '"' . $to_byte_escapes($value) . '"';
		}
		elseif ($val_type === 'string')
		{
			$func_text .= var_export($value, true);
		}
		else
		{
			$func_text .= $value;
		}

		$func_text .= ',' . "\n";
	}
}
||||
1048 | |||||
1049 | ?> |