Total Complexity | 275 |
Total Lines | 2095 |
Duplicated Lines | 0 % |
Changes | 11 | ||
Bugs | 0 | Features | 0 |
Complex classes like Update_Unicode often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Update_Unicode, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
19 | class Update_Unicode extends SMF_BackgroundTask |
||
20 | { |
||
21 | /** |
||
22 | * URLs where we can fetch the Unicode data files. |
||
23 | */ |
||
24 | const DATA_URL_UCD = 'https://unicode.org/Public/UCD/latest/ucd'; |
||
25 | const DATA_URL_IDNA = 'https://www.unicode.org/Public/idna/latest'; |
||
26 | |||
27 | /** |
||
28 | * @var string The latest official release of the Unicode Character Database. |
||
29 | */ |
||
30 | public $ucd_version = ''; |
||
31 | |||
32 | /** |
||
33 | * @var string Path to temporary working directory. |
||
34 | */ |
||
35 | public $temp_dir = ''; |
||
36 | |||
37 | /** |
||
38 | * @var string Convenience alias of $sourcedir . '/Unicode'. |
||
39 | */ |
||
40 | public $unicodedir = ''; |
||
41 | |||
42 | /** |
||
43 | * @var int Used to ensure we exit long running tasks cleanly. |
||
44 | */ |
||
45 | private $time_limit = 30; |
||
46 | |||
47 | /** |
||
48 | * @var array Key-value pairs of character decompositions. |
||
49 | */ |
||
50 | private $full_decomposition_maps = array(); |
||
51 | |||
52 | /** |
||
53 | * @var array Character properties used during normalization. |
||
54 | */ |
||
55 | private $derived_normalization_props = array(); |
||
56 | |||
57 | /** |
||
58 | * @var array Assorted info about Unicode characters. |
||
59 | */ |
||
60 | private $char_data = array(); |
||
61 | |||
62 | /** |
||
63 | * @var array Statistical info about character scripts (e.g. Latin, Greek, Cyrillic, etc.) |
||
64 | */ |
||
65 | private $script_stats = array(); |
||
66 | |||
67 | /** |
||
68 | * @var array Tracks associations between character scripts' short and long names. |
||
69 | */ |
||
70 | private $script_aliases = array(); |
||
71 | |||
72 | /** |
||
73 | * @var array Info about functions to build in SMF's Unicode data files. |
||
74 | */ |
||
75 | private $funcs = array( |
||
76 | array( |
||
77 | 'file' => 'Metadata.php', |
||
78 | 'regex' => '/if \(!defined\(\'SMF_UNICODE_VERSION\'\)\)\n\tdefine\(\'SMF_UNICODE_VERSION\', \'\d+(\.\d+)*\'\);/', |
||
79 | 'data' => array( |
||
80 | // 0.0.0.0 will be replaced with correct value at runtime. |
||
81 | "if (!defined('SMF_UNICODE_VERSION'))\n\tdefine('SMF_UNICODE_VERSION', '0.0.0.0');", |
||
82 | ), |
||
83 | ), |
||
84 | 'utf8_normalize_d_maps' => array( |
||
85 | 'file' => 'DecompositionCanonical.php', |
||
86 | 'key_type' => 'hexchar', |
||
87 | 'val_type' => 'hexchar', |
||
88 | 'desc' => array('Helper function for utf8_normalize_d.'), |
||
89 | 'return' => array( |
||
90 | 'type' => 'array', |
||
91 | 'desc' => 'Canonical Decomposition maps for Unicode normalization.', |
||
92 | ), |
||
93 | 'data' => array(), |
||
94 | ), |
||
95 | 'utf8_normalize_kd_maps' => array( |
||
96 | 'file' => 'DecompositionCompatibility.php', |
||
97 | 'key_type' => 'hexchar', |
||
98 | 'val_type' => 'hexchar', |
||
99 | 'desc' => array('Helper function for utf8_normalize_kd.'), |
||
100 | 'return' => array( |
||
101 | 'type' => 'array', |
||
102 | 'desc' => 'Compatibility Decomposition maps for Unicode normalization.', |
||
103 | ), |
||
104 | 'data' => array(), |
||
105 | ), |
||
106 | 'utf8_compose_maps' => array( |
||
107 | 'file' => 'Composition.php', |
||
108 | 'key_type' => 'hexchar', |
||
109 | 'val_type' => 'hexchar', |
||
110 | 'desc' => array('Helper function for utf8_compose.'), |
||
111 | 'return' => array( |
||
112 | 'type' => 'array', |
||
113 | 'desc' => 'Composition maps for Unicode normalization.', |
||
114 | ), |
||
115 | 'data' => array(), |
||
116 | ), |
||
117 | 'utf8_combining_classes' => array( |
||
118 | 'file' => 'CombiningClasses.php', |
||
119 | 'key_type' => 'hexchar', |
||
120 | 'val_type' => 'int', |
||
121 | 'desc' => array('Helper function for utf8_normalize_d.'), |
||
122 | 'return' => array( |
||
123 | 'type' => 'array', |
||
124 | 'desc' => 'Combining Class data for Unicode normalization.', |
||
125 | ), |
||
126 | 'data' => array(), |
||
127 | ), |
||
128 | 'utf8_strtolower_simple_maps' => array( |
||
129 | 'file' => 'CaseLower.php', |
||
130 | 'key_type' => 'hexchar', |
||
131 | 'val_type' => 'hexchar', |
||
132 | 'desc' => array('Helper function for utf8_strtolower.'), |
||
133 | 'return' => array( |
||
134 | 'type' => 'array', |
||
135 | 'desc' => 'Uppercase to lowercase maps.', |
||
136 | ), |
||
137 | 'data' => array(), |
||
138 | ), |
||
139 | 'utf8_strtolower_maps' => array( |
||
140 | 'file' => 'CaseLower.php', |
||
141 | 'key_type' => 'hexchar', |
||
142 | 'val_type' => 'hexchar', |
||
143 | 'desc' => array('Helper function for utf8_strtolower.'), |
||
144 | 'return' => array( |
||
145 | 'type' => 'array', |
||
146 | 'desc' => 'Uppercase to lowercase maps.', |
||
147 | ), |
||
148 | 'data' => array(), |
||
149 | ), |
||
150 | 'utf8_strtoupper_simple_maps' => array( |
||
151 | 'file' => 'CaseUpper.php', |
||
152 | 'key_type' => 'hexchar', |
||
153 | 'val_type' => 'hexchar', |
||
154 | 'desc' => array('Helper function for utf8_strtoupper.'), |
||
155 | 'return' => array( |
||
156 | 'type' => 'array', |
||
157 | 'desc' => 'Lowercase to uppercase maps.', |
||
158 | ), |
||
159 | 'data' => array(), |
||
160 | ), |
||
161 | 'utf8_strtoupper_maps' => array( |
||
162 | 'file' => 'CaseUpper.php', |
||
163 | 'key_type' => 'hexchar', |
||
164 | 'val_type' => 'hexchar', |
||
165 | 'desc' => array('Helper function for utf8_strtoupper.'), |
||
166 | 'return' => array( |
||
167 | 'type' => 'array', |
||
168 | 'desc' => 'Lowercase to uppercase maps.', |
||
169 | ), |
||
170 | 'data' => array(), |
||
171 | ), |
||
172 | 'utf8_titlecase_simple_maps' => array( |
||
173 | 'file' => 'CaseTitle.php', |
||
174 | 'key_type' => 'hexchar', |
||
175 | 'val_type' => 'hexchar', |
||
176 | 'desc' => array('Helper function for utf8_convert_case.'), |
||
177 | 'return' => array( |
||
178 | 'type' => 'array', |
||
179 | 'desc' => 'Simple title case maps.', |
||
180 | ), |
||
181 | 'data' => array(), |
||
182 | ), |
||
183 | 'utf8_titlecase_maps' => array( |
||
184 | 'file' => 'CaseTitle.php', |
||
185 | 'key_type' => 'hexchar', |
||
186 | 'val_type' => 'hexchar', |
||
187 | 'desc' => array('Helper function for utf8_convert_case.'), |
||
188 | 'return' => array( |
||
189 | 'type' => 'array', |
||
190 | 'desc' => 'Full title case maps.', |
||
191 | ), |
||
192 | 'data' => array(), |
||
193 | ), |
||
194 | 'utf8_casefold_simple_maps' => array( |
||
195 | 'file' => 'CaseFold.php', |
||
196 | 'key_type' => 'hexchar', |
||
197 | 'val_type' => 'hexchar', |
||
198 | 'desc' => array('Helper function for utf8_casefold.'), |
||
199 | 'return' => array( |
||
200 | 'type' => 'array', |
||
201 | 'desc' => 'Casefolding maps.', |
||
202 | ), |
||
203 | 'data' => array(), |
||
204 | ), |
||
205 | 'utf8_casefold_maps' => array( |
||
206 | 'file' => 'CaseFold.php', |
||
207 | 'key_type' => 'hexchar', |
||
208 | 'val_type' => 'hexchar', |
||
209 | 'desc' => array('Helper function for utf8_casefold.'), |
||
210 | 'return' => array( |
||
211 | 'type' => 'array', |
||
212 | 'desc' => 'Casefolding maps.', |
||
213 | ), |
||
214 | 'data' => array(), |
||
215 | ), |
||
216 | 'utf8_default_ignorables' => array( |
||
217 | 'file' => 'DefaultIgnorables.php', |
||
218 | 'key_type' => 'int', |
||
219 | 'val_type' => 'hexchar', |
||
220 | 'desc' => array('Helper function for utf8_normalize_kc_casefold.'), |
||
221 | 'return' => array( |
||
222 | 'type' => 'array', |
||
223 | 'desc' => 'Characters with the \'Default_Ignorable_Code_Point\' property.', |
||
224 | ), |
||
225 | 'data' => array(), |
||
226 | ), |
||
227 | 'utf8_regex_properties' => array( |
||
228 | 'file' => 'RegularExpressions.php', |
||
229 | 'key_type' => 'string', |
||
230 | 'val_type' => 'string', |
||
231 | 'propfiles' => array( |
||
232 | 'DerivedCoreProperties.txt', |
||
233 | 'PropList.txt', |
||
234 | 'emoji/emoji-data.txt', |
||
235 | 'extracted/DerivedGeneralCategory.txt', |
||
236 | ), |
||
237 | 'props' => array( |
||
238 | 'Bidi_Control', |
||
239 | 'Case_Ignorable', |
||
240 | 'Cn', |
||
241 | 'Default_Ignorable_Code_Point', |
||
242 | 'Emoji', |
||
243 | 'Emoji_Modifier', |
||
244 | 'Ideographic', |
||
245 | 'Join_Control', |
||
246 | 'Regional_Indicator', |
||
247 | 'Variation_Selector', |
||
248 | ), |
||
249 | 'desc' => array( |
||
250 | 'Helper function for utf8_sanitize_invisibles and utf8_convert_case.', |
||
251 | '', |
||
252 | 'Character class lists compiled from:', |
||
253 | 'https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt', |
||
254 | 'https://unicode.org/Public/UNIDATA/PropList.txt', |
||
255 | 'https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt', |
||
256 | 'https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt', |
||
257 | ), |
||
258 | 'return' => array( |
||
259 | 'type' => 'array', |
||
260 | 'desc' => 'Character classes for various Unicode properties.', |
||
261 | ), |
||
262 | 'data' => array(), |
||
263 | ), |
||
264 | 'utf8_regex_variation_selectors' => array( |
||
265 | 'file' => 'RegularExpressions.php', |
||
266 | 'key_type' => 'string', |
||
267 | 'val_type' => 'string', |
||
268 | 'desc' => array( |
||
269 | 'Helper function for utf8_sanitize_invisibles.', |
||
270 | '', |
||
271 | 'Character class lists compiled from:', |
||
272 | 'https://unicode.org/Public/UNIDATA/StandardizedVariants.txt', |
||
273 | 'https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt', |
||
274 | ), |
||
275 | 'return' => array( |
||
276 | 'type' => 'array', |
||
277 | 'desc' => 'Character classes for filtering variation selectors.', |
||
278 | ), |
||
279 | 'data' => array(), |
||
280 | ), |
||
281 | 'utf8_regex_joining_type' => array( |
||
282 | 'file' => 'RegularExpressions.php', |
||
283 | 'key_type' => 'string', |
||
284 | 'val_type' => 'string', |
||
285 | 'desc' => array( |
||
286 | 'Helper function for utf8_sanitize_invisibles.', |
||
287 | '', |
||
288 | 'Character class lists compiled from:', |
||
289 | 'https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt', |
||
290 | ), |
||
291 | 'return' => array( |
||
292 | 'type' => 'array', |
||
293 | 'desc' => 'Character classes for joining characters in certain scripts.', |
||
294 | ), |
||
295 | 'data' => array(), |
||
296 | ), |
||
297 | 'utf8_regex_indic' => array( |
||
298 | 'file' => 'RegularExpressions.php', |
||
299 | 'key_type' => 'string', |
||
300 | 'val_type' => 'string', |
||
301 | 'desc' => array( |
||
302 | 'Helper function for utf8_sanitize_invisibles.', |
||
303 | '', |
||
304 | 'Character class lists compiled from:', |
||
305 | 'https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt', |
||
306 | 'https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt', |
||
307 | ), |
||
308 | 'return' => array( |
||
309 | 'type' => 'array', |
||
310 | 'desc' => 'Character classes for Indic scripts that use viramas.', |
||
311 | ), |
||
312 | 'data' => array(), |
||
313 | ), |
||
314 | 'utf8_regex_quick_check' => array( |
||
315 | 'file' => 'QuickCheck.php', |
||
316 | 'key_type' => 'string', |
||
317 | 'val_type' => 'string', |
||
318 | 'desc' => array( |
||
319 | 'Helper function for utf8_is_normalized.', |
||
320 | '', |
||
321 | 'Character class lists compiled from:', |
||
322 | 'https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt', |
||
323 | ), |
||
324 | 'return' => array( |
||
325 | 'type' => 'array', |
||
326 | 'desc' => 'Character classes for disallowed characters in normalization forms.', |
||
327 | ), |
||
328 | 'data' => array(), |
||
329 | ), |
||
330 | 'idna_maps' => array( |
||
331 | 'file' => 'Idna.php', |
||
332 | 'key_type' => 'hexchar', |
||
333 | 'val_type' => 'hexchar', |
||
334 | 'desc' => array('Helper function for idn_to_* polyfills.'), |
||
335 | 'return' => array( |
||
336 | 'type' => 'array', |
||
337 | 'desc' => 'Character maps for IDNA processing.', |
||
338 | ), |
||
339 | 'data' => array(), |
||
340 | ), |
||
341 | 'idna_maps_deviation' => array( |
||
342 | 'file' => 'Idna.php', |
||
343 | 'key_type' => 'hexchar', |
||
344 | 'val_type' => 'hexchar', |
||
345 | 'desc' => array('Helper function for idn_to_* polyfills.'), |
||
346 | 'return' => array( |
||
347 | 'type' => 'array', |
||
348 | 'desc' => '"Deviation" character maps for IDNA processing.', |
||
349 | ), |
||
350 | 'data' => array(), |
||
351 | ), |
||
352 | 'idna_regex' => array( |
||
353 | 'file' => 'Idna.php', |
||
354 | 'key_type' => 'string', |
||
355 | 'val_type' => 'string', |
||
356 | 'desc' => array('Helper function for idn_to_* polyfills.'), |
||
357 | 'return' => array( |
||
358 | 'type' => 'array', |
||
359 | 'desc' => 'Regular expressions useful for IDNA processing.', |
||
360 | ), |
||
361 | 'data' => array(), |
||
362 | ), |
||
363 | ); |
||
364 | |||
365 | /** |
||
366 | * @var array Files to fetch from unicode.org. |
||
367 | */ |
||
368 | private $prefetch = array( |
||
369 | self::DATA_URL_UCD => array( |
||
370 | 'CaseFolding.txt', |
||
371 | 'DerivedAge.txt', |
||
372 | 'DerivedCoreProperties.txt', |
||
373 | 'DerivedNormalizationProps.txt', |
||
374 | 'IndicSyllabicCategory.txt', |
||
375 | 'PropertyValueAliases.txt', |
||
376 | 'PropList.txt', |
||
377 | 'ScriptExtensions.txt', |
||
378 | 'Scripts.txt', |
||
379 | 'SpecialCasing.txt', |
||
380 | 'StandardizedVariants.txt', |
||
381 | 'UnicodeData.txt', |
||
382 | 'emoji/emoji-data.txt', |
||
383 | 'emoji/emoji-variation-sequences.txt', |
||
384 | 'extracted/DerivedGeneralCategory.txt', |
||
385 | 'extracted/DerivedJoiningType.txt', |
||
386 | ), |
||
387 | self::DATA_URL_IDNA => array( |
||
388 | 'IdnaMappingTable.txt', |
||
389 | ), |
||
390 | ); |
||
391 | |||
/**
 * This executes the task.
 *
 * Works inside a lock-protected temporary directory: prefetches the Unicode
 * data files, rebuilds SMF's Unicode helper files there, and only then
 * copies any changed files into $sourcedir/Unicode. If a download fails or
 * time is running short during the prefetch, a new copy of this task is
 * queued and the method returns early.
 *
 * @return bool Always returns true
 */
public function execute()
{
	global $sourcedir, $smcFunc, $txt;

	/*****************
	 * Part 1: Setup *
	 *****************/
	$this->unicodedir = $sourcedir . DIRECTORY_SEPARATOR . 'Unicode';

	// We need a temporary directory to hold our files while we work on them.
	$this->make_temp_dir();

	if (empty($this->temp_dir))
		return true;

	// Prevent race conditions.
	if (is_file($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
		return true;

	if (!@touch($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
		return true;

	// Make sure the lock is released however this request ends.
	register_shutdown_function(function () {
		if (file_exists($this->temp_dir . DIRECTORY_SEPARATOR . 'lock'))
			unlink($this->temp_dir . DIRECTORY_SEPARATOR . 'lock');
	});

	// Do we even need to update?
	if (!$this->should_update())
	{
		$this->deltree($this->temp_dir);
		return true;
	}

	@ini_set('memory_limit', '256M');

	$this->time_limit = (empty(ini_get('max_execution_time')) || @set_time_limit(MAX_CLAIM_THRESHOLD) !== false) ? MAX_CLAIM_THRESHOLD : ini_get('max_execution_time');

	$file_paths = array();
	$file_contents = array();

	// Iterate by value: nothing here modifies $func_info, and a leftover
	// reference would silently overwrite the last entry of $this->funcs
	// when later loops reuse the same variable name.
	foreach ($this->funcs as $func_info)
	{
		$file_paths['final'] = implode(DIRECTORY_SEPARATOR, array($this->unicodedir, $func_info['file']));

		if (!file_exists($file_paths['final']))
			touch($file_paths['final']);

		if (!is_file($file_paths['final']) || !smf_chmod($file_paths['final']))
		{
			loadLanguage('Errors');
			log_error(sprintf($txt['unicode_update_failed'], $this->unicodedir));
			return true;
		}

		$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));

		if (!file_exists($file_paths['temp']))
			touch($file_paths['temp']);

		if (!is_file($file_paths['temp']) || !smf_chmod($file_paths['temp']))
		{
			loadLanguage('Errors');
			log_error(sprintf($txt['unicode_update_failed'], $this->temp_dir));
			return true;
		}

		$file_contents['temp'] = file_get_contents($file_paths['temp']);

		// A new temp file gets the standard header. A temp file that already
		// ends with a closing PHP tag has that tag stripped so that more
		// code can be appended to it.
		if (empty($file_contents['temp']))
		{
			file_put_contents($file_paths['temp'], $this->smf_file_header());
		}
		elseif (substr($file_contents['temp'], -2) === '?' . '>')
		{
			file_put_contents($file_paths['temp'], substr($file_contents['temp'], 0, -2));
		}
	}

	// Prefetch the files in case the network is slow.
	foreach ($this->prefetch as $data_url => $files)
	{
		$max_fetch_time = 0;

		foreach ($files as $filename)
		{
			$fetch_start = microtime(true);

			$local_file = $this->fetch_unicode_file($filename, $data_url);

			$max_fetch_time = max($max_fetch_time, microtime(true) - $fetch_start);

			// If prefetch is taking a really long time, pause and try again later.
			if ($local_file === false || microtime(true) - TIME_START >= $this->time_limit - $max_fetch_time)
			{
				$smcFunc['db_insert']('',
					'{db_prefix}background_tasks',
					array(
						'task_file' => 'string',
						'task_class' => 'string',
						'task_data' => 'string',
						'claimed_time' => 'int',
					),
					array(
						'$sourcedir/tasks/UpdateUnicode.php',
						'Update_Unicode',
						'',
						time() - MAX_CLAIM_THRESHOLD,
					),
					array('id_task')
				);

				return true;
			}
		}
	}

	// Track whether anything goes wrong along the way.
	$success = true;

	/*********************************************
	 * Part 2: Normalization, case folding, etc. *
	 *********************************************/
	$success = $this->process_derived_normalization_props() & $success;
	$success = $this->process_main_unicode_data() & $success;
	$success = $this->process_casing_data() & $success;
	$success = $this->finalize_decomposition_forms() & $success;

	$this->full_decomposition_maps = array();

	$this->export_funcs_to_file();

	/***********************************
	 * Part 3: Regular expression data *
	 ***********************************/
	$success = $this->build_quick_check() & $success;

	$this->derived_normalization_props = array();

	$success = $this->build_regex_properties() & $success;
	$success = $this->build_regex_variation_selectors() & $success;
	$success = $this->build_script_stats() & $success;
	$success = $this->build_regex_joining_type() & $success;
	$success = $this->build_regex_indic() & $success;

	unset($this->funcs['utf8_combining_classes']['data']);

	$this->export_funcs_to_file();

	/*********************************
	 * Part 4: IDNA maps and regexes *
	 *********************************/
	$success = $this->build_idna() & $success;

	$this->export_funcs_to_file();

	/*******************
	 * Part 5: Wrapup. *
	 *******************/
	if ($success)
	{
		require_once($sourcedir . '/Subs-Admin.php');

		foreach ($this->funcs as $func_info)
		{
			$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];

			// If the temp file went missing, bail out immediately.
			if (!is_readable($file_paths['temp']) || !is_writable($file_paths['temp']))
				return true;

			// Add closing PHP tag to the temp file.
			if (!preg_match('/[?]>$/', file_get_contents($file_paths['temp'])))
				file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND);
		}

		foreach ($this->funcs as $func_info)
		{
			$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
			$file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file'];

			// Only move if the file has changed, discounting the license block.
			foreach (array('temp', 'real') as $f)
			{
				if (file_exists($file_paths[$f]))
				{
					$file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f]));
				}
				else
					$file_contents[$f] = '';
			}

			if ($file_contents['temp'] === '')
			{
				$success = false;
			}
			elseif ($file_contents['temp'] !== $file_contents['real'])
			{
				$success &= safe_file_write($file_paths['real'], file_get_contents($file_paths['temp']), $file_paths['real'] . '.bak', time() + 1);
			}
		}

		// glob() returns false on error; treat that as "no backups".
		$backups = glob($this->unicodedir . DIRECTORY_SEPARATOR . '*.bak');

		if ($backups === false)
			$backups = array();

		foreach ($backups as $path)
		{
			// If we wrote all the files successfully, remove the backup files.
			if ($success)
				unlink($path);
			// If any file failed to write, revert all of them by renaming
			// each backup to its own path minus the trailing '.bak'.
			else
				rename($path, substr($path, 0, -4));
		}
	}

	// Clean up after ourselves.
	$this->deltree($this->temp_dir);

	// All done.
	return true;
}
||
616 | |||
/**
 * Creates (if necessary) the 'Unicode' working directory inside SMF's
 * temporary directory and records its path in $this->temp_dir.
 *
 * Does nothing when $this->temp_dir is already set. If the directory cannot
 * be created or made writable, $this->temp_dir is set to null.
 */
private function make_temp_dir()
{
	global $sourcedir;

	// Already have one? Then there is nothing to do.
	if (!empty($this->temp_dir))
		return;

	require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php');

	$path = rtrim(sm_temp_dir(), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'Unicode';

	if (!is_dir($path))
		@mkdir($path);

	// Needs to be a writable directory.
	$this->temp_dir = (is_dir($path) && smf_chmod($path)) ? $path : null;
}
||
639 | |||
/**
 * Fetches the contents of a Unicode data file.
 *
 * Caches a local copy under $this->temp_dir ('idna' sub-directory for
 * DATA_URL_IDNA, 'ucd' for anything else) for subsequent lookups.
 *
 * @param string $filename Name of a Unicode datafile, relative to $data_url.
 * @param string $data_url One of this class's DATA_URL_* constants.
 *
 * @return string|false Path to locally saved copy of the file, or false if
 *    the file could not be downloaded or saved.
 */
private function fetch_unicode_file($filename, $data_url)
{
	$filename = ltrim($filename, '\\/');
	$file_url_name = strtr($filename, array('\\' => '/'));
	$file_local_name = strtr($filename, array('\\' => DIRECTORY_SEPARATOR, '/' => DIRECTORY_SEPARATOR));

	$sub_dir = $data_url === self::DATA_URL_IDNA ? 'idna' : 'ucd';

	$local_file = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $sub_dir, $file_local_name));

	// Already downloaded on an earlier call.
	if (file_exists($local_file))
		return $local_file;

	if (!file_exists(dirname($local_file)))
	{
		@mkdir(dirname($local_file), 0777, true);

		if (!is_dir(dirname($local_file)))
			return false;
	}

	$file_contents = fetch_web_data($data_url . '/' . $file_url_name);

	if (empty($file_contents))
		return false;

	// Don't report success unless the copy really was saved. A partial file
	// is removed so that the next call downloads it again instead of
	// treating it as a valid cached copy.
	if (file_put_contents($local_file, $file_contents) === false)
	{
		if (is_file($local_file))
			@unlink($local_file);

		return false;
	}

	return $local_file;
}
||
691 | |||
/**
 * Deletes a directory and its contents.
 *
 * Only acts on paths that start with $this->temp_dir. Does nothing if
 * $this->temp_dir is empty or if the given path is not a directory.
 *
 * @param string $dir_path Path to directory
 */
private function deltree($dir_path)
{
	// For safety. An empty temp_dir must be rejected explicitly, because
	// on PHP 8 strpos() with an empty needle returns 0, which would let
	// every path pass the prefix test.
	if (empty($this->temp_dir) || strpos($dir_path, $this->temp_dir) !== 0)
		return;

	// DirectoryIterator throws if the directory does not exist.
	if (!is_dir($dir_path))
		return;

	$dir = new DirectoryIterator($dir_path);

	// Collect plain files first and delete them after iteration is done.
	$to_delete = array();
	foreach ($dir as $fileinfo)
	{
		if ($fileinfo->isDot())
			continue;

		if ($fileinfo->isDir())
			$this->deltree($fileinfo->getPathname());
		else
			$to_delete[] = $fileinfo->getPathname();
	}

	foreach ($to_delete as $pathname)
		unlink($pathname);

	rmdir($dir_path);
}
||
722 | |||
/**
 * Gets basic boilerplate for the PHP files that will be created.
 *
 * The license block is derived from the text of the first settings
 * definition: lines are copied up to and including the '/**' opener, then
 * skipped until a line mentioning 'SMF' or 'Simple Machines' turns copying
 * back on. The result is built once and reused on later calls.
 *
 * @return string Standard SMF file header.
 */
private function smf_file_header()
{
	global $sourcedir;

	static $file_template;

	if (empty($file_template))
	{
		require_once($sourcedir . '/Subs-Admin.php');
		$settings_defs = get_settings_defs();

		$kept_lines = array();
		$copying = true;

		foreach (explode("\n", $settings_defs[0]['text']) as $line)
		{
			// Lines naming SMF or Simple Machines switch copying back on.
			$copying = $copying || strpos($line, 'SMF') !== false || strpos($line, 'Simple Machines') !== false;

			if ($copying)
				$kept_lines[] = $line . "\n";

			// Stop copying right after the comment opener.
			if ($line === '/**')
				$copying = false;
		}

		$file_template = implode("\n\n", array(
			'<' . '?php',
			trim(implode('', $kept_lines)),
			"if (!defined('SMF'))\n\tdie('No direct access...');",
			'',
		));
	}

	return $file_template;
}
||
764 | |||
/**
 * Updates Unicode data functions in their designated files.
 *
 * For every entry in $this->funcs that still has a 'data' element, builds
 * the code via get_function_code_and_regex() and writes it to the entry's
 * file in $this->temp_dir. An existing copy in the file is replaced;
 * otherwise the code is appended. Afterwards the entry's data is discarded,
 * except for 'utf8_combining_classes'.
 */
public function export_funcs_to_file()
{
	foreach ($this->funcs as $func_name => $func_info)
	{
		if (!isset($func_info['data']))
			continue;

		$temp_file_path = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];

		list($func_code, $func_regex) = $this->get_function_code_and_regex($func_name);

		$file_contents = file_get_contents($temp_file_path);

		if (preg_match($func_regex, $file_contents))
		{
			// Use a callback so that the generated code is inserted
			// literally. A plain replacement string would have its
			// backslash and dollar sequences interpreted, which could make
			// the result differ from what the append branch writes.
			$updated = preg_replace_callback(
				$func_regex,
				function ($matches) use ($func_code)
				{
					return $func_code;
				},
				$file_contents
			);

			file_put_contents($temp_file_path, $updated);
		}
		else
		{
			file_put_contents($temp_file_path, $func_code . "\n\n", FILE_APPEND);
		}

		// Free up some memory. Strict comparison, because the raw-code
		// entry has an integer key, and a loose comparison of an integer
		// with this string gives different answers on PHP 7 and PHP 8.
		if ($func_name !== 'utf8_combining_classes')
			unset($this->funcs[$func_name]['data']);
	}
}
||
795 | |||
/**
 * Builds complete code for the specified element in $this->funcs
 * to be inserted into the relevant PHP file. Also builds a regex
 * to check whether a copy of the function is already present
 * in the file.
 *
 * Elements with a string key become a documented function that returns
 * an array. Elements with a non-string key are treated as raw code.
 *
 * @param string $func_name Key of an element in $this->funcs.
 *
 * @return array PHP code and a regular expression.
 */
private function get_function_code_and_regex($func_name)
{
	$info = $this->funcs[$func_name];

	if (is_string($func_name))
	{
		// The regex to look for this function in the existing file content.
		$func_regex = "/(\/\*([^*]|\*(?!\/))*\*\/\n)?function $func_name\(\)\n{.+?\n}/s";

		// The PHPDoc comment for this function.
		$doc_lines = array_merge(
			array(''),
			$info['desc'],
			array(
				'',
				'Developers: Do not update the data in this function manually. Instead,',
				'run "php -f other/update_unicode_data.php" on the command line.',
			),
			empty($info['return']) ? array() : array(
				'',
				'@return ' . implode(' ', $info['return'])
			)
		);

		$func_code = '/**' . implode("\n * ", $doc_lines) . "\n */\n";

		// The opening lines of the function itself.
		$func_code .= implode("\n", array(
			'function ' . $func_name . '()',
			'{',
			"\t" . 'return array(',
			'',
		));

		// The array contents.
		$this->build_func_array($func_code, $info['data'], $info['key_type'], $info['val_type']);

		// The closing lines.
		$func_code .= implode("\n", array(
			"\t" . ');',
			'}',
		));
	}
	else
	{
		// No function name means data is raw code.
		$func_code = implode("\n\n", $info['data']);
		$func_regex = isset($info['regex']) ? $info['regex'] : '/' . preg_quote($func_code, '/') . '/';
	}

	// Some final tidying: undo doubled backslashes before 'x' and strip
	// trailing horizontal whitespace from every line.
	$func_code = preg_replace('/\h+$/m', '', str_replace('\\\\x', '\x', $func_code));

	return array($func_code, $func_regex);
}
||
861 | |||
/**
 * Helper for get_function_code_and_regex(). Builds the function's data array.
 *
 * Appends one line per element of $data to $func_code, recursing into
 * nested arrays. The indentation depth is tracked in a static variable
 * that is raised and lowered around each recursive call.
 *
 * @param string &$func_code The raw string that contains function code.
 * @param array $data Data to format as an array.
 * @param string $key_type How to format the array keys.
 * @param string $val_type How to format the array values.
 */
private function build_func_array(&$func_code, $data, $key_type, $val_type)
{
	static $indent = 2;

	// Decodes a string of numeric entities and renders the resulting bytes
	// as a double-quoted literal made of \xNN escapes.
	$to_hex_literal = static function ($entities)
	{
		$bytes = mb_decode_numericentity(str_replace(' ', '', $entities), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

		$literal = '"';

		foreach (unpack('C*', $bytes) as $byte_value)
			$literal .= '\x' . strtoupper(dechex($byte_value));

		return $literal . '"';
	};

	foreach ($data as $key => $value)
	{
		$func_code .= str_repeat("\t", $indent);

		// The key, if this key type calls for one.
		if ($key_type == 'hexchar')
			$func_code .= $to_hex_literal($key) . ' => ';
		elseif ($key_type == 'string' && !is_int($key))
			$func_code .= var_export($key, true) . ' => ';

		// Scalar values.
		if (!is_array($value))
		{
			if ($val_type == 'hexchar')
				$func_code .= $to_hex_literal($value);
			elseif ($val_type == 'string')
				$func_code .= var_export($value, true);
			else
				$func_code .= $value;
		}
		// A flat list of strings is written as one concatenated string,
		// with each piece on its own line.
		elseif ($val_type == 'string' && count($value) === count($value, COUNT_RECURSIVE))
		{
			$nextline = "\n" . str_repeat("\t", $indent + 1);

			$pieces = array();

			foreach ($value as $piece)
				$pieces[] = var_export($piece, true);

			$func_code = rtrim($func_code) . $nextline . implode(' .' . $nextline, $pieces);
		}
		// Anything else becomes a nested array, one level deeper.
		else
		{
			$func_code .= 'array(' . "\n";

			$indent++;
			$this->build_func_array($func_code, $value, $key_type, $val_type);
			$indent--;

			$func_code .= str_repeat("\t", $indent) . ')';
		}

		$func_code .= ',' . "\n";
	}
}
947 | |||
/**
 * Compares version of SMF's local Unicode data with the latest release.
 *
 * Returns false if the latest version could not be looked up, and true if
 * the local Metadata.php cannot be included or does not define
 * SMF_UNICODE_VERSION. Otherwise returns true when the latest version is
 * greater than or equal to the local one.
 *
 * @return bool Whether SMF should update its local Unicode data or not.
 */
private function should_update()
{
	$this->lookup_ucd_version();

	// We can't do anything if lookup failed.
	if (empty($this->ucd_version))
		return false;

	// If this file is missing, force an update.
	if (!@include_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php'))
		return true;

	// A file that didn't give us a version to compare against is as good as
	// missing. Reading an undefined constant would be a fatal error.
	if (!defined('SMF_UNICODE_VERSION'))
		return true;

	return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>=');
}
||
967 | |||
/**
 * Sets $this->ucd_version to latest version number of the UCD.
 *
 * The version is read from the UCD's ReadMe.txt and padded to four
 * dot-separated parts. The '0.0.0.0' placeholder in the data for
 * Metadata.php is replaced with the version at the same time.
 *
 * @return bool Whether $this->ucd_version is set.
 */
private function lookup_ucd_version()
{
	// Already looked it up.
	if (!empty($this->ucd_version))
		return true;

	$local_file = $this->fetch_unicode_file('ReadMe.txt', self::DATA_URL_UCD);

	if (empty($local_file))
		return false;

	$readme = file_get_contents($local_file);

	// An unreadable file can't tell us anything.
	if ($readme === false)
		return false;

	preg_match('/Version\s+(\d+(?:\.\d+)*)/', $readme, $matches);

	if (empty($matches[1]))
		return false;

	$this->ucd_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0'));

	// Update this while we are at it.
	foreach ($this->funcs as &$func_info)
	{
		if ($func_info['file'] === 'Metadata.php')
		{
			$func_info['data'][0] = str_replace('0.0.0.0', $this->ucd_version, $func_info['data'][0]);

			break;
		}
	}

	// Don't leave a reference into $this->funcs lying around.
	unset($func_info);

	return true;
}
||
1003 | |||
/**
 * Processes DerivedNormalizationProps.txt in order to populate
 * $this->derived_normalization_props.
 *
 * The result is keyed by property name, then by character entity. Ranges
 * in the file are expanded to one entry per code point. Where the file
 * gives no value for a property, the entity itself is stored as the value.
 *
 * @return bool False if the data file could not be fetched, true otherwise.
 */
private function process_derived_normalization_props()
{
	$local_file = $this->fetch_unicode_file('DerivedNormalizationProps.txt', self::DATA_URL_UCD);

	if (empty($local_file))
		return false;

	foreach (file($local_file) as $line)
	{
		// Drop comments, then skip anything that isn't a data line.
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		$prop = $fields[1];

		if (!isset($this->derived_normalization_props[$prop]))
			$this->derived_normalization_props[$prop] = array();

		// Which characters does this line apply to?
		$entities = array();

		if (strpos($fields[0], '..') !== false)
		{
			list($start, $end) = explode('..', $fields[0]);

			$ord_e = hexdec($end);

			for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
				$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
		}
		else
		{
			$entities[] = '&#x' . $fields[0] . ';';
		}

		// What value do they get? 'SAME' means "the character itself".
		if (!isset($fields[2]))
			$value = 'SAME';
		elseif ($prop === 'FC_NFKC' || $prop === 'NFKC_CF')
		{
			// These values are sequences of code points, so turn them into entities.
			$sequence = trim($fields[2]);

			$value = $sequence === '' ? '' : '&#x' . str_replace(' ', '; &#x', $sequence) . ';';
		}
		else
			$value = $fields[2];

		foreach ($entities as $entity)
		{
			if ($value === 'SAME')
				$this->derived_normalization_props[$prop][$entity] = $entity;
			else
				$this->derived_normalization_props[$prop][$entity] = $value;
		}
	}

	return true;
}
||
1078 | |||
/**
 * Processes UnicodeData.txt in order to populate $this->char_data,
 * $this->full_decomposition_maps, and the 'data' element of most elements
 * of $this->funcs.
 *
 * Specifically, this fills in the combining classes, the simple upper,
 * lower, and title case maps, each character's general category, and the
 * canonical and full decomposition maps.
 *
 * @return bool False if the data file could not be fetched, true otherwise.
 */
private function process_main_unicode_data()
{
	$local_file = $this->fetch_unicode_file('UnicodeData.txt', self::DATA_URL_UCD);

	if (empty($local_file))
		return false;

	foreach (file($local_file) as $line)
	{
		$fields = array_map('trim', explode(';', $line));

		// The code below reads fields 0 through 14. A blank or truncated
		// line would otherwise read undefined offsets and add bogus
		// '&#x;' entries to the maps, so skip it.
		if (!isset($fields[14]))
			continue;

		$entity = '&#x' . $fields[0] . ';';

		// Combining classes. A class of 0 is not recorded.
		if (!empty($fields[3]))
			$this->funcs['utf8_combining_classes']['data'][$entity] = $fields[3];

		// Uppercase maps.
		if ($fields[12] !== '')
			$this->funcs['utf8_strtoupper_simple_maps']['data'][$entity] = '&#x' . $fields[12] . ';';

		// Lowercase maps.
		if ($fields[13] !== '')
			$this->funcs['utf8_strtolower_simple_maps']['data'][$entity] = '&#x' . $fields[13] . ';';

		// Titlecase maps, where different from uppercase maps.
		if ($fields[14] !== '' && $fields[14] !== $fields[12])
			$this->funcs['utf8_titlecase_simple_maps']['data'][$entity] = '&#x' . $fields[14] . ';';

		// Remember this character's general category for later.
		$this->char_data[$entity]['General_Category'] = $fields[2];

		// No decomposition? Then we're done with this character.
		if ($fields[5] === '')
			continue;

		// All canonical decompositions AND all compatibility decompositions.
		// Compatibility decompositions are prefixed with a <tag>, which strip_tags() removes.
		$this->full_decomposition_maps[$entity] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';

		// Just the canonical decompositions.
		if (strpos($fields[5], '<') === false)
			$this->funcs['utf8_normalize_d_maps']['data'][$entity] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';';
	}

	return true;
}
||
1143 | |||
/**
 * Processes SpecialCasing.txt and CaseFolding.txt in order to get
 * finalized versions of all case conversion data.
 *
 * The full case maps start out as copies of the simple ones and are then
 * overridden by the unconditional mappings in SpecialCasing.txt.
 *
 * @return bool False if a data file could not be fetched, true otherwise.
 */
private function process_casing_data()
{
	// Turns a space-separated list of hex code points into a list of entities.
	$to_entities = function ($hex_sequence)
	{
		return '&#x' . str_replace(' ', '; &#x', trim($hex_sequence)) . ';';
	};

	// Full case conversion maps are the same as the simple ones, unless they're not.
	foreach (array('utf8_strtoupper', 'utf8_strtolower', 'utf8_titlecase') as $prefix)
		$this->funcs[$prefix . '_maps']['data'] = $this->funcs[$prefix . '_simple_maps']['data'];

	// Deal with the special casing data.
	$special_casing = $this->fetch_unicode_file('SpecialCasing.txt', self::DATA_URL_UCD);

	if (empty($special_casing))
		return false;

	foreach (file($special_casing) as $line)
	{
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		// Conditional mappings need to be handled by more complex code,
		// so only the unconditional ones are recorded here.
		if (!empty($fields[4]))
			continue;

		$entity = '&#x' . $fields[0] . ';';

		$this->funcs['utf8_strtolower_maps']['data'][$entity] = $to_entities($fields[1]);
		$this->funcs['utf8_strtoupper_maps']['data'][$entity] = $to_entities($fields[3]);

		// Titlecase only where different from uppercase.
		if ($fields[3] !== $fields[2])
			$this->funcs['utf8_titlecase_maps']['data'][$entity] = $to_entities($fields[2]);
	}

	ksort($this->funcs['utf8_strtolower_maps']['data']);
	ksort($this->funcs['utf8_strtoupper_maps']['data']);
	ksort($this->funcs['utf8_titlecase_maps']['data']);

	// Deal with the case folding data.
	$case_folding = $this->fetch_unicode_file('CaseFolding.txt', self::DATA_URL_UCD);

	if (empty($case_folding))
		return false;

	foreach (file($case_folding) as $line)
	{
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		$entity = '&#x' . $fields[0] . ';';
		$status = $fields[1];

		// Full casefolding uses the common (C) and full (F) mappings.
		if (in_array($status, array('C', 'F')))
			$this->funcs['utf8_casefold_maps']['data'][$entity] = $to_entities($fields[2]);

		// Simple casefolding uses the common (C) and simple (S) mappings.
		if (in_array($status, array('C', 'S')))
			$this->funcs['utf8_casefold_simple_maps']['data'][$entity] = $to_entities($fields[2]);
	}

	return true;
}
||
1232 | |||
/**
 * Finalizes all the decomposition forms.
 *
 * This is necessary because some characters decompose to other characters
 * that themselves decompose further.
 *
 * Also builds the composition maps from the initial canonical
 * decompositions, and reduces the compatibility decomposition maps to
 * only those entries that differ from the canonical ones.
 *
 * @return bool Always true.
 */
private function finalize_decomposition_forms()
{
	// Performs one round of expansion: each part of each decomposition
	// that has a decomposition of its own is replaced by it.
	$expand_once = function ($maps)
	{
		$expanded = array();

		foreach ($maps as $composed => $decomposed)
		{
			$parts = explode(' ', $decomposed);

			foreach ($parts as $partnum => $hex)
			{
				if (isset($maps[$hex]))
					$parts[$partnum] = $maps[$hex];
			}

			$expanded[$composed] = implode(' ', $parts);
		}

		return $expanded;
	};

	// Iterate until we reach the final decomposition forms.
	// First we do the compatibility decomposition forms.
	do
	{
		$expanded = $expand_once($this->full_decomposition_maps);

		$changed = $this->full_decomposition_maps !== $expanded;

		$this->full_decomposition_maps = $expanded;
	}
	while ($changed);

	// Composition maps are the reverse of the initial canonical
	// decompositions, apart from characters excluded from composition.
	foreach ($this->funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
	{
		if (!in_array($composed, $this->derived_normalization_props['Full_Composition_Exclusion']))
			$this->funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
	}

	// Same as above, but using only canonical decompositions.
	do
	{
		$expanded = $expand_once($this->funcs['utf8_normalize_d_maps']['data']);

		$changed = $this->funcs['utf8_normalize_d_maps']['data'] !== $expanded;

		$this->funcs['utf8_normalize_d_maps']['data'] = $expanded;
	}
	while ($changed);

	// Avoid bloat.
	$this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']);

	return true;
}
||
1310 | |||
/**
 * Builds regular expressions for normalization quick check.
 *
 * For each property, consecutive code points are collapsed into ranges,
 * and each character or range is stored as a separate string.
 *
 * @return bool Always true.
 */
private function build_quick_check()
{
	// Formats a run of consecutive code points as "\x{A}" or "\x{A}-\x{B}".
	$format_range = function ($first, $last)
	{
		$formatted = '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

		if ($first != $last)
			$formatted .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';

		return $formatted;
	};

	foreach (array('NFC_QC', 'NFKC_QC', 'NFD_QC', 'NFKD_QC', 'Changes_When_NFKC_Casefolded') as $prop)
	{
		$first = null;
		$last = null;

		foreach ($this->derived_normalization_props[$prop] as $entity => $nm)
		{
			$ord = hexdec(trim($entity, '&#x;'));

			if (!isset($first))
				$first = $ord;

			// Still in the same run of consecutive code points?
			if (!isset($last) || $ord == $last + 1)
			{
				$last = $ord;
				continue;
			}

			// The run has ended, so save it and start a new one.
			$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $format_range($first, $last);

			$first = $ord;
			$last = $ord;
		}

		// Don't forget the final run.
		if (isset($first))
			$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $format_range($first, $last);
	}

	return true;
}
||
1364 | |||
/**
 * Builds regular expression classes for extended Unicode properties.
 *
 * Also collects every character with the Default_Ignorable_Code_Point
 * property into the data for utf8_default_ignorables.
 *
 * @return bool False if a data file could not be fetched, true otherwise.
 */
private function build_regex_properties()
{
	foreach ($this->funcs['utf8_regex_properties']['propfiles'] as $filename)
	{
		$local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
				continue;

			$fields = array_map('trim', explode(';', $line));

			$chars = $fields[0];
			$prop = $fields[1];

			// Is this one of the properties we want a regex class for?
			if (in_array($prop, $this->funcs['utf8_regex_properties']['props']))
				$this->funcs['utf8_regex_properties']['data'][$prop][] = '\\x{' . str_replace('..', '}-\\x{', $chars) . '}';

			// We also track 'Default_Ignorable_Code_Point' property in a separate array.
			if ($prop === 'Default_Ignorable_Code_Point')
			{
				if (strpos($chars, '..') !== false)
				{
					// Expand the range to one entity per code point.
					list($start, $end) = explode('..', $chars);

					$ord_e = hexdec($end);

					for ($ord = hexdec($start); $ord <= $ord_e; $ord++)
						$this->funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
				}
				else
				{
					$this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $chars . ';';
				}
			}
		}
	}

	ksort($this->funcs['utf8_regex_properties']['data']);

	return true;
}
||
1433 | |||
/**
 * Builds regular expression classes for filtering variation selectors.
 *
 * The resulting data is keyed by variation selector (or by several
 * selectors joined together, where they apply to identical sets of base
 * characters). Each value is a list of the characters and ranges of base
 * characters that the selector can follow.
 *
 * @return bool False if a data file could not be fetched, true otherwise.
 */
private function build_regex_variation_selectors()
{
	// Collapses a list of code points into a string of characters and ranges.
	$build_class_string = function ($ords)
	{
		$class_string = '';
		$first = null;
		$last = null;

		foreach ($ords as $ord)
		{
			if (!isset($first))
				$first = $ord;

			// Still in the same run of consecutive code points?
			if (!isset($last) || $ord == $last + 1)
			{
				$last = $ord;
				continue;
			}

			// The run has ended, so write it out and start a new one.
			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

			if ($first != $last)
				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';

			$first = $ord;
			$last = $ord;
		}

		// Write out the final run.
		if (isset($first))
		{
			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

			if ($first != $last)
				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';
		}

		return $class_string;
	};

	// First, find the base characters for each variation selector.
	foreach (array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt') as $filename)
	{
		$local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD);

		if (empty($local_file))
			return false;

		foreach (file($local_file) as $line)
		{
			$line = substr($line, 0, strcspn($line, '#'));

			if (strpos($line, ';') === false)
				continue;

			$fields = array_map('trim', explode(';', $line));

			list($base_char, $variation_selector) = explode(' ', $fields[0]);

			$this->funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
		}
	}

	// Next, replace each list of code points with a class string.
	foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
	{
		$class_string = $build_class_string($ords);

		// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
		$identical = array_search($class_string, $this->funcs['utf8_regex_variation_selectors']['data']);

		if ($identical !== false)
		{
			// Merge the two selectors into a single compound key.
			unset(
				$this->funcs['utf8_regex_variation_selectors']['data'][$identical],
				$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
			);

			$compound_selector = array($identical, $variation_selector);
			sort($compound_selector);

			$variation_selector = implode('', $compound_selector);
		}

		$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
	}

	// Finally, split each class string into its individual characters and ranges.
	foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string)
		$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string);

	krsort($this->funcs['utf8_regex_variation_selectors']['data']);

	return true;
}
||
1536 | |||
1537 | /** |
||
1538 | * Helper function for build_regex_joining_type and build_regex_indic. |
||
1539 | */ |
||
1540 | private function build_script_stats() |
||
1752 | } |
||
1753 | |||
/**
 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
 * Specifically, for cursive scripts like Arabic.
 *
 * The resulting data is keyed by script, then by joining type. Each value
 * is a sorted list of characters and ranges. Scripts whose only joining
 * type is Transparent are left out.
 *
 * @return bool False if the data file could not be fetched, true otherwise.
 */
private function build_regex_joining_type()
{
	// The joining types we care about. Anything else is ignored.
	$joining_type_names = array(
		'C' => 'Join_Causing',
		'D' => 'Dual_Joining',
		'R' => 'Right_Joining',
		'L' => 'Left_Joining',
		'T' => 'Transparent',
	);

	$local_file = $this->fetch_unicode_file('extracted/DerivedJoiningType.txt', self::DATA_URL_UCD);

	if (empty($local_file))
		return false;

	foreach (file($local_file) as $line)
	{
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		if (!isset($joining_type_names[$fields[1]]))
			continue;

		$joining_type = $joining_type_names[$fields[1]];

		// For a range, the first character is used to look up the scripts.
		$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

		if (empty($this->char_data[$entity]['scripts']))
			continue;

		foreach ($this->char_data[$entity]['scripts'] as $char_script)
		{
			if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
				$this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $this->script_stats[$char_script];

			if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
				$this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();

			$this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
		}
	}

	// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
	// Lowest age first, then highest count first. The values are compared
	// rather than subtracted because the callback's result is cast to an
	// integer, which would turn a fractional difference into "equal".
	uasort($this->funcs['utf8_regex_joining_type']['data'], function ($a, $b)
	{
		if ($a['stats']['age'] == $b['stats']['age'])
		{
			if ($a['stats']['count'] == $b['stats']['count'])
				return 0;

			return $a['stats']['count'] > $b['stats']['count'] ? -1 : 1;
		}

		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
	});

	foreach ($this->funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
	{
		// The stats were only needed for sorting.
		unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);

		// If the only joining type in this script is transparent, we don't care about it.
		if (array_keys($joining_types) === array('Transparent'))
		{
			unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]);
			continue;
		}

		// Sort the stored lists themselves. Sorting the loop's copy would be discarded.
		foreach (array_keys($joining_types) as $joining_type)
			sort($this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]);
	}

	return true;
}
||
1866 | |||
/**
 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
 * Specifically, for Indic scripts like Devanagari.
 *
 * The resulting data is keyed by script, then by class name. Only scripts
 * that have a virama are kept. For each such script, the classes are
 * Virama, Vowel_Dependent (if any), All, Letter, Nonspacing_Mark, and
 * Nonspacing_Combining_Mark (where the script has such characters). Each
 * value is a list of characters and ranges.
 *
 * @return bool False if the data file could not be fetched, true otherwise.
 */
private function build_regex_indic()
{
	$local_file = $this->fetch_unicode_file('IndicSyllabicCategory.txt', self::DATA_URL_UCD);

	if (empty($local_file))
		return false;

	foreach (file($local_file) as $line)
	{
		$line = substr($line, 0, strcspn($line, '#'));

		if (strpos($line, ';') === false)
			continue;

		$fields = array_map('trim', explode(';', $line));

		$insc = $fields[1];

		if (!in_array($insc, array('Virama', 'Vowel_Dependent')))
			continue;

		// For a range, the first character is used to look up the scripts.
		$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';

		// Check before reading, since we might not know this character's scripts.
		if (empty($this->char_data[$entity]['scripts']))
			continue;

		foreach ($this->char_data[$entity]['scripts'] as $char_script)
		{
			if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats']))
				$this->funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $this->script_stats[$char_script];

			if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script][$insc]))
				$this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();

			$this->funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
		}
	}

	// Again, sort commonly used scripts before rare scripts.
	// Lowest age first, then highest count first. The values are compared
	// rather than subtracted because the callback's result is cast to an
	// integer, which would turn a fractional difference into "equal".
	uasort($this->funcs['utf8_regex_indic']['data'], function ($a, $b)
	{
		if ($a['stats']['age'] == $b['stats']['age'])
		{
			if ($a['stats']['count'] == $b['stats']['count'])
				return 0;

			return $a['stats']['count'] > $b['stats']['count'] ? -1 : 1;
		}

		return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
	});

	// We only want scripts with viramas.
	foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
	{
		// The stats were only needed for sorting.
		unset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats']);

		if (!isset($inscs['Virama']))
			unset($this->funcs['utf8_regex_indic']['data'][$char_script]);
	}

	// Now add some more classes that we need for each script.
	foreach ($this->char_data as $entity => $info)
	{
		if (empty($info['scripts']))
			continue;

		$ord = hexdec(trim($entity, '&#x;'));

		foreach ($info['scripts'] as $char_script)
		{
			if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]))
				continue;

			$this->funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;

			if (empty($info['General_Category']))
				continue;

			if ($info['General_Category'] == 'Mn')
			{
				$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;

				// Nonspacing marks with a non-zero combining class get their own class, too.
				if (!empty($this->funcs['utf8_combining_classes']['data'][$entity]))
					$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
			}
			elseif (substr($info['General_Category'], 0, 1) == 'L')
			{
				$this->funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
			}
		}
	}

	foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
	{
		foreach ($inscs as $insc => $value)
		{
			// These are already lists of characters and ranges, so they
			// only need sorting. Sort the stored list itself, since
			// sorting the loop's copy would be discarded.
			if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark')))
			{
				sort($this->funcs['utf8_regex_indic']['data'][$char_script][$insc]);
				continue;
			}

			// The rest are lists of code points that need to be collapsed into ranges.
			sort($value);

			$class_string = '';
			$first = null;
			$last = null;

			foreach ($value as $ord)
			{
				if (!isset($first))
					$first = $ord;

				// Still in the same run of consecutive code points?
				if (!isset($last) || $ord == $last + 1)
				{
					$last = $ord;
					continue;
				}

				// The run has ended, so write it out and start a new one.
				$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

				if ($first != $last)
					$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';

				$first = $ord;
				$last = $ord;
			}

			// Write out the final run.
			if (isset($first))
			{
				$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($first))) . '}';

				if ($first != $last)
					$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($last))) . '}';
			}

			// Split the string into its individual characters and ranges.
			$this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
		}

		ksort($this->funcs['utf8_regex_indic']['data'][$char_script]);
	}

	return true;
}
||
2041 | |||
2042 | /** |
||
2043 | * Builds maps and regex classes for IDNA purposes. |
||
2044 | */ |
||
2045 | private function build_idna() |
||
2114 | } |
||
2115 | } |
||
2116 | |||
2117 | ?> |
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable `$accountId` that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the `id` property of an instance of the `Account` class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.