This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include, or for example via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
0 ignored issues
–
show
|
|||
2 | /** |
||
3 | * Maintenance script to generate first letter data files for Collation.php. |
||
4 | * |
||
5 | * This program is free software; you can redistribute it and/or modify |
||
6 | * it under the terms of the GNU General Public License as published by |
||
7 | * the Free Software Foundation; either version 2 of the License, or |
||
8 | * (at your option) any later version. |
||
9 | * |
||
10 | * This program is distributed in the hope that it will be useful, |
||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
13 | * GNU General Public License for more details. |
||
14 | * |
||
15 | * You should have received a copy of the GNU General Public License along |
||
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
18 | * http://www.gnu.org/copyleft/gpl.html |
||
19 | * |
||
20 | * @file |
||
21 | * @ingroup MaintenanceLanguage |
||
22 | */ |
||
23 | |||
24 | require_once __DIR__ . '/../Maintenance.php'; |
||
25 | |||
/**
 * Generate first letter data files for Collation.php
 *
 * Reads allkeys.txt and ucd.all.grouped.xml from the --data-dir directory and
 * writes the serialized list of group header characters to
 * $IP/serialized/first-letters-root.ser.
 *
 * @ingroup MaintenanceLanguage
 */
class GenerateCollationData extends Maintenance {
	/** The directory with source data files in it */
	public $dataDir;

	/** The primary weights, indexed by codepoint */
	public $weights = [];

	/**
	 * A hashtable keyed by codepoint, where presence indicates that a character
	 * has a decomposition mapping. This makes it non-preferred for group header
	 * selection.
	 */
	public $mappedChars = [];

	/** File handle for the optional --debug-output file, or null when unused */
	public $debugOutFile;

	/** Lists of codepoints, keyed by their shared primary weight string */
	public $groups = [];

	/**
	 * Important tertiary weights from UTS #10 section 7.2
	 */
	const NORMAL_UPPERCASE = 0x08;
	const NORMAL_HIRAGANA = 0x0E;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
			false, true );
		$this->addOption( 'debug-output', 'Filename for sending debug output to',
			false, true );
	}

	/**
	 * Entry point: check that both data files exist, open the optional debug
	 * file, then load the UCD and generate the first-letter data.
	 *
	 * Terminates the process with status 1 when a data file is missing or the
	 * debug file cannot be opened.
	 */
	public function execute() {
		$this->dataDir = $this->getOption( 'data-dir', '.' );

		$allkeysPresent = file_exists( "{$this->dataDir}/allkeys.txt" );
		$ucdallPresent = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

		if ( !$allkeysPresent || !$ucdallPresent ) {
			$this->error( $this->describeMissingData( $allkeysPresent, $ucdallPresent ) );
			exit( 1 );
		}

		$debugOutFileName = $this->getOption( 'debug-output' );
		if ( $debugOutFileName ) {
			$this->debugOutFile = fopen( $debugOutFileName, 'w' );
			if ( !$this->debugOutFile ) {
				$this->error( "Unable to open debug output file for writing" );
				exit( 1 );
			}
		}

		$this->loadUcd();
		$this->generateFirstChars();

		// Release the debug handle once all output has been written.
		if ( $this->debugOutFile ) {
			fclose( $this->debugOutFile );
			$this->debugOutFile = null;
		}
	}

	/**
	 * Build the error text shown when one or both source data files are missing.
	 *
	 * The text names the missing file(s), describes the detected ICU version and
	 * lists the download URL for each missing file.
	 *
	 * @param bool $allkeysPresent Whether allkeys.txt was found
	 * @param bool $ucdallPresent Whether ucd.all.grouped.xml was found
	 * @return string
	 */
	private function describeMissingData( $allkeysPresent, $ucdallPresent ) {
		// As of January 2013, these links work for all versions of Unicode
		// between 5.1 and 6.2, inclusive.
		$allkeysURL = "http://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
		$ucdallURL = "http://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";

		$icuVersion = IcuCollation::getICUVersion();
		$unicodeVersion = IcuCollation::getUnicodeVersionForICU();
		// Explicit test instead of relying on loose truthiness of a false|string value.
		$hasUnicodeVersion = is_string( $unicodeVersion ) && $unicodeVersion !== '';
		$unicodeDescription = $hasUnicodeVersion
			? "Unicode $unicodeVersion"
			: "an unknown version of Unicode";

		$error = "";

		if ( !$allkeysPresent ) {
			$error .= "Unable to find allkeys.txt. "
				. "Download it and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "Unable to find ucd.all.grouped.xml. "
				. "Download it, unzip, and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}

		$versionKnown = false;
		if ( !$icuVersion ) {
			// Unknown version - either very old intl,
			// or PHP < 5.3.7 which does not expose this information
			$error .= "As MediaWiki could not determine the version of ICU library used by your PHP's "
				. "intl extension it can't suggest which file version to download. "
				. "This can be caused by running a very old version of intl or PHP < 5.3.7. "
				. "If you are sure everything is all right, find out the ICU version "
				. "by running phpinfo(), check what is the Unicode version it is using "
				. "at http://site.icu-project.org/download, then try finding appropriate data file(s) at:";
		} elseif ( version_compare( $icuVersion, "4.0", "<" ) ) {
			// Extra old version
			$error .= "You are using outdated version of ICU ($icuVersion), intended for "
				. $unicodeDescription
				. "; this file might not be avalaible for it, and it's not supported by MediaWiki. "
				. " You are on your own; consider upgrading PHP's intl extension or try "
				. "one of the files available at:";
		} elseif ( version_compare( $icuVersion, "51.0", ">=" ) ) {
			// Extra recent version
			$error .= "You are using ICU $icuVersion, released after this script was last updated. "
				. "Check what is the Unicode version it is using at http://site.icu-project.org/download . "
				. "It can't be guaranteed everything will work, but appropriate file(s) should "
				. "be available at:";
		} else {
			// ICU 4.0 to 50.x
			$versionKnown = true;
			$error .= "You are using ICU $icuVersion, intended for "
				. $unicodeDescription
				. ". Appropriate file(s) should be available at:";
		}
		$error .= "\n";

		// Only fill in the URL placeholder when the version can be trusted;
		// otherwise the literal "<Unicode version>" marker stays in the URL.
		if ( $versionKnown && $hasUnicodeVersion ) {
			$allkeysURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $allkeysURL );
			$ucdallURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $ucdallURL );
		}

		if ( !$allkeysPresent ) {
			$error .= "* $allkeysURL\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "* $ucdallURL\n";
		}

		return $error;
	}

	/**
	 * Stream ucd.all.grouped.xml and feed every character to charCallback().
	 */
	public function loadUcd() {
		$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
		$uxr->readChars( [ $this, 'charCallback' ] );
	}

	/**
	 * Record the implicit primary weight and decomposition status of one character.
	 *
	 * @param array $data Attributes of the character as passed by UcdXmlReader;
	 *   this method reads the 'gc', 'cp', 'UIdeo' and 'dm' keys and, when present, 'block'
	 */
	public function charCallback( $data ) {
		// Skip non-printable characters,
		// but do not skip a normal space (U+0020) since
		// people like to use that as a fake no header symbol.
		$category = substr( $data['gc'], 0, 1 );
		if ( strpos( 'LNPS', $category ) === false
			&& $data['cp'] !== '0020'
		) {
			return;
		}
		$cp = hexdec( $data['cp'] );

		// Skip the CJK ideograph blocks, as an optimisation measure.
		// UCA doesn't sort them properly anyway, without tailoring.
		if ( IcuCollation::isCjk( $cp ) ) {
			return;
		}

		// The reader omits 'block' for codepoints that fall outside every block.
		$block = isset( $data['block'] ) ? $data['block'] : null;

		// Skip the composed Hangul syllables, we will use the bare Jamo
		// as first letters
		if ( $block === 'Hangul Syllables' ) {
			return;
		}

		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
		if ( $data['UIdeo'] === 'Y' ) {
			if ( $block === 'CJK Unified Ideographs'
				|| $block === 'CJK Compatibility Ideographs'
			) {
				$base = 0xFB40;
			} else {
				$base = 0xFB80;
			}
		} else {
			$base = 0xFBC0;
		}
		$a = $base + ( $cp >> 15 );
		$b = ( $cp & 0x7fff ) | 0x8000;

		$this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );

		if ( $data['dm'] !== '#' ) {
			$this->mappedChars[$cp] = true;
		}

		// Progress indicator: one line per 4096 codepoints.
		if ( $cp % 4096 == 0 ) {
			print "{$data['cp']}\n";
		}
	}

	/**
	 * Combine the UCD weights with allkeys.txt, group characters by primary
	 * weight, pick one header character per group and write the serialized
	 * list to $IP/serialized/first-letters-root.ser.
	 *
	 * Terminates the process with status 1 when a file cannot be opened or the
	 * output cannot be written.
	 */
	public function generateFirstChars() {
		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
		if ( !$file ) {
			$this->error( "Unable to open allkeys.txt" );
			exit( 1 );
		}
		global $IP;
		$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
		if ( !$outFile ) {
			fclose( $file );
			$this->error( "Unable to open output file first-letters-root.ser" );
			exit( 1 );
		}

		// Tertiary weight strings that mark a character as a good header candidate.
		$goodTertiaries = [
			sprintf( '.%04X', self::NORMAL_UPPERCASE ),
			sprintf( '.%04X', self::NORMAL_HIRAGANA ),
		];
		$goodTertiaryChars = [];

		// For each character with an entry in allkeys.txt, overwrite the implicit
		// entry in $this->weights that came from the UCD.
		// Also gather a list of tertiary weights, for use in selecting the group header
		while ( false !== ( $line = fgets( $file ) ) ) {
			// We're only interested in single-character weights, pick them out with a regex
			$line = trim( $line );
			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
				continue;
			}

			$cp = hexdec( $m[1] );
			$allWeights = trim( $m[2] );
			$primary = '';
			$tertiary = '';

			if ( !isset( $this->weights[$cp] ) ) {
				// Non-printable, ignore
				continue;
			}
			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
				preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $fields );
				if ( empty( $fields[1] ) ) {
					continue;
				}
				if ( $fields[1][0] !== '0000' ) {
					$primary .= '.' . $fields[1][0];
				}
				// Guard against a collation element with fewer than three fields.
				if ( isset( $fields[1][2] ) && $fields[1][2] !== '0000' ) {
					$tertiary .= '.' . $fields[1][2];
				}
			}
			$this->weights[$cp] = $primary;
			if ( in_array( $tertiary, $goodTertiaries, true ) ) {
				$goodTertiaryChars[$cp] = true;
			}
		}
		fclose( $file );

		// Identify groups of characters with the same primary weight
		$this->groups = [];
		asort( $this->weights, SORT_STRING );
		$prevWeight = reset( $this->weights );
		$group = [];
		foreach ( $this->weights as $cp => $weight ) {
			if ( $weight !== $prevWeight ) {
				$this->groups[$prevWeight] = $group;
				$prevWeight = $weight;
				if ( isset( $this->groups[$weight] ) ) {
					$group = $this->groups[$weight];
				} else {
					$group = [];
				}
			}
			$group[] = $cp;
		}
		if ( $group ) {
			$this->groups[$prevWeight] = $group;
		}

		// If one character has a given primary weight sequence, and a second
		// character has a longer primary weight sequence with an initial
		// portion equal to the first character, then remove the second
		// character. This avoids having characters like U+A732 (double A)
		// polluting the basic latin sort area.

		foreach ( $this->groups as $weight => $group ) {
			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
				if ( isset( $this->groups[$m[1]] ) ) {
					unset( $this->groups[$weight] );
				}
			}
		}

		ksort( $this->groups, SORT_STRING );

		// Identify the header character in each group
		$headerChars = [];
		$prevChar = "\000";
		$tertiaryCollator = new Collator( 'root' );
		$primaryCollator = new Collator( 'root' );
		$primaryCollator->setStrength( Collator::PRIMARY );
		$numOutOfOrder = 0;
		foreach ( $this->groups as $weight => $group ) {
			$uncomposedChars = [];
			$goodChars = [];
			foreach ( $group as $cp ) {
				if ( isset( $goodTertiaryChars[$cp] ) ) {
					$goodChars[] = $cp;
				}
				if ( !isset( $this->mappedChars[$cp] ) ) {
					$uncomposedChars[] = $cp;
				}
			}
			// Preference order: good tertiary weight and no decomposition, then
			// any character without a decomposition, then the whole group.
			$x = array_intersect( $goodChars, $uncomposedChars );
			if ( !$x ) {
				$x = $uncomposedChars;
				if ( !$x ) {
					$x = $group;
				}
			}

			// Use ICU to pick the lowest sorting character in the selection
			// NOTE(review): $x holds integer codepoints rather than strings; confirm
			// that Collator::sort() orders them as intended here.
			$tertiaryCollator->sort( $x );
			$cp = $x[0];

			$char = UtfNormal\Utils::codepointToUtf8( $cp );
			$headerChars[] = $char;
			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
				$numOutOfOrder++;
			}
			$prevChar = $char;

			if ( $this->debugOutFile ) {
				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
					implode( ' ', array_map( 'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
			}
		}

		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

		$written = fwrite( $outFile, serialize( $headerChars ) );
		fclose( $outFile );
		if ( $written === false ) {
			$this->error( "Unable to write output file first-letters-root.ser" );
			exit( 1 );
		}
	}
}
||
349 | |||
/**
 * Streaming reader for a grouped UCD XML file (ucd.all.grouped.xml).
 *
 * readChars() invokes a callback once per code point with the merged
 * attributes of the <char> element and its enclosing <group>, plus a 'block'
 * entry when the code point lies inside one of the ranges from <blocks>.
 */
class UcdXmlReader {
	/** Path of the XML file to read */
	public $fileName;
	/** Callable invoked with the attribute array of each code point */
	public $callback;
	/** Attributes of the <group> element currently being read */
	public $groupAttrs = [];
	/** The XMLReader of the most recent open() call */
	public $xml;
	/** Block ranges keyed by block name, each a [ first, last ] pair */
	public $blocks = [];
	/** The [ first, last ] pair the internal pointer of $blocks is at, or false */
	public $currentBlock;

	/**
	 * @param string $fileName Path of the XML file to read
	 */
	public function __construct( $fileName ) {
		$this->fileName = $fileName;
	}

	/**
	 * Read every <char> element below <repertoire> and pass each code point
	 * to the callback.
	 *
	 * Block lookup only ever moves forward through $blocks, so it relies on
	 * code points arriving in ascending order.
	 *
	 * @param callable $callback Receives one attribute array per code point
	 */
	public function readChars( $callback ) {
		$this->getBlocks();
		$this->currentBlock = reset( $this->blocks );
		$xml = $this->open();
		$this->callback = $callback;
		$this->groupAttrs = [];

		// Skip sibling elements until <repertoire> is reached.
		while ( $xml->name !== 'repertoire' && $xml->next() );

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $xml->name === 'char' ) {
					$this->handleChar();
				}
			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = [];
				}
			}
		}
		$xml->close();
	}

	/**
	 * Open the file and position the reader on the first node after <ucd>.
	 *
	 * @return XMLReader
	 * @throws MWException If the file cannot be opened
	 */
	protected function open() {
		$this->xml = new XMLReader;
		// The reader object itself is always truthy, so failure must be
		// detected from the return value of open().
		if ( !$this->xml->open( $this->fileName ) ) {
			throw new MWException( __METHOD__ . ": unable to open {$this->fileName}" );
		}
		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
		$this->xml->read();

		return $this->xml;
	}

	/**
	 * Read the attributes of the current element node and return them
	 * as an array
	 * @return array
	 */
	protected function readAttributes() {
		$attrs = [];
		while ( $this->xml->moveToNextAttribute() ) {
			$attrs[$this->xml->name] = $this->xml->value;
		}

		return $attrs;
	}

	/**
	 * Expand the current <char> element into one callback call per code point.
	 *
	 * The element either carries 'cp' (one code point) or 'first-cp' and
	 * 'last-cp' (an inclusive range). Attributes of the element take
	 * precedence over those inherited from the enclosing group.
	 */
	protected function handleChar() {
		// Cast guards against a null $groupAttrs, for which array + null is fatal.
		$attrs = $this->readAttributes() + (array)$this->groupAttrs;
		if ( isset( $attrs['cp'] ) ) {
			$first = $last = hexdec( $attrs['cp'] );
		} else {
			$first = hexdec( $attrs['first-cp'] );
			$last = hexdec( $attrs['last-cp'] );
			unset( $attrs['first-cp'] );
			unset( $attrs['last-cp'] );
		}

		for ( $cp = $first; $cp <= $last; $cp++ ) {
			$hexCp = sprintf( "%04X", $cp );

			// Work on a per-code-point copy so that the '#' placeholder in the
			// name templates, and the block lookup, start fresh every time.
			$charAttrs = $attrs;
			foreach ( [ 'na', 'na1' ] as $nameProp ) {
				if ( isset( $charAttrs[$nameProp] ) ) {
					$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
				}
			}

			// Advance through the block list until the code point is inside
			// the current block or lies before it (a gap between blocks).
			while ( $this->currentBlock ) {
				if ( $cp < $this->currentBlock[0] ) {
					break;
				} elseif ( $cp <= $this->currentBlock[1] ) {
					$charAttrs['block'] = key( $this->blocks );
					break;
				} else {
					$this->currentBlock = next( $this->blocks );
				}
			}

			$charAttrs['cp'] = $hexCp;
			call_user_func( $this->callback, $charAttrs );
		}
	}

	/**
	 * Load the block ranges from the file, reusing the result of an earlier call.
	 *
	 * @return array Map of block name to [ first, last ] code point pair
	 */
	public function getBlocks() {
		if ( $this->blocks ) {
			return $this->blocks;
		}

		$xml = $this->open();
		while ( $xml->name !== 'blocks' && $xml->read() );

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT ) {
				if ( $xml->name === 'block' ) {
					$attrs = $this->readAttributes();
					$first = hexdec( $attrs['first-cp'] );
					$last = hexdec( $attrs['last-cp'] );
					$this->blocks[$attrs['name']] = [ $first, $last ];
				}
			}
		}
		$xml->close();

		return $this->blocks;
	}
}
||
470 | |||
// Name the maintenance class defined in this file, then include the runner.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably
// it is set up by Maintenance.php - confirm.
$maintClass = 'GenerateCollationData';
require_once RUN_MAINTENANCE_IF_MAIN;
||
473 |
The PSR-1: Basic Coding Standard recommends that a file should either introduce new symbols, that is classes, functions, constants or similar, or have side effects. Side effects are anything that executes logic, like for example printing output, changing ini settings or writing to a file.
The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.
To learn more about the PSR-1, please see the PHP-FIG site on the PSR-1.