This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * This program is free software; you can redistribute it and/or modify |
||
4 | * it under the terms of the GNU General Public License as published by |
||
5 | * the Free Software Foundation; either version 2 of the License, or |
||
6 | * (at your option) any later version. |
||
7 | * |
||
8 | * This program is distributed in the hope that it will be useful, |
||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
11 | * GNU General Public License for more details. |
||
12 | * |
||
13 | * You should have received a copy of the GNU General Public License along |
||
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
16 | * http://www.gnu.org/copyleft/gpl.html |
||
17 | * |
||
18 | * @file |
||
19 | */ |
||
20 | |||
21 | /** |
||
22 | * @since 1.16.3 |
||
23 | */ |
||
24 | class IcuCollation extends Collation { |
||
25 | const FIRST_LETTER_VERSION = 2; |
||
26 | |||
27 | /** @var Collator */ |
||
28 | private $primaryCollator; |
||
29 | |||
30 | /** @var Collator */ |
||
31 | private $mainCollator; |
||
32 | |||
33 | /** @var string */ |
||
34 | private $locale; |
||
35 | |||
36 | /** @var Language */ |
||
37 | protected $digitTransformLanguage; |
||
38 | |||
39 | /** @var boolean */ |
||
40 | private $useNumericCollation = false; |
||
41 | |||
42 | /** @var array */ |
||
43 | private $firstLetterData; |
||
44 | |||
45 | /** |
||
46 | * Unified CJK blocks. |
||
47 | * |
||
48 | * The same definition of a CJK block must be used for both Collation and |
||
49 | * generateCollationData.php. These blocks are omitted from the first |
||
50 | * letter data, as an optimisation measure and because the default UCA table |
||
51 | * is pretty useless for sorting Chinese text anyway. Japanese and Korean |
||
52 | * blocks are not included here, because they are smaller and more useful. |
||
53 | */ |
||
54 | private static $cjkBlocks = [ |
||
55 | [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement |
||
56 | [ 0x2F00, 0x2FDF ], // Kangxi Radicals |
||
57 | [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters |
||
58 | [ 0x3000, 0x303F ], // CJK Symbols and Punctuation |
||
59 | [ 0x31C0, 0x31EF ], // CJK Strokes |
||
60 | [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months |
||
61 | [ 0x3300, 0x33FF ], // CJK Compatibility |
||
62 | [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A |
||
63 | [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs |
||
64 | [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs |
||
65 | [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms |
||
66 | [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B |
||
67 | [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C |
||
68 | [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D |
||
69 | [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement |
||
70 | ]; |
||
71 | |||
72 | /** |
||
73 | * Additional characters (or character groups) to be considered separate |
||
74 | * letters for given languages, or to be removed from the list of such |
||
75 | * letters (denoted by keys starting with '-'). |
||
76 | * |
||
77 | * These are additions to (or subtractions from) the data stored in the |
||
78 | * first-letters-root.ser file (which among others includes full basic latin, |
||
79 | * cyrillic and greek alphabets). |
||
80 | * |
||
81 | * "Separate letter" is a letter that would have a separate heading/section |
||
82 | * for it in a dictionary or a phone book in this language. This data isn't |
||
83 | * used for sorting (the ICU library handles that), only for deciding which |
||
84 | * characters (or character groups) to use as headings. |
||
85 | * |
||
86 | * Initially generated based on the primary level of Unicode collation |
||
87 | * tailorings available at http://developer.mimer.com/charts/tailorings.htm , |
||
88 | * later modified. |
||
89 | * |
||
90 | * Empty arrays are intended; this signifies that the data for the language is |
||
91 | * available and that there are, in fact, no additional letters to consider. |
||
92 | */ |
||
93 | private static $tailoringFirstLetters = [ |
||
94 | 'af' => [], |
||
95 | 'am' => [], |
||
96 | 'ar' => [], |
||
97 | 'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ], |
||
98 | 'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu |
||
99 | 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
100 | 'be' => [ "Ё" ], |
||
101 | 'be-tarask' => [ "Ё" ], |
||
102 | 'bg' => [], |
||
103 | 'bo' => [], |
||
104 | 'br' => [ "Ch", "C'h" ], |
||
105 | 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
106 | 'bs-Cyrl' => [], |
||
107 | 'ca' => [], |
||
108 | 'chr' => [], |
||
109 | 'co' => [], // not in libicu |
||
110 | 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], |
||
111 | 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], |
||
112 | 'da' => [ "Æ", "Ø", "Å" ], |
||
113 | 'de' => [], |
||
114 | 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ], |
||
115 | 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ], |
||
116 | 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ], |
||
117 | 'el' => [], |
||
118 | 'en' => [], |
||
119 | 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ], |
||
120 | 'es' => [ "Ñ" ], |
||
121 | 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et) |
||
122 | 'eu' => [ "Ñ" ], // not in libicu |
||
123 | 'fa' => [ |
||
124 | // RTL, let's put each letter on a new line |
||
125 | "آ", |
||
126 | "ء", |
||
127 | "ه", |
||
128 | "ا", |
||
129 | "و" |
||
130 | ], |
||
131 | 'fi' => [ "Å", "Ä", "Ö" ], |
||
132 | 'fil' => [ "Ñ", "Ng" ], |
||
133 | 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ], |
||
134 | 'fr' => [], |
||
135 | 'fr-CA' => [], // fr-CA sorts accents slightly different from fr. |
||
136 | 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu |
||
137 | 'fy' => [], // not in libicu |
||
138 | 'ga' => [], |
||
139 | 'gd' => [], // not in libicu |
||
140 | 'gl' => [ "Ch", "Ll", "Ñ" ], |
||
141 | 'gu' => [ "\xe0\xaa\x82", "\xe0\xaa\x83", "\xe0\xaa\x81", "\xe0\xaa\xb3" ], |
||
142 | 'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ], |
||
143 | 'haw' => [ 'ʻ' ], |
||
144 | 'he' => [], |
||
145 | 'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ], |
||
146 | 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
147 | 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], |
||
148 | 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], |
||
149 | 'hy' => [ "և" ], |
||
150 | 'id' => [], |
||
151 | 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ], |
||
152 | 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], |
||
153 | 'it' => [], |
||
154 | 'ka' => [], |
||
155 | 'kk' => [ "Ү", "І" ], |
||
156 | 'kl' => [ "Æ", "Ø", "Å" ], |
||
157 | 'km' => [ |
||
158 | "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86", |
||
159 | "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87", |
||
160 | "\xe1\x9e\xb7\xe1\x9f\x87", "\xe1\x9e\xbb\xe1\x9f\x87", |
||
161 | "\xe1\x9f\x81\xe1\x9f\x87", "\xe1\x9f\x84\xe1\x9f\x87", |
||
162 | ], |
||
163 | 'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ], |
||
164 | 'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ], |
||
165 | 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu |
||
166 | 'ky' => [ "Ё" ], |
||
167 | 'la' => [], // not in libicu |
||
168 | 'lb' => [], |
||
169 | 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ], |
||
170 | 'ln' => [ 'Ɛ' ], |
||
171 | 'lo' => [], |
||
172 | 'lt' => [ "Č", "Š", "Ž" ], |
||
173 | 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], |
||
174 | 'mk' => [ "Ѓ", "Ќ" ], |
||
175 | 'ml' => [], |
||
176 | 'mn' => [], |
||
177 | 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // not in libicu |
||
178 | 'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ], |
||
179 | 'ms' => [], |
||
180 | 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ], |
||
181 | 'nb' => [ "Æ", "Ø", "Å" ], |
||
182 | 'ne' => [], |
||
183 | 'nl' => [], |
||
184 | 'nn' => [ "Æ", "Ø", "Å" ], |
||
185 | 'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead. |
||
186 | 'oc' => [], // not in libicu |
||
187 | 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ], |
||
188 | 'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ], |
||
189 | 'pa' => [ "\xe0\xa9\x8d" ], |
||
190 | 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], |
||
191 | 'pt' => [], |
||
192 | 'rm' => [], // not in libicu |
||
193 | 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], |
||
194 | 'ru' => [], |
||
195 | 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu |
||
196 | 'sco' => [], |
||
197 | 'se' => [ |
||
198 | 'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ', |
||
199 | 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö' |
||
200 | ], |
||
201 | 'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ], |
||
202 | 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], |
||
203 | 'sl' => [ "Č", "Š", "Ž" ], |
||
204 | 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ], |
||
205 | 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ], |
||
206 | 'sr' => [], |
||
207 | 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
208 | 'sv' => [ "Å", "Ä", "Ö" ], |
||
209 | 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], |
||
210 | 'sw' => [], |
||
211 | 'ta' => [ |
||
212 | "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", |
||
213 | "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", |
||
214 | "ஸ்", "ஹ்", "க்ஷ்" |
||
215 | ], |
||
216 | 'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ], |
||
217 | 'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ], |
||
218 | 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ], |
||
219 | 'tl' => [ "Ñ", "Ng" ], // not in libicu |
||
220 | 'to' => [ "Ng", "ʻ" ], |
||
221 | 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
222 | 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu |
||
223 | 'uk' => [ "Ґ", "Ь" ], |
||
224 | 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu |
||
225 | 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], |
||
226 | 'vo' => [ "Ä", "Ö", "Ü" ], |
||
227 | 'yi' => [ |
||
228 | "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc", |
||
229 | "\xd7\xa9\xd7\x82", "\xd7\xaa\xd6\xbc" |
||
230 | ], |
||
231 | 'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ], |
||
232 | 'zu' => [], |
||
233 | ]; |
||
234 | |||
235 | /** |
||
236 | * @since 1.16.3 |
||
237 | */ |
||
238 | const RECORD_LENGTH = 14; |
||
239 | |||
/**
 * Create an ICU-backed collation for the given locale.
 *
 * The full locale string is handed to ICU's Collator as-is. For MediaWiki's
 * own lookups (first-letter data, digit transform language) the special
 * "-u-kn" suffix, which switches on numeric collation, is stripped first.
 *
 * @param string $locale ICU locale, e.g. "sv", "de-AT@collation=phonebook",
 *   "root", or any of these followed by "-u-kn"
 * @throws MWException If the intl extension is not loaded or ICU cannot
 *   create a collator for the locale
 */
public function __construct( $locale ) {
	if ( !extension_loaded( 'intl' ) ) {
		throw new MWException( 'An ICU collation was requested, ' .
			'but the intl extension is not available.' );
	}

	// If the special suffix for numeric collation is present, remember that and
	// strip it off. This must happen before the digit transform language is
	// looked up: otherwise "xx-u-kn" would be treated as a language code, and
	// "root-u-kn" would not be recognised as the root locale.
	$this->useNumericCollation = substr( $locale, -5, 5 ) === '-u-kn';
	$this->locale = $this->useNumericCollation ? substr( $locale, 0, -5 ) : $locale;

	// Drop everything after the '@' in locale's name
	$localeParts = explode( '@', $this->locale );
	$this->digitTransformLanguage = Language::factory(
		$this->locale === 'root' ? 'en' : $localeParts[0]
	);

	$this->mainCollator = Collator::create( $locale );
	if ( !$this->mainCollator ) {
		throw new MWException( "Invalid ICU locale specified for collation: $locale" );
	}

	$this->primaryCollator = Collator::create( $locale );
	$this->primaryCollator->setStrength( Collator::PRIMARY );

	if ( $this->useNumericCollation ) {
		$this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
		$this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
	}
}
||
268 | |||
/**
 * Compute the sort key of a string using the main collator.
 *
 * @param string $string Text to build a key for
 * @return string|bool Whatever Collator::getSortKey() returns for the text
 */
public function getSortKey( $string ) {
	$collator = $this->mainCollator;

	return $collator->getSortKey( $string );
}
||
272 | |||
/**
 * Compute the sort key of a string using the primary-strength collator.
 *
 * @param string $string Text to build a key for
 * @return string|bool Whatever Collator::getSortKey() returns for the text
 */
public function getPrimarySortKey( $string ) {
	$collator = $this->primaryCollator;

	return $collator->getSortKey( $string );
}
||
276 | |||
/**
 * Determine the heading ("first letter") a string should be listed under.
 *
 * @param string $string Text to classify
 * @return string The heading; an empty string for empty input or for text
 *   that sorts before every known first letter
 */
public function getFirstLetter( $string ) {
	$text = strval( $string );
	if ( $text === '' ) {
		return '';
	}

	// A leading CJK character is used as the heading directly.
	$initial = mb_substr( $text, 0, 1, 'UTF-8' );
	if ( ord( $initial ) > 0x7f ) {
		if ( self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $initial ) ) ) {
			return $initial;
		}
	}

	// Binary search over the sorted first-letter keys for the letter
	// this text sorts under.
	$primaryKey = $this->getPrimarySortKey( $text );
	$index = ArrayUtils::findLowerBound(
		[ $this, 'getSortKeyByLetterIndex' ],
		$this->getFirstLetterCount(),
		'strcmp',
		$primaryKey
	);
	if ( $index === false ) {
		// The text sorts before the first letter
		return '';
	}

	$heading = $this->getLetterByIndex( $index );
	if ( !$this->useNumericCollation ) {
		return $heading;
	}

	// With numeric collation, every digit heading is replaced by '0–9' (or its
	// localized equivalent). ASCII value of 0 is 48, ASCII value of 9 is 57.
	// Note that this also applies to non-Arabic numerals since they are
	// mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
	$leadByte = ord( $heading );
	if ( $leadByte >= 48 && $leadByte <= 57 ) {
		return wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
	}

	return $heading;
}
||
317 | |||
/**
 * Get the first-letter data, loading it on first use.
 *
 * The data is kept on the object after the first call. It is obtained
 * through the local server cache, falling back to fetchFirstLetterData().
 *
 * @since 1.16.3
 * @return array
 */
public function getFirstLetterData() {
	if ( $this->firstLetterData !== null ) {
		return $this->firstLetterData;
	}

	$cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
	// The key varies on everything the generated data depends on.
	$key = $cache->makeKey(
		'first-letters',
		$this->locale,
		$this->digitTransformLanguage->getCode(),
		self::getICUVersion(),
		self::FIRST_LETTER_VERSION
	);
	$loader = function () {
		return $this->fetchFirstLetterData();
	};
	$this->firstLetterData = $cache->getWithSetCallback( $key, $cache::TTL_WEEK, $loader );

	return $this->firstLetterData;
}
||
338 | |||
/**
 * Build the first-letter data for this collation's locale from the
 * precompiled data files.
 *
 * For locales listed in self::$tailoringFirstLetters the root letter list is
 * used, extended by the locale's additional letters, reduced by any letters
 * registered under the '-<locale>' key, and with the ASCII digits replaced by
 * the digits of the digit transform language. Any other locale needs its own
 * first-letters-<locale>.ser file.
 *
 * The letters are then ordered by primary sort key, primary collisions are
 * collapsed, and letters whose key merely extends the key of the previously
 * kept letter (expansions) are dropped.
 *
 * @return array Map with two parallel lists: 'chars' (the letters) and
 *   'keys' (their primary sort keys), both in sort key order
 * @throws MWException If the required precompiled data cannot be loaded
 */
private function fetchFirstLetterData() {
	// Generate data from serialized data file
	if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
		$letters = wfGetPrecompiledData( 'first-letters-root.ser' );
		if ( $letters === false ) {
			// Fail the same way the per-locale branch below does, instead of
			// passing false on to array_merge().
			throw new MWException( "MediaWiki could not load the precompiled " .
				"first-letter data file \"first-letters-root.ser\"" );
		}
		// Append additional characters
		$letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
		// Remove unnecessary ones, if any
		if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
			$letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
		}
		// Apply digit transforms
		$digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
		$letters = array_diff( $letters, $digits );
		foreach ( $digits as $digit ) {
			$letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
		}
	} else {
		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
		if ( $letters === false ) {
			throw new MWException( "MediaWiki does not support ICU locale " .
				"\"{$this->locale}\"" );
		}
	}

	/* Sort the letters.
	 *
	 * It's impossible to have the precompiled data file properly sorted,
	 * because the sort order changes depending on ICU version. If the
	 * array is not properly sorted, the binary search will return random
	 * results.
	 *
	 * We also take this opportunity to remove primary collisions.
	 */
	$letterMap = [];
	foreach ( $letters as $letter ) {
		$key = $this->getPrimarySortKey( $letter );
		if ( isset( $letterMap[$key] ) ) {
			// Primary collision
			// Keep whichever one sorts first in the main collator
			if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
				$letterMap[$key] = $letter;
			}
		} else {
			$letterMap[$key] = $letter;
		}
	}
	ksort( $letterMap, SORT_STRING );

	/* Remove duplicate prefixes. Basically if something has a sortkey
	 * which is a prefix of some other sortkey, then it is an
	 * expansion and probably should not be considered a section
	 * header.
	 *
	 * For example 'þ' is sometimes sorted as if it is the letters
	 * 'th'. Other times it is its own primary element. Another
	 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
	 * is an 'R' followed by an 's'.
	 *
	 * Additionally an expanded element should always sort directly
	 * after its first element due to the way sortkeys work.
	 *
	 * UCA sortkey elements are of variable length but no collation
	 * element should be a prefix of some other element, so I think
	 * this is safe. See:
	 *  - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
	 *  - http://site.icu-project.org/design/collation/uca-weight-allocation
	 *
	 * Additionally, there is something called primary compression to
	 * worry about. Basically, if you have two primary elements that
	 * are more than one byte and both start with the same byte then
	 * the first byte is dropped on the second primary. Additionally
	 * either \x03 or \xFF may be added to mean that the next primary
	 * does not start with the first byte of the first primary.
	 *
	 * This shouldn't matter much, as the first primary is not
	 * changed, and that is what we are comparing against.
	 *
	 * tl;dr: This makes some assumptions about how icu implements
	 * collations. It seems incredibly unlikely these assumptions
	 * will change, but nonetheless they are assumptions.
	 */

	$prev = false;
	$duplicatePrefixes = [];
	foreach ( $letterMap as $key => $value ) {
		// Remove terminator byte. Otherwise the prefix
		// comparison will get hung up on that.
		$trimmedKey = rtrim( $key, "\0" );
		if ( $prev === false || $prev === '' ) {
			$prev = $trimmedKey;
			// We don't yet have a collation element
			// to compare against, so continue.
			continue;
		}

		// Due to the fact the array is sorted, we only have
		// to compare with the element directly previous
		// to the current element (skipping expansions).
		// An element "X" will always sort directly
		// before "XZ" (Unless we have "XY", but we
		// do not update $prev in that case).
		if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
			$duplicatePrefixes[] = $key;
			// If this is an expansion, we don't want to
			// compare the next element to this element,
			// but to what is currently $prev
			continue;
		}
		$prev = $trimmedKey;
	}
	foreach ( $duplicatePrefixes as $badKey ) {
		wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
		unset( $letterMap[$badKey] );
		// This code assumes that unsetting does not change sort order.
	}
	$data = [
		'chars' => array_values( $letterMap ),
		'keys' => array_keys( $letterMap ),
	];

	// Reduce memory usage before caching
	unset( $letterMap );

	return $data;
}
||
468 | |||
/**
 * Look up a first letter by its position in the first-letter data.
 *
 * @since 1.16.3
 * @param int $index Offset into the 'chars' list of the first-letter data
 * @return string
 */
public function getLetterByIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['chars'][$index];
}
||
475 | |||
/**
 * Look up the sort key of a first letter by its position in the
 * first-letter data.
 *
 * @since 1.16.3
 * @param int $index Offset into the 'keys' list of the first-letter data
 * @return string
 */
public function getSortKeyByLetterIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['keys'][$index];
}
||
482 | |||
/**
 * Count the first letters available in the first-letter data.
 *
 * @since 1.16.3
 * @return int
 */
public function getFirstLetterCount() {
	$data = $this->getFirstLetterData();

	return count( $data['chars'] );
}
||
489 | |||
/**
 * Test if a code point is a CJK (Chinese, Japanese, Korean) character,
 * i.e. whether it lies inside one of the ranges in self::$cjkBlocks.
 *
 * @since 1.16.3
 * @param int $codepoint Unicode code point to test
 * @return bool
 */
public static function isCjk( $codepoint ) {
	foreach ( self::$cjkBlocks as $range ) {
		$lowest = $range[0];
		$highest = $range[1];
		if ( $codepoint >= $lowest ) {
			if ( $codepoint <= $highest ) {
				return true;
			}
		}
	}

	return false;
}
||
502 | |||
/**
 * Return the version of ICU library used by PHP's intl extension,
 * or false when the extension is not installed or the version
 * can't be determined.
 *
 * The constant INTL_ICU_VERSION this function refers to isn't really
 * documented. It is available since PHP 5.3.7 (see PHP bug 54561).
 * This function will return false on older PHPs.
 *
 * @since 1.21
 * @return string|bool
 */
public static function getICUVersion() {
	return defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : false;
}
||
518 | |||
/**
 * Return the version of Unicode appropriate for the version of ICU library
 * currently in use, or false when it can't be determined.
 *
 * The lookup uses the first three characters of the ICU version string, so
 * ICU releases that are not listed in the table below yield false.
 *
 * @since 1.21
 * @return string|bool
 */
public static function getUnicodeVersionForICU() {
	$icuVersion = self::getICUVersion();
	if ( !$icuVersion ) {
		return false;
	}

	$versionPrefix = substr( $icuVersion, 0, 3 );
	// Source: http://site.icu-project.org/download
	// Newer ICU releases need to be appended here as they are verified.
	$map = [
		'63.' => '11.0',
		'62.' => '11.0',
		'61.' => '10.0',
		'60.' => '10.0',
		'59.' => '9.0',
		'58.' => '9.0',
		'57.' => '8.0',
		'56.' => '8.0',
		'55.' => '7.0',
		'54.' => '7.0',
		'53.' => '6.3',
		'52.' => '6.3',
		'51.' => '6.2',
		'50.' => '6.2',
		'49.' => '6.1',
		'4.8' => '6.0',
		'4.6' => '6.0',
		'4.4' => '5.2',
		'4.2' => '5.1',
		'4.0' => '5.1',
		'3.8' => '5.0',
		'3.6' => '5.0',
		'3.4' => '4.1',
	];

	return isset( $map[$versionPrefix] ) ? $map[$versionPrefix] : false;
}
||
560 | } |
||
561 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error, or the assigned type should be added to the documentation/type hint for that property.