These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * This program is free software; you can redistribute it and/or modify |
||
4 | * it under the terms of the GNU General Public License as published by |
||
5 | * the Free Software Foundation; either version 2 of the License, or |
||
6 | * (at your option) any later version. |
||
7 | * |
||
8 | * This program is distributed in the hope that it will be useful, |
||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
11 | * GNU General Public License for more details. |
||
12 | * |
||
13 | * You should have received a copy of the GNU General Public License along |
||
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
16 | * http://www.gnu.org/copyleft/gpl.html |
||
17 | * |
||
18 | * @file |
||
19 | */ |
||
20 | |||
21 | /** |
||
22 | * @since 1.16.3 |
||
23 | */ |
||
24 | class IcuCollation extends Collation { |
||
25 | const FIRST_LETTER_VERSION = 2; |
||
26 | |||
27 | /** @var Collator */ |
||
28 | private $primaryCollator; |
||
29 | |||
30 | /** @var Collator */ |
||
31 | private $mainCollator; |
||
32 | |||
33 | /** @var string */ |
||
34 | private $locale; |
||
35 | |||
36 | /** @var Language */ |
||
37 | protected $digitTransformLanguage; |
||
38 | |||
39 | /** @var boolean */ |
||
40 | private $useNumericCollation = false; |
||
41 | |||
42 | /** @var array */ |
||
43 | private $firstLetterData; |
||
44 | |||
45 | /** |
||
46 | * Unified CJK blocks. |
||
47 | * |
||
48 | * The same definition of a CJK block must be used for both Collation and |
||
49 | * generateCollationData.php. These blocks are omitted from the first |
||
50 | * letter data, as an optimisation measure and because the default UCA table |
||
51 | * is pretty useless for sorting Chinese text anyway. Japanese and Korean |
||
52 | * blocks are not included here, because they are smaller and more useful. |
||
53 | */ |
||
54 | private static $cjkBlocks = [ |
||
55 | [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement |
||
56 | [ 0x2F00, 0x2FDF ], // Kangxi Radicals |
||
57 | [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters |
||
58 | [ 0x3000, 0x303F ], // CJK Symbols and Punctuation |
||
59 | [ 0x31C0, 0x31EF ], // CJK Strokes |
||
60 | [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months |
||
61 | [ 0x3300, 0x33FF ], // CJK Compatibility |
||
62 | [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A |
||
63 | [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs |
||
64 | [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs |
||
65 | [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms |
||
66 | [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B |
||
67 | [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C |
||
68 | [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D |
||
69 | [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement |
||
70 | ]; |
||
71 | |||
72 | /** |
||
73 | * Additional characters (or character groups) to be considered separate |
||
74 | * letters for given languages, or to be removed from the list of such |
||
75 | * letters (denoted by keys starting with '-'). |
||
76 | * |
||
77 | * These are additions to (or subtractions from) the data stored in the |
||
78 | * first-letters-root.ser file (which among others includes full basic latin, |
||
79 | * cyrillic and greek alphabets). |
||
80 | * |
||
81 | * "Separate letter" is a letter that would have a separate heading/section |
||
82 | * for it in a dictionary or a phone book in this language. This data isn't |
||
83 | * used for sorting (the ICU library handles that), only for deciding which |
||
84 | * characters (or character groups) to use as headings. |
||
85 | * |
||
86 | * Initially generated based on the primary level of Unicode collation |
||
87 | * tailorings available at http://developer.mimer.com/charts/tailorings.htm , |
||
88 | * later modified. |
||
89 | * |
||
90 | * Empty arrays are intended; this signifies that the data for the language is |
||
91 | * available and that there are, in fact, no additional letters to consider. |
||
92 | */ |
||
93 | private static $tailoringFirstLetters = [ |
||
94 | // Verified by native speakers |
||
95 | 'be' => [ "Ё" ], |
||
96 | 'be-tarask' => [ "Ё" ], |
||
97 | 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
98 | 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], |
||
99 | 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], |
||
100 | 'en' => [], |
||
101 | 'fa' => [ |
||
102 | // RTL, let's put each letter on a new line |
||
103 | "آ", |
||
104 | "ء", |
||
105 | "ه", |
||
106 | "ا", |
||
107 | "و" |
||
108 | ], |
||
109 | 'fi' => [ "Å", "Ä", "Ö" ], |
||
110 | 'fr' => [], |
||
111 | 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
112 | 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], |
||
113 | 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], |
||
114 | 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], |
||
115 | 'it' => [], |
||
116 | 'lt' => [ "Č", "Š", "Ž" ], |
||
117 | 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], |
||
118 | 'mk' => [ "Ѓ", "Ќ" ], |
||
119 | 'nl' => [], |
||
120 | 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], |
||
121 | 'pt' => [], |
||
122 | 'ru' => [], |
||
123 | 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], |
||
124 | 'sr' => [], |
||
125 | 'sv' => [ "Å", "Ä", "Ö" ], |
||
126 | 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], |
||
127 | 'ta' => [ |
||
128 | "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", |
||
129 | "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", |
||
130 | "ஸ்", "ஹ்", "க்ஷ்" |
||
131 | ], |
||
132 | 'uk' => [ "Ґ", "Ь" ], |
||
133 | 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], |
||
134 | // Not verified, but likely correct |
||
135 | 'af' => [], |
||
136 | 'am' => [], |
||
137 | 'ar' => [], |
||
138 | 'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ], |
||
139 | 'ast' => [ "Ch", "Ll", "Ñ" ], // Not in libicu? |
||
140 | 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
141 | 'bg' => [], |
||
142 | 'bo' => [], |
||
143 | 'br' => [ "Ch", "C'h" ], |
||
144 | 'bs-Cyrl' => [], |
||
145 | 'ca' => [], |
||
146 | 'chr' => [], |
||
147 | 'co' => [], // Not in libicu? |
||
148 | 'da' => [ "Æ", "Ø", "Å" ], |
||
149 | 'de' => [], |
||
150 | 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ], |
||
151 | 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ], |
||
152 | 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ], |
||
153 | 'el' => [], |
||
154 | 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ], |
||
155 | 'es' => [ "Ñ" ], |
||
156 | 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et) |
||
157 | 'eu' => [ "Ñ" ], // Not in libicu? |
||
158 | 'fil' => [ "Ñ", "Ng" ], |
||
159 | 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ], |
||
160 | 'fr-CA' => [], // fr-CA sorts accents slightly different from fr. |
||
161 | 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu |
||
162 | 'fy' => [], // not in libicu |
||
163 | 'ga' => [], |
||
164 | 'gd' => [], // not in libicu |
||
165 | 'gl' => [ "Ch", "Ll", "Ñ" ], |
||
166 | 'gu' => [ "\xe0\xaa\x82", "\xe0\xaa\x83", "\xe0\xaa\x81", "\xe0\xaa\xb3" ], |
||
167 | 'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ], |
||
168 | 'haw' => [ 'ʻ' ], |
||
169 | 'he' => [], |
||
170 | 'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ], |
||
171 | 'hy' => [ "և" ], |
||
172 | 'id' => [], |
||
173 | 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ], |
||
174 | 'ka' => [], |
||
175 | 'km' => [ |
||
176 | "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86", |
||
177 | "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87", |
||
178 | "\xe1\x9e\xb7\xe1\x9f\x87", "\xe1\x9e\xbb\xe1\x9f\x87", |
||
179 | "\xe1\x9f\x81\xe1\x9f\x87", "\xe1\x9f\x84\xe1\x9f\x87", |
||
180 | ], |
||
181 | 'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ], |
||
182 | 'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ], |
||
183 | 'kk' => [ "Ү", "І" ], |
||
184 | 'kl' => [ "Æ", "Ø", "Å" ], |
||
185 | 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // ku is not in libicu |
||
186 | 'ky' => [ "Ё" ], |
||
187 | 'la' => [], // la is not in libicu |
||
188 | 'lb' => [], |
||
189 | 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ], |
||
190 | 'ln' => [ 'Ɛ' ], |
||
191 | 'lo' => [], |
||
192 | 'ml' => [], |
||
193 | 'mn' => [], |
||
194 | 'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ], |
||
195 | 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // no mo in libicu |
||
196 | 'ms' => [], |
||
197 | 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ], |
||
198 | 'nb' => [ "Æ", "Ø", "Å" ], |
||
199 | 'ne' => [], |
||
200 | 'nn' => [ "Æ", "Ø", "Å" ], |
||
201 | // no is not in the libicu list. You should probably use nb or nn instead. |
||
202 | 'no' => [ "Æ", "Ø", "Å" ], |
||
203 | 'oc' => [], // not in libicu |
||
204 | 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ], |
||
205 | 'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ], |
||
206 | 'pa' => [ "\xe0\xa9\x8d" ], |
||
207 | 'rm' => [], // not in libicu |
||
208 | 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], |
||
209 | 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu |
||
210 | 'sco' => [], |
||
211 | 'se' => [ |
||
212 | 'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ', |
||
213 | 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö' |
||
214 | ], |
||
215 | 'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ], |
||
216 | 'sl' => [ "Č", "Š", "Ž" ], |
||
217 | 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ], |
||
218 | 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ], |
||
219 | 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
220 | 'sw' => [], |
||
221 | 'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ], |
||
222 | 'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ], |
||
223 | 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ], |
||
224 | 'tl' => [ "Ñ", "Ng" ], // not in libicu |
||
225 | 'to' => [ "Ng", "ʻ" ], |
||
226 | 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
227 | 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu |
||
228 | 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu |
||
229 | 'vo' => [ "Ä", "Ö", "Ü" ], |
||
230 | 'yi' => [ |
||
231 | "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc", |
||
232 | "\xd7\xa9\xd7\x82", "\xd7\xaa\xd6\xbc" |
||
233 | ], |
||
234 | 'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ], |
||
235 | 'zu' => [], |
||
236 | ]; |
||
237 | |||
238 | /** |
||
239 | * @since 1.16.3 |
||
240 | */ |
||
241 | const RECORD_LENGTH = 14; |
||
242 | |||
/**
 * Set up the ICU collators for the requested locale.
 *
 * Two collators are created from the same locale: the main one with its
 * default strength, and a second one restricted to primary strength.
 * A trailing '-u-kn' on the locale switches both to numeric collation.
 *
 * @param string $locale ICU locale name; may contain an '@...' part and
 *  may end in the special '-u-kn' suffix
 * @throws MWException If the intl extension is not loaded or ICU does not
 *  return a collator for the locale
 */
public function __construct( $locale ) {
	if ( !extension_loaded( 'intl' ) ) {
		throw new MWException( 'An ICU collation was requested, ' .
			'but the intl extension is not available.' );
	}

	$this->locale = $locale;

	// The language used for digit transforms is named by whatever precedes
	// the '@' in the locale; the 'root' locale is mapped to English.
	list( $languageCode ) = explode( '@', $locale );
	if ( $locale === 'root' ) {
		$languageCode = 'en';
	}
	$this->digitTransformLanguage = Language::factory( $languageCode );

	$this->mainCollator = Collator::create( $locale );
	if ( !$this->mainCollator ) {
		throw new MWException( "Invalid ICU locale specified for collation: $locale" );
	}

	$primary = Collator::create( $locale );
	$primary->setStrength( Collator::PRIMARY );
	$this->primaryCollator = $primary;

	// Numeric collation is requested through a special locale suffix.
	$numericSuffix = '-u-kn';
	$suffixLength = strlen( $numericSuffix );
	if ( substr( $locale, -$suffixLength ) !== $numericSuffix ) {
		return;
	}

	$this->useNumericCollation = true;
	// Remove the suffix from the stored locale so that
	// fetchFirstLetterData() looks up the plain locale name.
	$this->locale = substr( $this->locale, 0, -$suffixLength );
	foreach ( [ $this->mainCollator, $this->primaryCollator ] as $collator ) {
		$collator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
	}
}
||
271 | |||
/**
 * Compute the sort key of a string with the main collator.
 *
 * @param string $string Text to build a sort key for
 * @return string Whatever Collator::getSortKey() returns for the text
 */
public function getSortKey( $string ) {
	$collator = $this->mainCollator;

	return $collator->getSortKey( $string );
}
||
275 | |||
/**
 * Compute the sort key of a string with the primary-strength collator.
 *
 * @param string $string Text to build a sort key for
 * @return string Whatever Collator::getSortKey() returns for the text
 */
public function getPrimarySortKey( $string ) {
	$collator = $this->primaryCollator;

	return $collator->getSortKey( $string );
}
||
279 | |||
/**
 * Work out which heading letter a string belongs under.
 *
 * @param string $string Text to classify (cast to string first)
 * @return string Empty string for empty input or for text that sorts
 *  before the first known letter; the first character itself when it is
 *  CJK; otherwise the matching letter from the first-letter data, or the
 *  'category-header-numerals' message text for digits when numeric
 *  collation is enabled
 */
public function getFirstLetter( $string ) {
	$string = strval( $string );
	if ( $string === '' ) {
		return '';
	}

	$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );

	// CJK characters are returned as-is. The byte test skips the code
	// point lookup for plain ASCII.
	$isMultibyte = ord( $firstChar ) > 0x7f;
	if ( $isMultibyte && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
		return $firstChar;
	}

	$primaryKey = $this->getPrimarySortKey( $string );

	// Binary search over the sorted letter keys for the letter to file
	// this string under.
	$letterIndex = ArrayUtils::findLowerBound(
		[ $this, 'getSortKeyByLetterIndex' ],
		$this->getFirstLetterCount(),
		'strcmp',
		$primaryKey
	);
	if ( $letterIndex === false ) {
		// Sorts before the first letter
		return '';
	}

	$letter = $this->getLetterByIndex( $letterIndex );
	if ( !$this->useNumericCollation ) {
		return $letter;
	}

	// With numeric collation all digits share one heading: '0–9' (or its
	// localized equivalent). ASCII '0' is 48 and '9' is 57. Non-Arabic
	// numerals are covered too, since they are mapped to Arabic numeral
	// sort letters; for example, ২ sorts as 2.
	$leadingByte = ord( $letter );
	if ( $leadingByte < 48 || $leadingByte > 57 ) {
		return $letter;
	}

	return wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
}
||
320 | |||
/**
 * Get the first-letter table, loading it on first use.
 *
 * The table is kept on the instance after the first call. Before that it
 * is read through the local server cache, keyed on the locale, the digit
 * transform language code, the ICU version and FIRST_LETTER_VERSION, and
 * regenerated with fetchFirstLetterData() on a cache miss.
 *
 * @since 1.16.3
 * @return array
 */
public function getFirstLetterData() {
	if ( $this->firstLetterData !== null ) {
		return $this->firstLetterData;
	}

	$cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
	$key = $cache->makeKey(
		'first-letters',
		$this->locale,
		$this->digitTransformLanguage->getCode(),
		self::getICUVersion(),
		self::FIRST_LETTER_VERSION
	);
	$regenerate = function () {
		return $this->fetchFirstLetterData();
	};
	$this->firstLetterData = $cache->getWithSetCallback( $key, $cache::TTL_WEEK, $regenerate );

	return $this->firstLetterData;
}
||
341 | |||
/**
 * Build the first-letter table for this collation from precompiled data.
 *
 * Locales listed in self::$tailoringFirstLetters start from the root
 * letter list and add (or remove) the tailored letters; every other locale
 * needs its own precompiled "first-letters-<locale>.ser" file. The letters
 * are then keyed by primary sort key, sorted, and stripped of entries whose
 * key merely extends the key of an earlier entry.
 *
 * @return array Two parallel lists, 'chars' (the letters) and 'keys'
 *  (their primary sort keys), both ordered by sort key
 * @throws MWException If the precompiled letter data cannot be loaded
 */
private function fetchFirstLetterData() {
	// Generate data from serialized data file
	if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
		$letters = wfGetPrecompiledData( 'first-letters-root.ser' );
		if ( $letters === false ) {
			// Fail loudly, as the per-locale branch below does. Otherwise
			// array_merge() would be handed a non-array and the resulting
			// table would end up in the cache.
			throw new MWException( 'MediaWiki could not load the precompiled ' .
				'first-letter data file "first-letters-root.ser"' );
		}
		// Append additional characters
		$letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
		// Remove unnecessary ones, if any
		if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
			$letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
		}
		// Apply digit transforms
		$digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
		$letters = array_diff( $letters, $digits );
		foreach ( $digits as $digit ) {
			$letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
		}
	} else {
		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
		if ( $letters === false ) {
			throw new MWException( "MediaWiki does not support ICU locale " .
				"\"{$this->locale}\"" );
		}
	}

	/* Sort the letters.
	 *
	 * It's impossible to have the precompiled data file properly sorted,
	 * because the sort order changes depending on ICU version. If the
	 * array is not properly sorted, the binary search will return random
	 * results.
	 *
	 * We also take this opportunity to remove primary collisions.
	 */
	$letterMap = [];
	foreach ( $letters as $letter ) {
		$key = $this->getPrimarySortKey( $letter );
		if ( isset( $letterMap[$key] ) ) {
			// Primary collision
			// Keep whichever one sorts first in the main collator
			if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
				$letterMap[$key] = $letter;
			}
		} else {
			$letterMap[$key] = $letter;
		}
	}
	ksort( $letterMap, SORT_STRING );

	/* Remove duplicate prefixes. Basically if something has a sortkey
	 * which is a prefix of some other sortkey, then it is an
	 * expansion and probably should not be considered a section
	 * header.
	 *
	 * For example 'þ' is sometimes sorted as if it is the letters
	 * 'th'. Other times it is its own primary element. Another
	 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
	 * is an 'R' followed by an 's'.
	 *
	 * Additionally an expanded element should always sort directly
	 * after its first element due to the way sortkeys work.
	 *
	 * UCA sortkey elements are of variable length but no collation
	 * element should be a prefix of some other element, so I think
	 * this is safe. See:
	 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
	 * - http://site.icu-project.org/design/collation/uca-weight-allocation
	 *
	 * Additionally, there is something called primary compression to
	 * worry about. Basically, if you have two primary elements that
	 * are more than one byte and both start with the same byte then
	 * the first byte is dropped on the second primary. Additionally
	 * either \x03 or \xFF may be added to mean that the next primary
	 * does not start with the first byte of the first primary.
	 *
	 * This shouldn't matter much, as the first primary is not
	 * changed, and that is what we are comparing against.
	 *
	 * tl;dr: This makes some assumptions about how icu implements
	 * collations. It seems incredibly unlikely these assumptions
	 * will change, but nonetheless they are assumptions.
	 */

	$prev = false;
	$duplicatePrefixes = [];
	foreach ( $letterMap as $key => $value ) {
		// Remove terminator byte. Otherwise the prefix
		// comparison will get hung up on that.
		$trimmedKey = rtrim( $key, "\0" );
		if ( $prev === false || $prev === '' ) {
			$prev = $trimmedKey;
			// We don't yet have a collation element
			// to compare against, so continue.
			continue;
		}

		// Due to the fact the array is sorted, we only have
		// to compare with the element directly previous
		// to the current element (skipping expansions).
		// An element "X" will always sort directly
		// before "XZ" (Unless we have "XY", but we
		// do not update $prev in that case).
		if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
			$duplicatePrefixes[] = $key;
			// If this is an expansion, we don't want to
			// compare the next element to this element,
			// but to what is currently $prev
			continue;
		}
		$prev = $trimmedKey;
	}
	foreach ( $duplicatePrefixes as $badKey ) {
		wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
		unset( $letterMap[$badKey] );
		// This code assumes that unsetting does not change sort order.
	}
	$data = [
		'chars' => array_values( $letterMap ),
		'keys' => array_keys( $letterMap ),
	];

	// Reduce memory usage before caching
	unset( $letterMap );

	return $data;
}
||
471 | |||
/**
 * Look up a letter in the first-letter table by position.
 *
 * @since 1.16.3
 * @param int $index Position within the 'chars' list
 * @return string The letter stored at that position
 */
public function getLetterByIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['chars'][$index];
}
||
478 | |||
/**
 * Look up a sort key in the first-letter table by position.
 *
 * @since 1.16.3
 * @param int $index Position within the 'keys' list
 * @return string The sort key stored at that position
 */
public function getSortKeyByLetterIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['keys'][$index];
}
||
485 | |||
/**
 * Count the letters in the first-letter table.
 *
 * @since 1.16.3
 * @return int Number of entries in the 'chars' list
 */
public function getFirstLetterCount() {
	$data = $this->getFirstLetterData();

	return count( $data['chars'] );
}
||
492 | |||
/**
 * Tell whether a code point lies inside one of the ranges listed in
 * self::$cjkBlocks (CJK: Chinese, Japanese, Korean).
 *
 * @since 1.16.3
 * @param int $codepoint Unicode code point to test
 * @return bool True if some block's inclusive range contains the code point
 */
public static function isCjk( $codepoint ) {
	foreach ( self::$cjkBlocks as $range ) {
		list( $first, $last ) = $range;
		if ( $codepoint < $first || $codepoint > $last ) {
			continue;
		}

		return true;
	}

	return false;
}
||
505 | |||
/**
 * Return the version of the ICU library used by PHP's intl extension,
 * or false when the extension is not installed or the version
 * can't be determined.
 *
 * The value comes from the INTL_ICU_VERSION constant, which isn't really
 * documented. It is available since PHP 5.3.7 (see PHP bug 54561), so
 * older PHPs get false.
 *
 * @since 1.21
 * @return string|bool
 */
public static function getICUVersion() {
	if ( !defined( 'INTL_ICU_VERSION' ) ) {
		return false;
	}

	return INTL_ICU_VERSION;
}
||
521 | |||
/**
 * Return the Unicode version that corresponds to the ICU library
 * currently in use, or false when it can't be determined.
 *
 * The lookup is keyed on the first three characters of the ICU version
 * string, so only the ICU releases listed in the table are recognised.
 *
 * @since 1.21
 * @return string|bool
 */
public static function getUnicodeVersionForICU() {
	$icuVersion = self::getICUVersion();
	if ( !$icuVersion ) {
		return false;
	}

	// Source: http://site.icu-project.org/download
	$unicodeByIcuPrefix = [
		'57.' => '8.0',
		'56.' => '8.0',
		'55.' => '7.0',
		'54.' => '7.0',
		'53.' => '6.3',
		'52.' => '6.3',
		'51.' => '6.2',
		'50.' => '6.2',
		'49.' => '6.1',
		'4.8' => '6.0',
		'4.6' => '6.0',
		'4.4' => '5.2',
		'4.2' => '5.1',
		'4.0' => '5.1',
		'3.8' => '5.0',
		'3.6' => '5.0',
		'3.4' => '4.1',
	];

	$prefix = substr( $icuVersion, 0, 3 );

	return isset( $unicodeByIcuPrefix[$prefix] ) ? $unicodeByIcuPrefix[$prefix] : false;
}
||
563 | } |
||
564 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.