wikimedia /
mediawiki
This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
| 1 | <?php |
||
| 2 | /** |
||
| 3 | * This program is free software; you can redistribute it and/or modify |
||
| 4 | * it under the terms of the GNU General Public License as published by |
||
| 5 | * the Free Software Foundation; either version 2 of the License, or |
||
| 6 | * (at your option) any later version. |
||
| 7 | * |
||
| 8 | * This program is distributed in the hope that it will be useful, |
||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 11 | * GNU General Public License for more details. |
||
| 12 | * |
||
| 13 | * You should have received a copy of the GNU General Public License along |
||
| 14 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
| 15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
| 16 | * http://www.gnu.org/copyleft/gpl.html |
||
| 17 | * |
||
| 18 | * @file |
||
| 19 | */ |
||
| 20 | |||
| 21 | /** |
||
| 22 | * @since 1.16.3 |
||
| 23 | */ |
||
| 24 | class IcuCollation extends Collation { |
||
| 25 | const FIRST_LETTER_VERSION = 2; |
||
| 26 | |||
| 27 | /** @var Collator */ |
||
| 28 | private $primaryCollator; |
||
| 29 | |||
| 30 | /** @var Collator */ |
||
| 31 | private $mainCollator; |
||
| 32 | |||
| 33 | /** @var string */ |
||
| 34 | private $locale; |
||
| 35 | |||
| 36 | /** @var Language */ |
||
| 37 | protected $digitTransformLanguage; |
||
| 38 | |||
| 39 | /** @var boolean */ |
||
| 40 | private $useNumericCollation = false; |
||
| 41 | |||
| 42 | /** @var array */ |
||
| 43 | private $firstLetterData; |
||
| 44 | |||
| 45 | /** |
||
| 46 | * Unified CJK blocks. |
||
| 47 | * |
||
| 48 | * The same definition of a CJK block must be used for both Collation and |
||
| 49 | * generateCollationData.php. These blocks are omitted from the first |
||
| 50 | * letter data, as an optimisation measure and because the default UCA table |
||
| 51 | * is pretty useless for sorting Chinese text anyway. Japanese and Korean |
||
| 52 | * blocks are not included here, because they are smaller and more useful. |
||
| 53 | */ |
||
| 54 | private static $cjkBlocks = [ |
||
| 55 | [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement |
||
| 56 | [ 0x2F00, 0x2FDF ], // Kangxi Radicals |
||
| 57 | [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters |
||
| 58 | [ 0x3000, 0x303F ], // CJK Symbols and Punctuation |
||
| 59 | [ 0x31C0, 0x31EF ], // CJK Strokes |
||
| 60 | [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months |
||
| 61 | [ 0x3300, 0x33FF ], // CJK Compatibility |
||
| 62 | [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A |
||
| 63 | [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs |
||
| 64 | [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs |
||
| 65 | [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms |
||
| 66 | [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B |
||
| 67 | [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C |
||
| 68 | [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D |
||
| 69 | [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement |
||
| 70 | ]; |
||
| 71 | |||
| 72 | /** |
||
| 73 | * Additional characters (or character groups) to be considered separate |
||
| 74 | * letters for given languages, or to be removed from the list of such |
||
| 75 | * letters (denoted by keys starting with '-'). |
||
| 76 | * |
||
| 77 | * These are additions to (or subtractions from) the data stored in the |
||
| 78 | * first-letters-root.ser file (which among others includes full basic latin, |
||
| 79 | * cyrillic and greek alphabets). |
||
| 80 | * |
||
| 81 | * "Separate letter" is a letter that would have a separate heading/section |
||
| 82 | * for it in a dictionary or a phone book in this language. This data isn't |
||
| 83 | * used for sorting (the ICU library handles that), only for deciding which |
||
| 84 | * characters (or character groups) to use as headings. |
||
| 85 | * |
||
| 86 | * Initially generated based on the primary level of Unicode collation |
||
| 87 | * tailorings available at http://developer.mimer.com/charts/tailorings.htm , |
||
| 88 | * later modified. |
||
| 89 | * |
||
| 90 | * Empty arrays are intended; this signifies that the data for the language is |
||
| 91 | * available and that there are, in fact, no additional letters to consider. |
||
| 92 | */ |
||
| 93 | private static $tailoringFirstLetters = [ |
||
| 94 | 'af' => [], |
||
| 95 | 'am' => [], |
||
| 96 | 'ar' => [], |
||
| 97 | 'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ], |
||
| 98 | 'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu |
||
| 99 | 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
| 100 | 'be' => [ "Ё" ], |
||
| 101 | 'be-tarask' => [ "Ё" ], |
||
| 102 | 'bg' => [], |
||
| 103 | 'bo' => [], |
||
| 104 | 'br' => [ "Ch", "C'h" ], |
||
| 105 | 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 106 | 'bs-Cyrl' => [], |
||
| 107 | 'ca' => [], |
||
| 108 | 'chr' => [], |
||
| 109 | 'co' => [], // not in libicu |
||
| 110 | 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], |
||
| 111 | 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], |
||
| 112 | 'da' => [ "Æ", "Ø", "Å" ], |
||
| 113 | 'de' => [], |
||
| 114 | 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ], |
||
| 115 | 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ], |
||
| 116 | 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ], |
||
| 117 | 'el' => [], |
||
| 118 | 'en' => [], |
||
| 119 | 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ], |
||
| 120 | 'es' => [ "Ñ" ], |
||
| 121 | 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et) |
||
| 122 | 'eu' => [ "Ñ" ], // not in libicu |
||
| 123 | 'fa' => [ |
||
| 124 | // RTL, let's put each letter on a new line |
||
| 125 | "آ", |
||
| 126 | "ء", |
||
| 127 | "ه", |
||
| 128 | "ا", |
||
| 129 | "و" |
||
| 130 | ], |
||
| 131 | 'fi' => [ "Å", "Ä", "Ö" ], |
||
| 132 | 'fil' => [ "Ñ", "Ng" ], |
||
| 133 | 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ], |
||
| 134 | 'fr' => [], |
||
| 135 | 'fr-CA' => [], // fr-CA sorts accents slightly different from fr. |
||
| 136 | 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu |
||
| 137 | 'fy' => [], // not in libicu |
||
| 138 | 'ga' => [], |
||
| 139 | 'gd' => [], // not in libicu |
||
| 140 | 'gl' => [ "Ch", "Ll", "Ñ" ], |
||
| 141 | 'gu' => [ "\xe0\xaa\x82", "\xe0\xaa\x83", "\xe0\xaa\x81", "\xe0\xaa\xb3" ], |
||
| 142 | 'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ], |
||
| 143 | 'haw' => [ 'ʻ' ], |
||
| 144 | 'he' => [], |
||
| 145 | 'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ], |
||
| 146 | 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 147 | 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], |
||
| 148 | 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], |
||
| 149 | 'hy' => [ "և" ], |
||
| 150 | 'id' => [], |
||
| 151 | 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ], |
||
| 152 | 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], |
||
| 153 | 'it' => [], |
||
| 154 | 'ka' => [], |
||
| 155 | 'kk' => [ "Ү", "І" ], |
||
| 156 | 'kl' => [ "Æ", "Ø", "Å" ], |
||
| 157 | 'km' => [ |
||
| 158 | "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86", |
||
| 159 | "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87", |
||
| 160 | "\xe1\x9e\xb7\xe1\x9f\x87", "\xe1\x9e\xbb\xe1\x9f\x87", |
||
| 161 | "\xe1\x9f\x81\xe1\x9f\x87", "\xe1\x9f\x84\xe1\x9f\x87", |
||
| 162 | ], |
||
| 163 | 'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ], |
||
| 164 | 'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ], |
||
| 165 | 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu |
||
| 166 | 'ky' => [ "Ё" ], |
||
| 167 | 'la' => [], // not in libicu |
||
| 168 | 'lb' => [], |
||
| 169 | 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ], |
||
| 170 | 'ln' => [ 'Ɛ' ], |
||
| 171 | 'lo' => [], |
||
| 172 | 'lt' => [ "Č", "Š", "Ž" ], |
||
| 173 | 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], |
||
| 174 | 'mk' => [ "Ѓ", "Ќ" ], |
||
| 175 | 'ml' => [], |
||
| 176 | 'mn' => [], |
||
| 177 | 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // not in libicu |
||
| 178 | 'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ], |
||
| 179 | 'ms' => [], |
||
| 180 | 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ], |
||
| 181 | 'nb' => [ "Æ", "Ø", "Å" ], |
||
| 182 | 'ne' => [], |
||
| 183 | 'nl' => [], |
||
| 184 | 'nn' => [ "Æ", "Ø", "Å" ], |
||
| 185 | 'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead. |
||
| 186 | 'oc' => [], // not in libicu |
||
| 187 | 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ], |
||
| 188 | 'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ], |
||
| 189 | 'pa' => [ "\xe0\xa9\x8d" ], |
||
| 190 | 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], |
||
| 191 | 'pt' => [], |
||
| 192 | 'rm' => [], // not in libicu |
||
| 193 | 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], |
||
| 194 | 'ru' => [], |
||
| 195 | 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu |
||
| 196 | 'sco' => [], |
||
| 197 | 'se' => [ |
||
| 198 | 'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ', |
||
| 199 | 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö' |
||
| 200 | ], |
||
| 201 | 'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ], |
||
| 202 | 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], |
||
| 203 | 'sl' => [ "Č", "Š", "Ž" ], |
||
| 204 | 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ], |
||
| 205 | 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ], |
||
| 206 | 'sr' => [], |
||
| 207 | 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 208 | 'sv' => [ "Å", "Ä", "Ö" ], |
||
| 209 | 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], |
||
| 210 | 'sw' => [], |
||
| 211 | 'ta' => [ |
||
| 212 | "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", |
||
| 213 | "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", |
||
| 214 | "ஸ்", "ஹ்", "க்ஷ்" |
||
| 215 | ], |
||
| 216 | 'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ], |
||
| 217 | 'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ], |
||
| 218 | 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ], |
||
| 219 | 'tl' => [ "Ñ", "Ng" ], // not in libicu |
||
| 220 | 'to' => [ "Ng", "ʻ" ], |
||
| 221 | 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
| 222 | 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu |
||
| 223 | 'uk' => [ "Ґ", "Ь" ], |
||
| 224 | 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu |
||
| 225 | 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], |
||
| 226 | 'vo' => [ "Ä", "Ö", "Ü" ], |
||
| 227 | 'yi' => [ |
||
| 228 | "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc", |
||
| 229 | "\xd7\xa9\xd7\x82", "\xd7\xaa\xd6\xbc" |
||
| 230 | ], |
||
| 231 | 'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ], |
||
| 232 | 'zu' => [], |
||
| 233 | ]; |
||
| 234 | |||
| 235 | /** |
||
| 236 | * @since 1.16.3 |
||
| 237 | */ |
||
| 238 | const RECORD_LENGTH = 14; |
||
| 239 | |||
/**
 * Set up the main and primary-strength ICU collators for a locale.
 *
 * @param string $locale ICU locale name. A trailing "-u-kn" switches on
 *   numeric collation and is stripped from the stored locale.
 * @throws MWException If the intl extension is not loaded, or if no
 *   collator could be created for $locale
 */
public function __construct( $locale ) {
	if ( !extension_loaded( 'intl' ) ) {
		throw new MWException( 'An ICU collation was requested, ' .
			'but the intl extension is not available.' );
	}

	$this->locale = $locale;

	// Only the part of the locale name before '@' is used as a language code;
	// the special 'root' locale borrows the digit transforms of 'en'.
	$nameParts = explode( '@', $locale );
	$languageCode = ( $locale === 'root' ) ? 'en' : $nameParts[0];
	$this->digitTransformLanguage = Language::factory( $languageCode );

	$this->mainCollator = Collator::create( $locale );
	if ( !$this->mainCollator ) {
		throw new MWException( "Invalid ICU locale specified for collation: $locale" );
	}

	$this->primaryCollator = Collator::create( $locale );
	$this->primaryCollator->setStrength( Collator::PRIMARY );

	// Nothing more to do unless the special numeric-collation suffix is present.
	$numericSuffix = '-u-kn';
	$suffixLength = strlen( $numericSuffix );
	if ( substr( $locale, -$suffixLength, $suffixLength ) !== $numericSuffix ) {
		return;
	}

	$this->useNumericCollation = true;
	// Strip off the special suffix so it doesn't trip up fetchFirstLetterData().
	$this->locale = substr( $this->locale, 0, -$suffixLength );
	foreach ( [ $this->mainCollator, $this->primaryCollator ] as $collator ) {
		$collator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
	}
}
||
| 268 | |||
/**
 * Compute the sort key of a string using the main collator.
 *
 * @param string $string
 * @return string Sort key as produced by Collator::getSortKey()
 */
public function getSortKey( $string ) {
	$collator = $this->mainCollator;

	return $collator->getSortKey( $string );
}
||
| 272 | |||
/**
 * Compute the sort key of a string using the primary-strength collator.
 *
 * @param string $string
 * @return string Sort key as produced by Collator::getSortKey()
 */
public function getPrimarySortKey( $string ) {
	$collator = $this->primaryCollator;

	return $collator->getSortKey( $string );
}
||
| 276 | |||
/**
 * Determine the heading ("first letter") under which a string is listed.
 *
 * @param string $string Text to classify; cast to string before use
 * @return string The heading. Empty for empty input and for input that
 *   sorts before the first known letter.
 */
public function getFirstLetter( $string ) {
	$string = strval( $string );
	if ( $string === '' ) {
		return '';
	}

	$initial = mb_substr( $string, 0, 1, 'UTF-8' );

	// A CJK character is its own heading. The lead-byte test skips the
	// code point decoding for plain ASCII input.
	$isMultiByte = ord( $initial ) > 0x7f;
	if ( $isMultiByte && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $initial ) ) ) {
		return $initial;
	}

	$primaryKey = $this->getPrimarySortKey( $string );

	// Binary search for the letter whose sort key the string falls under.
	$letterIndex = ArrayUtils::findLowerBound(
		[ $this, 'getSortKeyByLetterIndex' ],
		$this->getFirstLetterCount(),
		'strcmp',
		$primaryKey
	);
	if ( $letterIndex === false ) {
		// Sorts before the first letter
		return '';
	}

	$heading = $this->getLetterByIndex( $letterIndex );
	if ( !$this->useNumericCollation ) {
		return $heading;
	}

	// With numeric collation, a digit heading becomes '0–9' (or its localized
	// equivalent). ASCII '0' is 48 and ASCII '9' is 57. This also covers
	// non-Arabic numerals, since they are mapped to Arabic numeral sort
	// letters. For example, ২ sorts as 2.
	$leadByte = ord( $heading );
	if ( $leadByte < 48 || $leadByte > 57 ) {
		return $heading;
	}

	return wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
}
||
| 317 | |||
/**
 * Get the first-letter data, loading it on first use.
 *
 * The data is memoized on this object and additionally stored in the
 * local-server cache for a week. The cache key is built from the locale,
 * the digit transform language code, the ICU version and
 * FIRST_LETTER_VERSION.
 *
 * @since 1.16.3
 * @return array
 */
public function getFirstLetterData() {
	if ( $this->firstLetterData !== null ) {
		return $this->firstLetterData;
	}

	$cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
	$cacheKey = $cache->makeKey(
		'first-letters',
		$this->locale,
		$this->digitTransformLanguage->getCode(),
		self::getICUVersion(),
		self::FIRST_LETTER_VERSION
	);

	// Only invoked by the cache when it has no value for the key.
	$regenerate = function () {
		return $this->fetchFirstLetterData();
	};
	$this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, $regenerate );

	return $this->firstLetterData;
}
||
| 338 | |||
/**
 * Build the list of heading letters for this locale together with their
 * primary sort keys, ordered by sort key.
 *
 * @return array Two parallel lists: 'chars' holds the letters and 'keys'
 *   holds the primary sort key of each letter, at the same index.
 * @throws MWException If the precompiled first-letter data cannot be loaded
 */
private function fetchFirstLetterData() {
	// Generate data from serialized data file
	if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
		$letters = wfGetPrecompiledData( 'first-letters-root.ser' );
		// Fail loudly, as the locale-specific branch below does, instead of
		// passing a non-array to array_merge().
		if ( $letters === false ) {
			throw new MWException( "Unable to load precompiled first-letter data " .
				"\"first-letters-root.ser\" needed for ICU locale \"{$this->locale}\"" );
		}
		// Append additional characters
		$letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
		// Remove unnecessary ones, if any
		if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
			$letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
		}
		// Apply digit transforms: replace the ASCII digits with their
		// representation in the digit transform language.
		$digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
		$letters = array_diff( $letters, $digits );
		foreach ( $digits as $digit ) {
			$letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
		}
	} else {
		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
		if ( $letters === false ) {
			throw new MWException( "MediaWiki does not support ICU locale " .
				"\"{$this->locale}\"" );
		}
	}

	/* Sort the letters.
	 *
	 * It's impossible to have the precompiled data file properly sorted,
	 * because the sort order changes depending on ICU version. If the
	 * array is not properly sorted, the binary search will return random
	 * results.
	 *
	 * We also take this opportunity to remove primary collisions.
	 */
	$letterMap = [];
	foreach ( $letters as $letter ) {
		$key = $this->getPrimarySortKey( $letter );
		if ( isset( $letterMap[$key] ) ) {
			// Primary collision
			// Keep whichever one sorts first in the main collator
			if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
				$letterMap[$key] = $letter;
			}
		} else {
			$letterMap[$key] = $letter;
		}
	}
	ksort( $letterMap, SORT_STRING );

	/* Remove duplicate prefixes. Basically if something has a sortkey
	 * which is a prefix of some other sortkey, then it is an
	 * expansion and probably should not be considered a section
	 * header.
	 *
	 * For example 'þ' is sometimes sorted as if it is the letters
	 * 'th'. Other times it is its own primary element. Another
	 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
	 * is an 'R' followed by an 's'.
	 *
	 * Additionally an expanded element should always sort directly
	 * after its first element due to the way sortkeys work.
	 *
	 * UCA sortkey elements are of variable length but no collation
	 * element should be a prefix of some other element, so I think
	 * this is safe. See:
	 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
	 * - http://site.icu-project.org/design/collation/uca-weight-allocation
	 *
	 * Additionally, there is something called primary compression to
	 * worry about. Basically, if you have two primary elements that
	 * are more than one byte and both start with the same byte then
	 * the first byte is dropped on the second primary. Additionally
	 * either \x03 or \xFF may be added to mean that the next primary
	 * does not start with the first byte of the first primary.
	 *
	 * This shouldn't matter much, as the first primary is not
	 * changed, and that is what we are comparing against.
	 *
	 * tl;dr: This makes some assumptions about how icu implements
	 * collations. It seems incredibly unlikely these assumptions
	 * will change, but nonetheless they are assumptions.
	 */

	$prev = false;
	$duplicatePrefixes = [];
	foreach ( $letterMap as $key => $value ) {
		// Remove terminator byte. Otherwise the prefix
		// comparison will get hung up on that.
		$trimmedKey = rtrim( $key, "\0" );
		if ( $prev === false || $prev === '' ) {
			$prev = $trimmedKey;
			// We don't yet have a collation element
			// to compare against, so continue.
			continue;
		}

		// Due to the fact the array is sorted, we only have
		// to compare with the element directly previous
		// to the current element (skipping expansions).
		// An element "X" will always sort directly
		// before "XZ" (Unless we have "XY", but we
		// do not update $prev in that case).
		if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
			$duplicatePrefixes[] = $key;
			// If this is an expansion, we don't want to
			// compare the next element to this element,
			// but to what is currently $prev
			continue;
		}
		$prev = $trimmedKey;
	}
	foreach ( $duplicatePrefixes as $badKey ) {
		wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
		unset( $letterMap[$badKey] );
		// This code assumes that unsetting does not change sort order.
	}
	$data = [
		'chars' => array_values( $letterMap ),
		'keys' => array_keys( $letterMap ),
	];

	// Reduce memory usage before caching
	unset( $letterMap );

	return $data;
}
||
| 468 | |||
/**
 * Look up a heading letter by its position in the first-letter data.
 *
 * @since 1.16.3
 * @param int $index Position in the 'chars' list
 * @return string
 */
public function getLetterByIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['chars'][$index];
}
||
| 475 | |||
/**
 * Look up the sort key of a heading letter by its position in the
 * first-letter data.
 *
 * @since 1.16.3
 * @param int $index Position in the 'keys' list
 * @return string
 */
public function getSortKeyByLetterIndex( $index ) {
	$data = $this->getFirstLetterData();

	return $data['keys'][$index];
}
||
| 482 | |||
/**
 * Count the heading letters in the first-letter data.
 *
 * @since 1.16.3
 * @return int Number of entries in the 'chars' list
 */
public function getFirstLetterCount() {
	$data = $this->getFirstLetterData();

	return count( $data['chars'] );
}
||
| 489 | |||
/**
 * Test if a code point is a CJK (Chinese, Japanese, Korean) character,
 * i.e. whether it lies inside one of the ranges listed in self::$cjkBlocks.
 *
 * @since 1.16.3
 * @param int $codepoint
 * @return bool
 */
public static function isCjk( $codepoint ) {
	foreach ( self::$cjkBlocks as $block ) {
		list( $blockStart, $blockEnd ) = $block;
		if ( $codepoint < $blockStart || $codepoint > $blockEnd ) {
			continue;
		}
		return true;
	}

	return false;
}
||
| 502 | |||
/**
 * Return the version of ICU library used by PHP's intl extension,
 * or false when the extension is not installed or the version
 * can't be determined.
 *
 * The value comes from the constant INTL_ICU_VERSION, which isn't really
 * documented. It is available since PHP 5.3.7 (see PHP bug 54561).
 * This function will return false whenever the constant is not defined,
 * which includes older PHPs.
 *
 * @since 1.21
 * @return string|bool
 */
public static function getICUVersion() {
	if ( defined( 'INTL_ICU_VERSION' ) ) {
		return INTL_ICU_VERSION;
	}

	return false;
}
||
| 518 | |||
/**
 * Return the version of Unicode appropriate for the version of ICU library
 * currently in use, or false when it can't be determined.
 *
 * The lookup uses the first three characters of the ICU version string,
 * so an ICU release missing from the table below yields false.
 *
 * @since 1.21
 * @return string|bool
 */
static function getUnicodeVersionForICU() {
	$icuVersion = self::getICUVersion();
	if ( !$icuVersion ) {
		return false;
	}

	// Source: http://site.icu-project.org/download
	$unicodeVersions = [
		'57.' => '8.0',
		'56.' => '8.0',
		'55.' => '7.0',
		'54.' => '7.0',
		'53.' => '6.3',
		'52.' => '6.3',
		'51.' => '6.2',
		'50.' => '6.2',
		'49.' => '6.1',
		'4.8' => '6.0',
		'4.6' => '6.0',
		'4.4' => '5.2',
		'4.2' => '5.1',
		'4.0' => '5.1',
		'3.8' => '5.0',
		'3.6' => '5.0',
		'3.4' => '4.1',
	];

	$prefix = substr( $icuVersion, 0, 3 );

	return isset( $unicodeVersions[$prefix] ) ? $unicodeVersions[$prefix] : false;
}
||
| 560 | } |
||
| 561 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.