Complex classes like IcuCollation often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use IcuCollation, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 24 | class IcuCollation extends Collation { |
||
| 25 | const FIRST_LETTER_VERSION = 2; |
||
| 26 | |||
| 27 | /** @var Collator */ |
||
| 28 | private $primaryCollator; |
||
| 29 | |||
| 30 | /** @var Collator */ |
||
| 31 | private $mainCollator; |
||
| 32 | |||
| 33 | /** @var string */ |
||
| 34 | private $locale; |
||
| 35 | |||
| 36 | /** @var Language */ |
||
| 37 | protected $digitTransformLanguage; |
||
| 38 | |||
| 39 | /** @var boolean */ |
||
| 40 | private $useNumericCollation = false; |
||
| 41 | |||
| 42 | /** @var array */ |
||
| 43 | private $firstLetterData; |
||
| 44 | |||
| 45 | /** |
||
| 46 | * Unified CJK blocks. |
||
| 47 | * |
||
| 48 | * The same definition of a CJK block must be used for both Collation and |
||
| 49 | * generateCollationData.php. These blocks are omitted from the first |
||
| 50 | * letter data, as an optimisation measure and because the default UCA table |
||
| 51 | * is pretty useless for sorting Chinese text anyway. Japanese and Korean |
||
| 52 | * blocks are not included here, because they are smaller and more useful. |
||
| 53 | */ |
||
| 54 | private static $cjkBlocks = [ |
||
| 55 | [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement |
||
| 56 | [ 0x2F00, 0x2FDF ], // Kangxi Radicals |
||
| 57 | [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters |
||
| 58 | [ 0x3000, 0x303F ], // CJK Symbols and Punctuation |
||
| 59 | [ 0x31C0, 0x31EF ], // CJK Strokes |
||
| 60 | [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months |
||
| 61 | [ 0x3300, 0x33FF ], // CJK Compatibility |
||
| 62 | [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A |
||
| 63 | [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs |
||
| 64 | [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs |
||
| 65 | [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms |
||
| 66 | [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B |
||
| 67 | [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C |
||
| 68 | [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D |
||
| 69 | [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement |
||
| 70 | ]; |
||
| 71 | |||
| 72 | /** |
||
| 73 | * Additional characters (or character groups) to be considered separate |
||
| 74 | * letters for given languages, or to be removed from the list of such |
||
| 75 | * letters (denoted by keys starting with '-'). |
||
| 76 | * |
||
| 77 | * These are additions to (or subtractions from) the data stored in the |
||
| 78 | * first-letters-root.ser file (which among others includes full basic latin, |
||
| 79 | * cyrillic and greek alphabets). |
||
| 80 | * |
||
| 81 | * "Separate letter" is a letter that would have a separate heading/section |
||
| 82 | * for it in a dictionary or a phone book in this language. This data isn't |
||
| 83 | * used for sorting (the ICU library handles that), only for deciding which |
||
| 84 | * characters (or character groups) to use as headings. |
||
| 85 | * |
||
| 86 | * Initially generated based on the primary level of Unicode collation |
||
| 87 | * tailorings available at http://developer.mimer.com/charts/tailorings.htm , |
||
| 88 | * later modified. |
||
| 89 | * |
||
| 90 | * Empty arrays are intended; this signifies that the data for the language is |
||
| 91 | * available and that there are, in fact, no additional letters to consider. |
||
| 92 | */ |
||
| 93 | private static $tailoringFirstLetters = [ |
||
| 94 | 'af' => [], |
||
| 95 | 'am' => [], |
||
| 96 | 'ar' => [], |
||
| 97 | 'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ], |
||
| 98 | 'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu |
||
| 99 | 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
| 100 | 'be' => [ "Ё" ], |
||
| 101 | 'be-tarask' => [ "Ё" ], |
||
| 102 | 'bg' => [], |
||
| 103 | 'bo' => [], |
||
| 104 | 'br' => [ "Ch", "C'h" ], |
||
| 105 | 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 106 | 'bs-Cyrl' => [], |
||
| 107 | 'ca' => [], |
||
| 108 | 'chr' => [], |
||
| 109 | 'co' => [], // not in libicu |
||
| 110 | 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], |
||
| 111 | 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], |
||
| 112 | 'da' => [ "Æ", "Ø", "Å" ], |
||
| 113 | 'de' => [], |
||
| 114 | 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ], |
||
| 115 | 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ], |
||
| 116 | 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ], |
||
| 117 | 'el' => [], |
||
| 118 | 'en' => [], |
||
| 119 | 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ], |
||
| 120 | 'es' => [ "Ñ" ], |
||
| 121 | 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et) |
||
| 122 | 'eu' => [ "Ñ" ], // not in libicu |
||
| 123 | 'fa' => [ |
||
| 124 | // RTL, let's put each letter on a new line |
||
| 125 | "آ", |
||
| 126 | "ء", |
||
| 127 | "ه", |
||
| 128 | "ا", |
||
| 129 | "و" |
||
| 130 | ], |
||
| 131 | 'fi' => [ "Å", "Ä", "Ö" ], |
||
| 132 | 'fil' => [ "Ñ", "Ng" ], |
||
| 133 | 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ], |
||
| 134 | 'fr' => [], |
||
| 135 | 'fr-CA' => [], // fr-CA sorts accents slightly different from fr. |
||
| 136 | 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu |
||
| 137 | 'fy' => [], // not in libicu |
||
| 138 | 'ga' => [], |
||
| 139 | 'gd' => [], // not in libicu |
||
| 140 | 'gl' => [ "Ch", "Ll", "Ñ" ], |
||
| 141 | 'gu' => [ "\xe0\xaa\x82", "\xe0\xaa\x83", "\xe0\xaa\x81", "\xe0\xaa\xb3" ], |
||
| 142 | 'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ], |
||
| 143 | 'haw' => [ 'ʻ' ], |
||
| 144 | 'he' => [], |
||
| 145 | 'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ], |
||
| 146 | 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 147 | 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], |
||
| 148 | 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], |
||
| 149 | 'hy' => [ "և" ], |
||
| 150 | 'id' => [], |
||
| 151 | 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ], |
||
| 152 | 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], |
||
| 153 | 'it' => [], |
||
| 154 | 'ka' => [], |
||
| 155 | 'kk' => [ "Ү", "І" ], |
||
| 156 | 'kl' => [ "Æ", "Ø", "Å" ], |
||
| 157 | 'km' => [ |
||
| 158 | "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86", |
||
| 159 | "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87", |
||
| 160 | "\xe1\x9e\xb7\xe1\x9f\x87", "\xe1\x9e\xbb\xe1\x9f\x87", |
||
| 161 | "\xe1\x9f\x81\xe1\x9f\x87", "\xe1\x9f\x84\xe1\x9f\x87", |
||
| 162 | ], |
||
| 163 | 'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ], |
||
| 164 | 'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ], |
||
| 165 | 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu |
||
| 166 | 'ky' => [ "Ё" ], |
||
| 167 | 'la' => [], // not in libicu |
||
| 168 | 'lb' => [], |
||
| 169 | 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ], |
||
| 170 | 'ln' => [ 'Ɛ' ], |
||
| 171 | 'lo' => [], |
||
| 172 | 'lt' => [ "Č", "Š", "Ž" ], |
||
| 173 | 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], |
||
| 174 | 'mk' => [ "Ѓ", "Ќ" ], |
||
| 175 | 'ml' => [], |
||
| 176 | 'mn' => [], |
||
| 177 | 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // not in libicu |
||
| 178 | 'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ], |
||
| 179 | 'ms' => [], |
||
| 180 | 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ], |
||
| 181 | 'nb' => [ "Æ", "Ø", "Å" ], |
||
| 182 | 'ne' => [], |
||
| 183 | 'nl' => [], |
||
| 184 | 'nn' => [ "Æ", "Ø", "Å" ], |
||
| 185 | 'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead. |
||
| 186 | 'oc' => [], // not in libicu |
||
| 187 | 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ], |
||
| 188 | 'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ], |
||
| 189 | 'pa' => [ "\xe0\xa9\x8d" ], |
||
| 190 | 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], |
||
| 191 | 'pt' => [], |
||
| 192 | 'rm' => [], // not in libicu |
||
| 193 | 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], |
||
| 194 | 'ru' => [], |
||
| 195 | 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu |
||
| 196 | 'sco' => [], |
||
| 197 | 'se' => [ |
||
| 198 | 'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ', |
||
| 199 | 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö' |
||
| 200 | ], |
||
| 201 | 'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ], |
||
| 202 | 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], |
||
| 203 | 'sl' => [ "Č", "Š", "Ž" ], |
||
| 204 | 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ], |
||
| 205 | 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ], |
||
| 206 | 'sr' => [], |
||
| 207 | 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], |
||
| 208 | 'sv' => [ "Å", "Ä", "Ö" ], |
||
| 209 | 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], |
||
| 210 | 'sw' => [], |
||
| 211 | 'ta' => [ |
||
| 212 | "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", |
||
| 213 | "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", |
||
| 214 | "ஸ்", "ஹ்", "க்ஷ்" |
||
| 215 | ], |
||
| 216 | 'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ], |
||
| 217 | 'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ], |
||
| 218 | 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ], |
||
| 219 | 'tl' => [ "Ñ", "Ng" ], // not in libicu |
||
| 220 | 'to' => [ "Ng", "ʻ" ], |
||
| 221 | 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ], |
||
| 222 | 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu |
||
| 223 | 'uk' => [ "Ґ", "Ь" ], |
||
| 224 | 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu |
||
| 225 | 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], |
||
| 226 | 'vo' => [ "Ä", "Ö", "Ü" ], |
||
| 227 | 'yi' => [ |
||
| 228 | "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc", |
||
| 229 | "\xd7\xa9\xd7\x82", "\xd7\xaa\xd6\xbc" |
||
| 230 | ], |
||
| 231 | 'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ], |
||
| 232 | 'zu' => [], |
||
| 233 | ]; |
||
| 234 | |||
| 235 | /** |
||
| 236 | * @since 1.16.3 |
||
| 237 | */ |
||
| 238 | const RECORD_LENGTH = 14; |
||
| 239 | |||
| 240 | public function __construct( $locale ) { |
||
| 268 | |||
| 269 | public function getSortKey( $string ) { |
||
| 272 | |||
| 273 | public function getPrimarySortKey( $string ) { |
||
| 276 | |||
| 277 | public function getFirstLetter( $string ) { |
||
| 317 | |||
| 318 | /** |
||
| 319 | * @since 1.16.3 |
||
| 320 | * @return array |
||
| 321 | */ |
||
| 322 | public function getFirstLetterData() { |
||
| 338 | |||
| 339 | /** |
||
| 340 | * @return array |
||
| 341 | * @throws MWException |
||
| 342 | */ |
||
| 343 | private function fetchFirstLetterData() { |
||
| 468 | |||
| 469 | /** |
||
| 470 | * @since 1.16.3 |
||
| 471 | */ |
||
| 472 | public function getLetterByIndex( $index ) { |
||
| 475 | |||
| 476 | /** |
||
| 477 | * @since 1.16.3 |
||
| 478 | */ |
||
| 479 | public function getSortKeyByLetterIndex( $index ) { |
||
| 482 | |||
| 483 | /** |
||
| 484 | * @since 1.16.3 |
||
| 485 | */ |
||
| 486 | public function getFirstLetterCount() { |
||
| 489 | |||
| 490 | /** |
||
| 491 | * Test if a code point is a CJK (Chinese, Japanese, Korean) character |
||
| 492 | * @since 1.16.3 |
||
| 493 | */ |
||
| 494 | public static function isCjk( $codepoint ) { |
||
| 502 | |||
| 503 | /** |
||
| 504 | * Return the version of ICU library used by PHP's intl extension, |
||
| 505 | * or false when the extension is not installed or the version |
||
| 506 | * can't be determined. |
||
| 507 | * |
||
| 508 | * The constant INTL_ICU_VERSION this function refers to isn't really |
||
| 509 | * documented. It is available since PHP 5.3.7 (see PHP bug 54561). |
||
| 510 | * This function will return false on older PHPs. |
||
| 511 | * |
||
| 512 | * @since 1.21 |
||
| 513 | * @return string|bool |
||
| 514 | */ |
||
| 515 | static function getICUVersion() { |
||
| 518 | |||
| 519 | /** |
||
| 520 | * Return the version of Unicode appropriate for the version of ICU library |
||
| 521 | * currently in use, or false when it can't be determined. |
||
| 522 | * |
||
| 523 | * @since 1.21 |
||
| 524 | * @return string|bool |
||
| 525 | */ |
||
| 526 | static function getUnicodeVersionForICU() { |
||
| 560 | } |
||
| 561 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.