Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | <?php |
||
| 18 | class CharsetConverter |
||
| 19 | { |
||
| 20 | /** |
||
| 21 | * @var array aliased charsets supported by mb_convert_encoding. |
||
| 22 | * The alias is stripped of any non-alphanumeric characters (so CP367 |
||
| 23 | * is equal to CP-367) when comparing. |
||
| 24 | * Some of these translations are already supported by |
||
| 25 | * mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in |
||
| 26 | * other implementations or versions since they're not part of |
||
| 27 | * documented support. |
||
| 28 | */ |
||
| 29 | public static $mbAliases = [ |
||
| 30 | // supported but not included in mb_list_encodings for some reason... |
||
| 31 | 'CP850' => 'CP850', |
||
| 32 | 'GB2312' => 'GB2312', |
||
| 33 | // aliases |
||
| 34 | '646' => 'ASCII', |
||
| 35 | 'ANSIX341968' => 'ASCII', |
||
| 36 | 'ANSIX341986' => 'ASCII', |
||
| 37 | 'CP367' => 'ASCII', |
||
| 38 | 'CSASCII' => 'ASCII', |
||
| 39 | 'IBM367' => 'ASCII', |
||
| 40 | 'ISO646US' => 'ASCII', |
||
| 41 | 'ISO646IRV1991' => 'ASCII', |
||
| 42 | 'ISOIR6' => 'ASCII', |
||
| 43 | 'US' => 'ASCII', |
||
| 44 | 'USASCII' => 'ASCII', |
||
| 45 | 'BIG5' => 'BIG-5', |
||
| 46 | 'BIG5TW' => 'BIG-5', |
||
| 47 | 'CSBIG5' => 'BIG-5', |
||
| 48 | '1251' => 'WINDOWS-1251', |
||
| 49 | 'CP1251' => 'WINDOWS-1251', |
||
| 50 | 'WINDOWS1251' => 'WINDOWS-1251', |
||
| 51 | '1252' => 'WINDOWS-1252', |
||
| 52 | 'CP1252' => 'WINDOWS-1252', |
||
| 53 | 'WINDOWS1252' => 'WINDOWS-1252', |
||
| 54 | '1254' => 'WINDOWS-1254', |
||
| 55 | 'CP1254' => 'WINDOWS-1254', |
||
| 56 | 'WINDOWS1254' => 'WINDOWS-1254', |
||
| 57 | '1255' => 'ISO-8859-8', |
||
| 58 | 'CP1255' => 'ISO-8859-8', |
||
| 59 | 'ISO88598I' => 'ISO-8859-8', |
||
| 60 | 'WINDOWS1255' => 'ISO-8859-8', |
||
| 61 | '850' => 'CP850', |
||
| 62 | 'CSPC850MULTILINGUAL' => 'CP850', |
||
| 63 | 'IBM850' => 'CP850', |
||
| 64 | '866' => 'CP866', |
||
| 65 | 'CSIBM866' => 'CP866', |
||
| 66 | 'IBM866' => 'CP866', |
||
| 67 | '932' => 'CP932', |
||
| 68 | 'MS932' => 'CP932', |
||
| 69 | 'MSKANJI' => 'CP932', |
||
| 70 | '950' => 'CP950', |
||
| 71 | 'MS950' => 'CP950', |
||
| 72 | 'EUCJP' => 'EUC-JP', |
||
| 73 | 'UJIS' => 'EUC-JP', |
||
| 74 | 'EUCKR' => 'EUC-KR', |
||
| 75 | 'KOREAN' => 'EUC-KR', |
||
| 76 | 'KSC5601' => 'EUC-KR', |
||
| 77 | 'KSC56011987' => 'EUC-KR', |
||
| 78 | 'KSX1001' => 'EUC-KR', |
||
| 79 | 'GB180302000' => 'GB18030', |
||
| 80 | // GB2312 not listed but supported |
||
| 81 | 'CHINESE' => 'GB2312', |
||
| 82 | 'CSISO58GB231280' => 'GB2312', |
||
| 83 | 'EUCCN' => 'GB2312', |
||
| 84 | 'EUCGB2312CN' => 'GB2312', |
||
| 85 | 'GB23121980' => 'GB2312', |
||
| 86 | 'GB231280' => 'GB2312', |
||
| 87 | 'ISOIR58' => 'GB2312', |
||
| 88 | 'GBK' => 'CP936', |
||
| 89 | '936' => 'CP936', |
||
| 90 | 'ms936' => 'CP936', |
||
| 91 | 'HZGB' => 'HZ', |
||
| 92 | 'HZGB2312' => 'HZ', |
||
| 93 | 'CSISO2022JP' => 'ISO-2022-JP', |
||
| 94 | 'ISO2022JP' => 'ISO-2022-JP', |
||
| 95 | 'ISO2022JP2004' => 'ISO-2022-JP-2004', |
||
| 96 | 'CSISO2022KR' => 'ISO-2022-KR', |
||
| 97 | 'ISO2022KR' => 'ISO-2022-KR', |
||
| 98 | 'CSISOLATIN6' => 'ISO-8859-10', |
||
| 99 | 'ISO885910' => 'ISO-8859-10', |
||
| 100 | 'ISO8859101992' => 'ISO-8859-10', |
||
| 101 | 'ISOIR157' => 'ISO-8859-10', |
||
| 102 | 'L6' => 'ISO-8859-10', |
||
| 103 | 'LATIN6' => 'ISO-8859-10', |
||
| 104 | 'ISO885913' => 'ISO-8859-13', |
||
| 105 | 'ISO885914' => 'ISO-8859-14', |
||
| 106 | 'ISO8859141998' => 'ISO-8859-14', |
||
| 107 | 'ISOCELTIC' => 'ISO-8859-14', |
||
| 108 | 'ISOIR199' => 'ISO-8859-14', |
||
| 109 | 'L8' => 'ISO-8859-14', |
||
| 110 | 'LATIN8' => 'ISO-8859-14', |
||
| 111 | 'ISO885915' => 'ISO-8859-15', |
||
| 112 | 'ISO885916' => 'ISO-8859-16', |
||
| 113 | 'ISO8859162001' => 'ISO-8859-16', |
||
| 114 | 'ISOIR226' => 'ISO-8859-16', |
||
| 115 | 'L10' => 'ISO-8859-16', |
||
| 116 | 'LATIN10' => 'ISO-8859-16', |
||
| 117 | 'CSISOLATIN2' => 'ISO-8859-2', |
||
| 118 | 'ISO88592' => 'ISO-8859-2', |
||
| 119 | 'ISO885921987' => 'ISO-8859-2', |
||
| 120 | 'ISOIR101' => 'ISO-8859-2', |
||
| 121 | 'L2' => 'ISO-8859-2', |
||
| 122 | 'LATIN2' => 'ISO-8859-2', |
||
| 123 | 'CSISOLATIN3' => 'ISO-8859-3', |
||
| 124 | 'ISO88593' => 'ISO-8859-3', |
||
| 125 | 'ISO885931988' => 'ISO-8859-3', |
||
| 126 | 'ISOIR109' => 'ISO-8859-3', |
||
| 127 | 'L3' => 'ISO-8859-3', |
||
| 128 | 'LATIN3' => 'ISO-8859-3', |
||
| 129 | 'CSISOLATIN4' => 'ISO-8859-4', |
||
| 130 | 'ISO88594' => 'ISO-8859-4', |
||
| 131 | 'ISO885941988' => 'ISO-8859-4', |
||
| 132 | 'ISOIR110' => 'ISO-8859-4', |
||
| 133 | 'L4' => 'ISO-8859-4', |
||
| 134 | 'LATIN4' => 'ISO-8859-4', |
||
| 135 | 'CSISOLATINCYRILLIC' => 'ISO-8859-5', |
||
| 136 | 'CYRILLIC' => 'ISO-8859-5', |
||
| 137 | 'ISO88595' => 'ISO-8859-5', |
||
| 138 | 'ISO885951988' => 'ISO-8859-5', |
||
| 139 | 'ISOIR144' => 'ISO-8859-5', |
||
| 140 | 'ARABIC' => 'ISO-8859-6', |
||
| 141 | 'ASMO708' => 'ISO-8859-6', |
||
| 142 | 'CSISOLATINARABIC' => 'ISO-8859-6', |
||
| 143 | 'ECMA114' => 'ISO-8859-6', |
||
| 144 | 'ISO88596' => 'ISO-8859-6', |
||
| 145 | 'ISO885961987' => 'ISO-8859-6', |
||
| 146 | 'ISOIR127' => 'ISO-8859-6', |
||
| 147 | 'CSISOLATINGREEK' => 'ISO-8859-7', |
||
| 148 | 'ECMA118' => 'ISO-8859-7', |
||
| 149 | 'ELOT928' => 'ISO-8859-7', |
||
| 150 | 'GREEK' => 'ISO-8859-7', |
||
| 151 | 'GREEK8' => 'ISO-8859-7', |
||
| 152 | 'ISO88597' => 'ISO-8859-7', |
||
| 153 | 'ISO885971987' => 'ISO-8859-7', |
||
| 154 | 'ISOIR126' => 'ISO-8859-7', |
||
| 155 | 'CSISOLATINHEBREW' => 'ISO-8859-8', |
||
| 156 | 'HEBREW' => 'ISO-8859-8', |
||
| 157 | 'ISO88598' => 'ISO-8859-8', |
||
| 158 | 'ISO885981988' => 'ISO-8859-8', |
||
| 159 | 'ISOIR138' => 'ISO-8859-8', |
||
| 160 | 'CSISOLATIN5' => 'ISO-8859-9', |
||
| 161 | 'ISO88599' => 'ISO-8859-9', |
||
| 162 | 'ISO885991989' => 'ISO-8859-9', |
||
| 163 | 'ISOIR148' => 'ISO-8859-9', |
||
| 164 | 'L5' => 'ISO-8859-9', |
||
| 165 | 'LATIN5' => 'ISO-8859-9', |
||
| 166 | 'CSKOI8R' => 'KOI8-R', |
||
| 167 | 'KOI8R' => 'KOI8-R', |
||
| 168 | '8859' => 'ISO-8859-1', |
||
| 169 | 'CP819' => 'ISO-8859-1', |
||
| 170 | 'CSISOLATIN1' => 'ISO-8859-1', |
||
| 171 | 'IBM819' => 'ISO-8859-1', |
||
| 172 | 'ISO8859' => 'ISO-8859-1', |
||
| 173 | 'ISO88591' => 'ISO-8859-1', |
||
| 174 | 'ISO885911987' => 'ISO-8859-1', |
||
| 175 | 'ISOIR100' => 'ISO-8859-1', |
||
| 176 | 'L1' => 'ISO-8859-1', |
||
| 177 | 'LATIN' => 'ISO-8859-1', |
||
| 178 | 'LATIN1' => 'ISO-8859-1', |
||
| 179 | 'CSSHIFTJIS' => 'SJIS', |
||
| 180 | 'SHIFTJIS' => 'SJIS', |
||
| 181 | 'SHIFTJIS2004' => 'SJIS-2004', |
||
| 182 | 'SJIS2004' => 'SJIS-2004', |
||
| 183 | ]; |
||
| 184 | |||
| 185 | /** |
||
| 186 | * @var array aliased charsets supported by iconv. |
||
| 187 | */ |
||
| 188 | public static $iconvAliases = [ |
||
| 189 | // iconv aliases -- a lot of these may already be supported |
||
| 190 | 'BIG5HKSCS' => 'BIG5HKSCS', |
||
| 191 | 'HKSCS' => 'BIG5HKSCS', |
||
| 192 | '037' => 'CP037', |
||
| 193 | 'EBCDICCPCA' => 'CP037', |
||
| 194 | 'EBCDICCPNL' => 'CP037', |
||
| 195 | 'EBCDICCPUS' => 'CP037', |
||
| 196 | 'EBCDICCPWT' => 'CP037', |
||
| 197 | 'CSIBM037' => 'CP037', |
||
| 198 | 'IBM037' => 'CP037', |
||
| 199 | 'IBM039' => 'CP037', |
||
| 200 | '1026' => 'CP1026', |
||
| 201 | 'CSIBM1026' => 'CP1026', |
||
| 202 | 'IBM1026' => 'CP1026', |
||
| 203 | '1140' => 'CP1140', |
||
| 204 | 'IBM1140' => 'CP1140', |
||
| 205 | '1250' => 'CP1250', |
||
| 206 | 'WINDOWS1250' => 'CP1250', |
||
| 207 | '1253' => 'CP1253', |
||
| 208 | 'WINDOWS1253' => 'CP1253', |
||
| 209 | '1256' => 'CP1256', |
||
| 210 | 'WINDOWS1256' => 'CP1256', |
||
| 211 | '1257' => 'CP1257', |
||
| 212 | 'WINDOWS1257' => 'CP1257', |
||
| 213 | '1258' => 'CP1258', |
||
| 214 | 'WINDOWS1258' => 'CP1258', |
||
| 215 | '424' => 'CP424', |
||
| 216 | 'CSIBM424' => 'CP424', |
||
| 217 | 'EBCDICCPHE' => 'CP424', |
||
| 218 | 'IBM424' => 'CP424', |
||
| 219 | '437' => 'CP437', |
||
| 220 | 'CSPC8CODEPAGE437' => 'CP437', |
||
| 221 | 'IBM437' => 'CP437', |
||
| 222 | '500' => 'CP500', |
||
| 223 | 'CSIBM500' => 'CP500', |
||
| 224 | 'EBCDICCPBE' => 'CP500', |
||
| 225 | 'EBCDICCPCH' => 'CP500', |
||
| 226 | 'IBM500' => 'CP500', |
||
| 227 | '775' => 'CP775', |
||
| 228 | 'CSPC775BALTIC' => 'CP775', |
||
| 229 | 'IBM775' => 'CP775', |
||
| 230 | '860' => 'CP860', |
||
| 231 | 'CSIBM860' => 'CP860', |
||
| 232 | 'IBM860' => 'CP860', |
||
| 233 | '861' => 'CP861', |
||
| 234 | 'CPIS' => 'CP861', |
||
| 235 | 'CSIBM861' => 'CP861', |
||
| 236 | 'IBM861' => 'CP861', |
||
| 237 | '862' => 'CP862', |
||
| 238 | 'CSPC862LATINHEBREW' => 'CP862', |
||
| 239 | 'IBM862' => 'CP862', |
||
| 240 | '863' => 'CP863', |
||
| 241 | 'CSIBM863' => 'CP863', |
||
| 242 | 'IBM863' => 'CP863', |
||
| 243 | '864' => 'CP864', |
||
| 244 | 'CSIBM864' => 'CP864', |
||
| 245 | 'IBM864' => 'CP864', |
||
| 246 | '865' => 'CP865', |
||
| 247 | 'CSIBM865' => 'CP865', |
||
| 248 | 'IBM865' => 'CP865', |
||
| 249 | '869' => 'CP869', |
||
| 250 | 'CPGR' => 'CP869', |
||
| 251 | 'CSIBM869' => 'CP869', |
||
| 252 | 'IBM869' => 'CP869', |
||
| 253 | '949' => 'CP949', |
||
| 254 | 'MS949' => 'CP949', |
||
| 255 | 'UHC' => 'CP949', |
||
| 256 | 'ROMAN8' => 'ROMAN8', |
||
| 257 | 'HPROMAN8' => 'ROMAN8', |
||
| 258 | 'R8' => 'ROMAN8', |
||
| 259 | 'CSHPROMAN8' => 'ROMAN8', |
||
| 260 | 'ISO2022JP2' => 'ISO2022JP2', |
||
| 261 | 'THAI' => 'ISO885911', |
||
| 262 | 'ISO885911' => 'ISO885911', |
||
| 263 | 'ISO8859112001' => 'ISO885911', |
||
| 264 | 'JOHAB' => 'CP1361', |
||
| 265 | 'MS1361' => 'CP1361', |
||
| 266 | 'MACCYRILLIC' => 'MACCYRILLIC', |
||
| 267 | 'CSPTCP154' => 'PT154', |
||
| 268 | 'PTCP154' => 'PT154', |
||
| 269 | 'CP154' => 'PT154', |
||
| 270 | 'CYRILLICASIAN' => 'PT154', |
||
| 271 | 'TIS620' => 'TIS620', |
||
| 272 | 'TIS6200' => 'TIS620', |
||
| 273 | 'TIS62025290' => 'TIS620', |
||
| 274 | 'TIS62025291' => 'TIS620', |
||
| 275 | 'ISOIR166' => 'TIS620', |
||
| 276 | ]; |
||
| 277 | |||
| 278 | /** |
||
| 279 | * @var string charset to convert from |
||
| 280 | */ |
||
| 281 | protected $fromCharset; |
||
| 282 | |||
| 283 | /** |
||
| 284 | * @var string charset to convert to |
||
| 285 | */ |
||
| 286 | protected $toCharset; |
||
| 287 | |||
| 288 | protected $fromCharsetMbSupported = true; |
||
| 289 | protected $toCharsetMbSupported = true; |
||
| 290 | |||
| 291 | /** |
||
| 292 | * Constructs the charset converter with source/destination charsets. |
||
| 293 | * |
||
| 294 | * @param string $fromCharset |
||
| 295 | * @param string $toCharset |
||
| 296 | */ |
||
| 297 | 3 | public function __construct($fromCharset, $toCharset) |
|
| 302 | |||
| 303 | /** |
||
| 304 | * Converts the passed string's charset from $this->fromCharset to |
||
| 305 | * $this->toCharset. |
||
| 306 | * |
||
| 307 | * The function attempts to use mb_convert_encoding if possible, and falls |
||
| 308 | * back to iconv if not. If the source or destination character sets aren't |
||
| 309 | * supported, a blank string is returned. |
||
| 310 | * |
||
| 311 | * @param string $str |
||
| 312 | * @return string |
||
| 313 | */ |
||
| 314 | 3 | public function convert($str) |
|
| 330 | |||
| 331 | /** |
||
| 332 | * Looks up the passed $cs in mb_list_encodings, then strips non |
||
| 333 | * alpha-numeric characters and tries again, then failing that calls |
||
| 334 | * findAliasedCharset. The method returns the charset name that should be |
||
| 335 | * used in calls to mb_convert_encoding or iconv. |
||
| 336 | * |
||
| 337 | * If the charset is part of mb_list_encodings, $mbSupported is set to true. |
||
| 338 | * |
||
| 339 | * @param string $cs |
||
| 340 | * @param boolean $mbSupported |
||
| 341 | * @return string the final charset name to use |
||
| 342 | */ |
||
| 343 | 3 | private function findSupportedCharset($cs, &$mbSupported) |
|
| 357 | |||
| 358 | /** |
||
| 359 | * Looks up the passed $comp and $stripped strings in self::$mbAliases, and |
||
| 360 | * returns the mapped charset if applicable. Otherwise calls |
||
| 361 | * $this->findAliasedIconvCharset. |
||
| 362 | * |
||
| 363 | * $mbSupported is set to false if the charset is not located in |
||
| 364 | * self::$mbAliases. |
||
| 365 | * |
||
| 366 | * @param string $comp |
||
| 367 | * @param string $stripped |
||
| 368 | * @param boolean $mbSupported |
||
| 369 | * @return string the mapped charset |
||
| 370 | */ |
||
| 371 | 3 | private function findAliasedCharset($comp, $stripped, &$mbSupported) |
|
| 381 | |||
| 382 | /** |
||
| 383 | * Looks up the passed $comp and $stripped strings in self::$iconvAliases, |
||
| 384 | * and returns the mapped charset if applicable. Otherwise returns $comp. |
||
| 385 | * |
||
| 386 | * @param string $comp |
||
| 387 | * @param string $stripped |
||
| 388 | * @return string the mapped charset (if mapped) or $comp otherwise |
||
| 389 | */ |
||
| 390 | 1 | private function findAliasedIconvCharset($comp, $stripped) |
|
| 399 | } |
||
| 400 |
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.