zbateson /
mb-wrapper
| 1 | <?php |
||||
| 2 | /** |
||||
| 3 | * This file is part of the ZBateson\MbWrapper project. |
||||
| 4 | * |
||||
| 5 | * @license http://opensource.org/licenses/bsd-license.php BSD |
||||
| 6 | */ |
||||
| 7 | namespace ZBateson\MbWrapper; |
||||
| 8 | |||||
| 9 | /** |
||||
| 10 | * Helper class for converting strings between charsets, finding a multibyte |
||||
| 11 | * string's length, and creating a substring. |
||||
| 12 | * |
||||
| 13 | * MbWrapper prefers PHP's mb_* extension first, and reverts to iconv_* if the |
||||
| 14 | * charsets aren't listed as supported by mb_list_encodings(). |
||||
| 15 | * |
||||
| 16 | * A list of aliased charsets is maintained to support the greatest number of |
||||
| 17 | * charsets. In addition, when searching for a charset, separator characters |
||||
| 18 | * such as dashes are removed, and searches are always performed |
||||
| 19 | * case-insensitively. This is to support strange reported encodings in emails, |
||||
| 20 | * etc... |
||||
| 21 | * |
||||
| 22 | * @author Zaahid Bateson |
||||
| 23 | */ |
||||
| 24 | class MbWrapper |
||||
| 25 | { |
||||
/**
 * @var array<string, string> aliased charsets supported by mb_convert_encoding.
 *      Maps a normalized alias to the charset name passed to the mb_*
 *      functions.
 *
 *      Keys MUST be written in normalized form: uppercase, with every
 *      non-alphanumeric character removed (so CP367 is equal to CP-367).
 *      Lookups are performed with the normalized name only, so a key
 *      containing lowercase letters or separators can never match.
 *
 *      Some of these translations are already supported by
 *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
 *      other implementations or versions since they're not part of
 *      documented support.
 */
public static $mbAliases = [
    // supported but not included in mb_list_encodings for some reason...
    'CP850' => 'CP850',
    'GB2312' => 'GB18030',
    'SJIS2004' => 'SJIS-2004',
    // aliases
    'ANSIX341968' => 'ASCII',
    'ANSIX341986' => 'ASCII',
    'ARABIC' => 'ISO-8859-6',
    'ASMO708' => 'ISO-8859-6',
    'BIG5' => 'BIG-5',
    'BIG5TW' => 'BIG-5',
    'CESU8' => 'UTF-8',
    'CHINESE' => 'GB18030',
    'CP367' => 'ASCII',
    'CP819' => 'ISO-8859-1',
    'CP1251' => 'WINDOWS-1251',
    'CP1252' => 'WINDOWS-1252',
    'CP1254' => 'WINDOWS-1254',
    'CP1255' => 'ISO-8859-8',
    'CSASCII' => 'ASCII',
    'CSBIG5' => 'BIG-5',
    'CSIBM866' => 'CP866',
    'CSISO2022JP' => 'ISO-2022-JP',
    'CSISO2022KR' => 'ISO-2022-KR',
    'CSISO58GB231280' => 'GB18030',
    'CSISOLATIN1' => 'ISO-8859-1',
    'CSISOLATIN2' => 'ISO-8859-2',
    'CSISOLATIN3' => 'ISO-8859-3',
    'CSISOLATIN4' => 'ISO-8859-4',
    'CSISOLATIN5' => 'ISO-8859-9',
    'CSISOLATIN6' => 'ISO-8859-10',
    'CSISOLATINARABIC' => 'ISO-8859-6',
    'CSISOLATINCYRILLIC' => 'ISO-8859-5',
    'CSISOLATINGREEK' => 'ISO-8859-7',
    'CSISOLATINHEBREW' => 'ISO-8859-8',
    'CSKOI8R' => 'KOI8-R',
    'CSPC850MULTILINGUAL' => 'CP850',
    'CSSHIFTJIS' => 'SJIS',
    'CYRILLIC' => 'ISO-8859-5',
    'ECMA114' => 'ISO-8859-6',
    'ECMA118' => 'ISO-8859-7',
    'ELOT928' => 'ISO-8859-7',
    'EUCCN' => 'GB18030',
    'EUCGB2312CN' => 'GB18030',
    'GB180302000' => 'GB18030',
    'GB23121980' => 'GB18030',
    'GB231280' => 'GB18030',
    'GBK' => 'CP936',
    'GREEK8' => 'ISO-8859-7',
    'GREEK' => 'ISO-8859-7',
    'HEBREW' => 'ISO-8859-8',
    'HZGB2312' => 'HZ',
    'HZGB' => 'HZ',
    'IBM367' => 'ASCII',
    'IBM819' => 'ISO-8859-1',
    'IBM850' => 'CP850',
    'IBM866' => 'CP866',
    'ISO2022JP2004' => 'ISO-2022-JP-2004',
    'ISO646IRV1991' => 'ASCII',
    'ISO646US' => 'ASCII',
    'ISO8859' => 'ISO-8859-1',
    'ISO8859101992' => 'ISO-8859-10',
    'ISO885911987' => 'ISO-8859-1',
    'ISO8859141998' => 'ISO-8859-14',
    'ISO8859162001' => 'ISO-8859-16',
    'ISO885921987' => 'ISO-8859-2',
    'ISO885931988' => 'ISO-8859-3',
    'ISO885941988' => 'ISO-8859-4',
    'ISO885951988' => 'ISO-8859-5',
    'ISO885961987' => 'ISO-8859-6',
    'ISO885971987' => 'ISO-8859-7',
    'ISO885981988' => 'ISO-8859-8',
    'ISO88598I' => 'ISO-8859-8',
    'ISO885991989' => 'ISO-8859-9',
    'ISOCELTIC' => 'ISO-8859-14',
    'ISOIR100' => 'ISO-8859-1',
    'ISOIR101' => 'ISO-8859-2',
    'ISOIR109' => 'ISO-8859-3',
    'ISOIR110' => 'ISO-8859-4',
    'ISOIR126' => 'ISO-8859-7',
    'ISOIR127' => 'ISO-8859-6',
    'ISOIR138' => 'ISO-8859-8',
    'ISOIR144' => 'ISO-8859-5',
    'ISOIR148' => 'ISO-8859-9',
    'ISOIR157' => 'ISO-8859-10',
    'ISOIR199' => 'ISO-8859-14',
    'ISOIR226' => 'ISO-8859-16',
    'ISOIR58' => 'GB18030',
    'ISOIR6' => 'ASCII',
    'KOI8R' => 'KOI8-R',
    'KOREAN' => 'EUC-KR',
    'KSC56011987' => 'EUC-KR',
    'KSC5601' => 'EUC-KR',
    'KSX1001' => 'EUC-KR',
    'L1' => 'ISO-8859-1',
    'L2' => 'ISO-8859-2',
    'L3' => 'ISO-8859-3',
    'L4' => 'ISO-8859-4',
    'L5' => 'ISO-8859-9',
    'L6' => 'ISO-8859-10',
    'L8' => 'ISO-8859-14',
    'L10' => 'ISO-8859-16',
    'LATIN' => 'ISO-8859-1',
    'LATIN1' => 'ISO-8859-1',
    'LATIN2' => 'ISO-8859-2',
    'LATIN3' => 'ISO-8859-3',
    'LATIN4' => 'ISO-8859-4',
    'LATIN5' => 'ISO-8859-9',
    'LATIN6' => 'ISO-8859-10',
    'LATIN8' => 'ISO-8859-14',
    'LATIN10' => 'ISO-8859-16',
    'MS932' => 'CP932',
    // was keyed as lowercase 'ms936', which a normalized (uppercased)
    // lookup could never reach
    'MS936' => 'CP936',
    'MS950' => 'CP950',
    'MSKANJI' => 'CP932',
    'SHIFTJIS2004' => 'SJIS',
    'SHIFTJIS' => 'SJIS',
    'UJIS' => 'EUC-JP',
    'UNICODE11UTF7' => 'UTF-7',
    'US' => 'ASCII',
    'USASCII' => 'ASCII',
    'WE8MSWIN1252' => 'WINDOWS-1252',
    'WINDOWS1251' => 'WINDOWS-1251',
    'WINDOWS1252' => 'WINDOWS-1252',
    'WINDOWS1254' => 'WINDOWS-1254',
    'WINDOWS1255' => 'ISO-8859-8',
    // purely numeric charset names
    '0' => 'WINDOWS-1252',
    '128' => 'SJIS',
    '129' => 'EUC-KR',
    '134' => 'GB18030',
    '136' => 'BIG-5',
    '161' => 'WINDOWS-1253',
    '162' => 'WINDOWS-1254',
    '177' => 'WINDOWS-1255',
    '178' => 'WINDOWS-1256',
    '186' => 'WINDOWS-1257',
    '204' => 'WINDOWS-1251',
    '222' => 'WINDOWS-874',
    '238' => 'WINDOWS-1250',
    '646' => 'ASCII',
    '850' => 'CP850',
    '866' => 'CP866',
    '932' => 'CP932',
    '936' => 'CP936',
    '950' => 'CP950',
    '1251' => 'WINDOWS-1251',
    '1252' => 'WINDOWS-1252',
    '1254' => 'WINDOWS-1254',
    '1255' => 'ISO-8859-8',
    '8859' => 'ISO-8859-1',
];
||||
| 187 | |||||
| 188 | /** |
||||
| 189 | * @var array<string, string> aliased charsets supported by iconv. |
||||
| 190 | */ |
||||
| 191 | public static $iconvAliases = [ |
||||
| 192 | // iconv aliases -- a lot of these may already be supported |
||||
| 193 | 'CESU8' => 'UTF8', |
||||
| 194 | 'CP154' => 'PT154', |
||||
| 195 | 'CPGR' => 'CP869', |
||||
| 196 | 'CPIS' => 'CP861', |
||||
| 197 | 'CSHPROMAN8' => 'ROMAN8', |
||||
| 198 | 'CSIBM037' => 'CP037', |
||||
| 199 | 'CSIBM1026' => 'CP1026', |
||||
| 200 | 'CSIBM424' => 'CP424', |
||||
| 201 | 'CSIBM500' => 'CP500', |
||||
| 202 | 'CSIBM860' => 'CP860', |
||||
| 203 | 'CSIBM861' => 'CP861', |
||||
| 204 | 'CSIBM863' => 'CP863', |
||||
| 205 | 'CSIBM864' => 'CP864', |
||||
| 206 | 'CSIBM865' => 'CP865', |
||||
| 207 | 'CSIBM869' => 'CP869', |
||||
| 208 | 'CSPC775BALTIC' => 'CP775', |
||||
| 209 | 'CSPC862LATINHEBREW' => 'CP862', |
||||
| 210 | 'CSPC8CODEPAGE437' => 'CP437', |
||||
| 211 | 'CSPTCP154' => 'PT154', |
||||
| 212 | 'CYRILLICASIAN' => 'PT154', |
||||
| 213 | 'EBCDICCPBE' => 'CP500', |
||||
| 214 | 'EBCDICCPCA' => 'CP037', |
||||
| 215 | 'EBCDICCPCH' => 'CP500', |
||||
| 216 | 'EBCDICCPHE' => 'CP424', |
||||
| 217 | 'EBCDICCPNL' => 'CP037', |
||||
| 218 | 'EBCDICCPUS' => 'CP037', |
||||
| 219 | 'EBCDICCPWT' => 'CP037', |
||||
| 220 | 'HKSCS' => 'BIG5HKSCS', |
||||
| 221 | 'HPROMAN8' => 'ROMAN8', |
||||
| 222 | 'IBM037' => 'CP037', |
||||
| 223 | 'IBM039' => 'CP037', |
||||
| 224 | 'IBM424' => 'CP424', |
||||
| 225 | 'IBM437' => 'CP437', |
||||
| 226 | 'IBM500' => 'CP500', |
||||
| 227 | 'IBM775' => 'CP775', |
||||
| 228 | 'IBM860' => 'CP860', |
||||
| 229 | 'IBM861' => 'CP861', |
||||
| 230 | 'IBM862' => 'CP862', |
||||
| 231 | 'IBM863' => 'CP863', |
||||
| 232 | 'IBM864' => 'CP864', |
||||
| 233 | 'IBM865' => 'CP865', |
||||
| 234 | 'IBM869' => 'CP869', |
||||
| 235 | 'IBM1026' => 'CP1026', |
||||
| 236 | 'IBM1140' => 'CP1140', |
||||
| 237 | 'ISO2022JP2' => 'ISO2022JP2', |
||||
| 238 | 'ISO8859112001' => 'ISO885911', |
||||
| 239 | 'ISO885911' => 'ISO885911', |
||||
| 240 | 'ISOIR166' => 'TIS620', |
||||
| 241 | 'JOHAB' => 'CP1361', |
||||
| 242 | 'MACCYRILLIC' => 'MACCYRILLIC', |
||||
| 243 | 'MS1361' => 'CP1361', |
||||
| 244 | 'MS949' => 'CP949', |
||||
| 245 | 'PTCP154' => 'PT154', |
||||
| 246 | 'R8' => 'ROMAN8', |
||||
| 247 | 'ROMAN8' => 'ROMAN8', |
||||
| 248 | 'THAI' => 'ISO885911', |
||||
| 249 | 'TIS6200' => 'TIS620', |
||||
| 250 | 'TIS62025290' => 'TIS620', |
||||
| 251 | 'TIS62025291' => 'TIS620', |
||||
| 252 | 'TIS620' => 'TIS620', |
||||
| 253 | 'UHC' => 'CP949', |
||||
| 254 | 'WINDOWS1250' => 'CP1250', |
||||
| 255 | 'WINDOWS1253' => 'CP1253', |
||||
| 256 | 'WINDOWS1256' => 'CP1256', |
||||
| 257 | 'WINDOWS1257' => 'CP1257', |
||||
| 258 | 'WINDOWS1258' => 'CP1258', |
||||
| 259 | '037' => 'CP037', |
||||
| 260 | '424' => 'CP424', |
||||
| 261 | '437' => 'CP437', |
||||
| 262 | '500' => 'CP500', |
||||
| 263 | '775' => 'CP775', |
||||
| 264 | '860' => 'CP860', |
||||
| 265 | '861' => 'CP861', |
||||
| 266 | '862' => 'CP862', |
||||
| 267 | '863' => 'CP863', |
||||
| 268 | '864' => 'CP864', |
||||
| 269 | '865' => 'CP865', |
||||
| 270 | '869' => 'CP869', |
||||
| 271 | '949' => 'CP949', |
||||
| 272 | '1026' => 'CP1026', |
||||
| 273 | '1140' => 'CP1140', |
||||
| 274 | '1250' => 'CP1250', |
||||
| 275 | '1253' => 'CP1253', |
||||
| 276 | '1256' => 'CP1256', |
||||
| 277 | '1257' => 'CP1257', |
||||
| 278 | '1258' => 'CP1258', |
||||
| 279 | ]; |
||||
| 280 | |||||
| 281 | /** |
||||
| 282 | * @var string[] cached lookups for quicker retrieval |
||||
| 283 | */ |
||||
| 284 | protected $mappedMbCharsets = [ |
||||
| 285 | 'UTF8' => 'UTF-8', |
||||
| 286 | 'USASCII' => 'US-ASCII', |
||||
| 287 | 'ISO88591' => 'ISO-8859-1', |
||||
| 288 | ]; |
||||
| 289 | |||||
| 290 | /** |
||||
| 291 | * @var string[] An array of encodings supported by the mb_* extension, as |
||||
| 292 | * returned by mb_list_encodings(), with the key set to the charset's |
||||
| 293 | * name after being uppercased and stripped of non-alphanumeric characters. |
||||
| 294 | */ |
||||
| 295 | private static $mbListedEncodings; |
||||
| 296 | |||||
/**
 * Populates the shared, static map of mb_* encodings the first time any
 * instance is created.
 *
 * The map is keyed by each encoding's normalized name (see
 * getNormalizedCharset) and holds the name exactly as reported by
 * mb_list_encodings(). Later instances reuse the already-built map.
 */
public function __construct()
{
    if (self::$mbListedEncodings !== null) {
        // already built by an earlier instance
        return;
    }
    $encodings = \mb_list_encodings();
    self::$mbListedEncodings = \array_combine(
        $this->getNormalizedCharset($encodings),
        $encodings
    );
}
||||
| 308 | |||||
/**
 * Normalizes a charset name (or every name in an array of them) so that
 * differently written spellings compare equal.
 *
 * Each name is uppercased, then every character other than A-Z and 0-9 is
 * removed, e.g. "iso-8859-1" becomes "ISO88591".
 *
 * @param string|string[] $charset a single name or a list of names
 * @return string|string[] the normalized name(s), in the same shape as
 *         the input
 */
private function getNormalizedCharset($charset)
{
    $uppercased = \is_array($charset)
        ? \array_map('strtoupper', $charset)
        : \strtoupper($charset);

    // preg_replace accepts a string or an array subject, so both input
    // shapes go through the same call
    return \preg_replace('/[^A-Z0-9]+/', '', $uppercased);
}
||||
| 326 | |||||
/**
 * Converts $str from $fromCharset to $toCharset using iconv, with the
 * "//TRANSLIT//IGNORE" suffix appended to the target charset.
 *
 * Warnings raised by iconv are suppressed; a false return is reported as
 * an exception instead.
 *
 * @throws UnsupportedCharsetException if iconv returns false
 */
private function iconv(string $fromCharset, string $toCharset, string $str) : string
{
    $converted = @\iconv($fromCharset, $toCharset . '//TRANSLIT//IGNORE', $str);
    if ($converted !== false) {
        return $converted;
    }
    throw new UnsupportedCharsetException("Unable to convert from charsets: $fromCharset to $toCharset");
}
||||
| 335 | |||||
/**
 * Returns the number of characters in $str as counted by iconv_strlen for
 * the given charset (with "//TRANSLIT//IGNORE" appended).
 *
 * Warnings raised by iconv_strlen are suppressed; a false return is
 * reported as an exception instead.
 *
 * @throws UnsupportedCharsetException if iconv_strlen returns false
 */
private function iconvStrlen(string $str, string $charset) : int
{
    $count = @\iconv_strlen($str, $charset . '//TRANSLIT//IGNORE');
    if ($count !== false) {
        return $count;
    }
    throw new UnsupportedCharsetException("Charset $charset is not supported");
}
||||
| 344 | |||||
/**
 * Returns a substring of $str using iconv_substr for the given charset
 * (with "//TRANSLIT//IGNORE" appended).
 *
 * When iconv_substr returns false, the string's length is measured to
 * tell the two failure causes apart: a $start beyond the end of the string
 * yields an empty string, anything else is treated as an unsupported
 * charset.
 *
 * @throws UnsupportedCharsetException if the charset is not supported
 *         (thrown either by the length check or directly)
 */
private function iconvSubstr(string $str, string $charset, int $start, ?int $length = null) : string
{
    $piece = @\iconv_substr($str, $start, $length, $charset . '//TRANSLIT//IGNORE');
    if ($piece !== false) {
        return $piece;
    }

    // iconvStrlen throws by itself when the charset is the problem
    if ($start > $this->iconvStrlen($str, $charset)) {
        // returns empty to keep in line with mb_substr functionality
        return '';
    }
    throw new UnsupportedCharsetException("Charset $charset is not supported");
}
||||
| 358 | |||||
/**
 * Converts the passed string's charset from the passed $fromCharset to the
 * passed $toCharset.
 *
 * mb_convert_encoding is used whenever both charsets resolve to an mb_*
 * encoding. When only one side does, the conversion is split in two with
 * UTF-8 as the intermediary: the mb-supported side is handled by
 * mb_convert_encoding and the other side by iconv. When neither side is an
 * mb_* encoding, iconv performs the whole conversion.
 *
 * An empty $str is returned unchanged without attempting any conversion.
 *
 * @throws UnsupportedCharsetException if iconv fails
 */
public function convert(string $str, string $fromCharset, string $toCharset) : string
{
    // there may be some mb-supported encodings not supported by iconv (on my libiconv for instance
    // HZ isn't supported), so a charset known to mb_* is never handed to iconv; the UTF-8
    // intermediary lets each extension deal only with the charset it supports.
    $mbFrom = $this->getMbCharset($fromCharset);
    $mbTo = $this->getMbCharset($toCharset);

    if ($str === '') {
        return $str;
    }

    if ($mbFrom !== false && $mbTo !== false) {
        return \mb_convert_encoding($str, $mbTo, $mbFrom);
    }

    if ($mbFrom !== false) {
        // source via mb_*, destination via iconv
        $utf8 = \mb_convert_encoding($str, 'UTF-8', $mbFrom);
        return $this->iconv('UTF-8', $this->getIconvAlias($toCharset), $utf8);
    }

    if ($mbTo !== false) {
        // source via iconv, destination via mb_*
        $utf8 = $this->iconv($this->getIconvAlias($fromCharset), 'UTF-8', $str);
        return \mb_convert_encoding($utf8, $mbTo, 'UTF-8');
    }

    return $this->iconv(
        $this->getIconvAlias($fromCharset),
        $this->getIconvAlias($toCharset),
        $str
    );
}
||||
| 397 | |||||
/**
 * Reports whether the passed string is valid in the $charset encoding.
 *
 * mb_check_encoding is used when $charset resolves to an mb_* encoding.
 * Otherwise the string is converted with iconv from the charset to itself,
 * and the result is true unless iconv returns false.
 */
public function checkEncoding(string $str, string $charset) : bool
{
    $mbCharset = $this->getMbCharset($charset);
    if ($mbCharset === false) {
        $alias = $this->getIconvAlias($charset);
        // NOTE(review): the "//IGNORE" flag may let iconv drop invalid
        // sequences instead of failing -- confirm this check rejects
        // invalid input on the targeted iconv implementations.
        return (@\iconv($alias, $alias . '//TRANSLIT//IGNORE', $str) !== false);
    }
    return \mb_check_encoding($str, $mbCharset);
}
||||
| 413 | |||||
/**
 * Returns the number of characters in the passed $str for the given
 * $charset.
 *
 * mb_strlen is used when $charset resolves to an mb_* encoding; otherwise
 * the count comes from iconv_strlen using the charset's iconv alias.
 *
 * @throws UnsupportedCharsetException if iconv fails
 */
public function getLength(string $str, string $charset) : int
{
    $mbCharset = $this->getMbCharset($charset);
    if ($mbCharset === false) {
        return $this->iconvStrlen($str, $this->getIconvAlias($charset));
    }
    return \mb_strlen($str, $mbCharset);
}
||||
| 428 | |||||
/**
 * Creates and returns a substring of the passed $str, using mb_substr when
 * $charset resolves to an mb_* encoding and iconv_substr otherwise.
 *
 * If the offset provided in $start is greater than the length of the
 * string, an empty string is returned.
 *
 * CP1258 is special-cased on the iconv path: the string is converted to
 * UTF-8, cut there, and the result converted back.
 *
 * @throws UnsupportedCharsetException if iconv fails
 */
public function getSubstr(string $str, string $charset, int $start, ?int $length = null) : string
{
    $mbCharset = $this->getMbCharset($charset);
    if ($mbCharset !== false) {
        return \mb_substr($str, $start, $length, $mbCharset);
    }

    $alias = $this->getIconvAlias($charset);
    if ($alias !== 'CP1258') {
        return $this->iconvSubstr($str, $alias, $start, $length);
    }

    // iconv_substr fails with CP1258 for some reason, and returns only
    // a subset of characters (e.g. the first 5, instead of $length)
    $utf8 = $this->convert($str, $alias, 'UTF-8');
    $utf8Piece = $this->getSubstr($utf8, 'UTF-8', $start, $length);
    return $this->convert($utf8Piece, 'UTF-8', $alias);
}
||||
| 453 | |||||
/**
 * Resolves a charset name to the name accepted by the mb_* functions.
 *
 * The name is normalized, then looked up first among the encodings
 * reported by mb_list_encodings() and then among self::$mbAliases.
 *
 * @return string|bool the charset name as accepted by mb_*, or false if
 *         the charset is neither listed nor aliased
 */
private function getMbCharset(string $cs)
{
    $normalized = $this->getNormalizedCharset($cs);

    // every stored value is a non-null string, so ?? is equivalent to
    // checking for the key's existence; listed encodings take precedence
    return self::$mbListedEncodings[$normalized]
        ?? self::$mbAliases[$normalized]
        ?? false;
}
||||
| 474 | |||||
/**
 * Looks up the passed charset in self::$iconvAliases, returning the mapped
 * charset if applicable. Otherwise returns the charset as it was passed in
 * (not its normalized form).
 *
 * The alias table is read through self:: for both the existence check and
 * the value, so the two can never refer to different arrays (previously the
 * check used self:: while the read used static::, which would disagree if a
 * subclass redeclared the property). This also matches getMbCharset.
 *
 * @return string the mapped charset (if mapped) or $cs otherwise
 */
private function getIconvAlias(string $cs) : string
{
    $normalized = $this->getNormalizedCharset($cs);
    if (\array_key_exists($normalized, self::$iconvAliases)) {
        return self::$iconvAliases[$normalized];
    }
    return $cs;
}
||||
| 489 | } |
||||
| 490 |