Issues (2)

src/MbWrapper.php (2 issues)

1
<?php
2
/**
3
 * This file is part of the ZBateson\MbWrapper project.
4
 *
5
 * @license http://opensource.org/licenses/bsd-license.php BSD
6
 */
7
namespace ZBateson\MbWrapper;
8
9
/**
 * Helper class for converting strings between charsets, finding a multibyte
 * string's length, and creating a substring.
 *
 * MbWrapper prefers PHP's mb_* extension first, and reverts to iconv_* if the
 * charsets aren't listed as supported by mb_list_encodings().
 *
 * A list of aliased charsets is maintained to support the greatest number of
 * charsets.  In addition, when searching for a charset, separator characters
 * such as dashes are removed, and searches are always performed
 * case-insensitively.  This is to support strange reported encodings in emails,
 * etc...
 *
 * @author Zaahid Bateson
 */
class MbWrapper
{
    /**
     * @var array<string, string> aliased charsets supported by mb_convert_encoding.
     *      Keys are in normalized form: uppercased and stripped of any
     *      non-alphanumeric characters (so CP367 is equal to CP-367), because
     *      lookups are always made with a normalized charset name.
     *      Some of these translations are already supported by
     *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
     *      other implementations or versions since they're not part of
     *      documented support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB18030',
        'SJIS2004' => 'SJIS-2004',
        // aliases
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CESU8' => 'UTF-8',
        'CHINESE' => 'GB18030',
        'CP367' => 'ASCII',
        'CP819' => 'ISO-8859-1',
        'CP1251' => 'WINDOWS-1251',
        'CP1252' => 'WINDOWS-1252',
        'CP1254' => 'WINDOWS-1254',
        'CP1255' => 'ISO-8859-8',
        'CSASCII' => 'ASCII',
        'CSBIG5' => 'BIG-5',
        'CSIBM866' => 'CP866',
        'CSISO2022JP' => 'ISO-2022-JP',
        'CSISO2022KR' => 'ISO-2022-KR',
        'CSISO58GB231280' => 'GB18030',
        'CSISOLATIN1' => 'ISO-8859-1',
        'CSISOLATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'CSISOLATIN5' => 'ISO-8859-9',
        'CSISOLATIN6' => 'ISO-8859-10',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'CSKOI8R' => 'KOI8-R',
        'CSPC850MULTILINGUAL' => 'CP850',
        'CSSHIFTJIS' => 'SJIS',
        'CYRILLIC' => 'ISO-8859-5',
        'ECMA114' => 'ISO-8859-6',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'EUCCN' => 'GB18030',
        'EUCGB2312CN' => 'GB18030',
        'GB180302000' => 'GB18030',
        'GB23121980' => 'GB18030',
        'GB231280' => 'GB18030',
        'GBK' => 'CP936',
        'GREEK8' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'HEBREW' => 'ISO-8859-8',
        'HZGB2312' => 'HZ',
        'HZGB' => 'HZ',
        'IBM367' => 'ASCII',
        'IBM819' => 'ISO-8859-1',
        'IBM850' => 'CP850',
        'IBM866' => 'CP866',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'ISO646IRV1991' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO8859' => 'ISO-8859-1',
        'ISO8859101992' => 'ISO-8859-10',
        'ISO885911987' => 'ISO-8859-1',
        'ISO8859141998' => 'ISO-8859-14',
        'ISO8859162001' => 'ISO-8859-16',
        'ISO885921987' => 'ISO-8859-2',
        'ISO885931988' => 'ISO-8859-3',
        'ISO885941988' => 'ISO-8859-4',
        'ISO885951988' => 'ISO-8859-5',
        'ISO885961987' => 'ISO-8859-6',
        'ISO885971987' => 'ISO-8859-7',
        'ISO885981988' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'ISO885991989' => 'ISO-8859-9',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR100' => 'ISO-8859-1',
        'ISOIR101' => 'ISO-8859-2',
        'ISOIR109' => 'ISO-8859-3',
        'ISOIR110' => 'ISO-8859-4',
        'ISOIR126' => 'ISO-8859-7',
        'ISOIR127' => 'ISO-8859-6',
        'ISOIR138' => 'ISO-8859-8',
        'ISOIR144' => 'ISO-8859-5',
        'ISOIR148' => 'ISO-8859-9',
        'ISOIR157' => 'ISO-8859-10',
        'ISOIR199' => 'ISO-8859-14',
        'ISOIR226' => 'ISO-8859-16',
        'ISOIR58' => 'GB18030',
        'ISOIR6' => 'ASCII',
        'KOI8R' => 'KOI8-R',
        'KOREAN' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'L1' => 'ISO-8859-1',
        'L2' => 'ISO-8859-2',
        'L3' => 'ISO-8859-3',
        'L4' => 'ISO-8859-4',
        'L5' => 'ISO-8859-9',
        'L6' => 'ISO-8859-10',
        'L8' => 'ISO-8859-14',
        'L10' => 'ISO-8859-16',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'LATIN2' => 'ISO-8859-2',
        'LATIN3' => 'ISO-8859-3',
        'LATIN4' => 'ISO-8859-4',
        'LATIN5' => 'ISO-8859-9',
        'LATIN6' => 'ISO-8859-10',
        'LATIN8' => 'ISO-8859-14',
        'LATIN10' => 'ISO-8859-16',
        'MS932' => 'CP932',
        // key must be uppercase: lookups use the normalized (uppercased) name
        'MS936' => 'CP936',
        'MS950' => 'CP950',
        'MSKANJI' => 'CP932',
        'SHIFTJIS2004' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'UJIS' => 'EUC-JP',
        'UNICODE11UTF7' => 'UTF-7',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'WE8MSWIN1252' => 'WINDOWS-1252',
        'WINDOWS1251' => 'WINDOWS-1251',
        'WINDOWS1252' => 'WINDOWS-1252',
        'WINDOWS1254' => 'WINDOWS-1254',
        'WINDOWS1255' => 'ISO-8859-8',
        '0' => 'WINDOWS-1252',
        '128' => 'SJIS',
        '129' => 'EUC-KR',
        '134' => 'GB18030',
        '136' => 'BIG-5',
        '161' => 'WINDOWS-1253',
        '162' => 'WINDOWS-1254',
        '177' => 'WINDOWS-1255',
        '178' => 'WINDOWS-1256',
        '186' => 'WINDOWS-1257',
        '204' => 'WINDOWS-1251',
        '222' => 'WINDOWS-874',
        '238' => 'WINDOWS-1250',
        '646' => 'ASCII',
        '850' => 'CP850',
        '866' => 'CP866',
        '932' => 'CP932',
        '936' => 'CP936',
        '950' => 'CP950',
        '1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        '1255' => 'ISO-8859-8',
        '8859' => 'ISO-8859-1',
    ];

    /**
     * @var array<string, string> aliased charsets supported by iconv.  Keys
     *      are in the same normalized form as the keys of $mbAliases.
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'CESU8' => 'UTF8',
        'CP154' => 'PT154',
        'CPGR' => 'CP869',
        'CPIS' => 'CP861',
        'CSHPROMAN8' => 'ROMAN8',
        'CSIBM037' => 'CP037',
        'CSIBM1026' => 'CP1026',
        'CSIBM424' => 'CP424',
        'CSIBM500' => 'CP500',
        'CSIBM860' => 'CP860',
        'CSIBM861' => 'CP861',
        'CSIBM863' => 'CP863',
        'CSIBM864' => 'CP864',
        'CSIBM865' => 'CP865',
        'CSIBM869' => 'CP869',
        'CSPC775BALTIC' => 'CP775',
        'CSPC862LATINHEBREW' => 'CP862',
        'CSPC8CODEPAGE437' => 'CP437',
        'CSPTCP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPCH' => 'CP500',
        'EBCDICCPHE' => 'CP424',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'HKSCS' => 'BIG5HKSCS',
        'HPROMAN8' => 'ROMAN8',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        'IBM424' => 'CP424',
        'IBM437' => 'CP437',
        'IBM500' => 'CP500',
        'IBM775' => 'CP775',
        'IBM860' => 'CP860',
        'IBM861' => 'CP861',
        'IBM862' => 'CP862',
        'IBM863' => 'CP863',
        'IBM864' => 'CP864',
        'IBM865' => 'CP865',
        'IBM869' => 'CP869',
        'IBM1026' => 'CP1026',
        'IBM1140' => 'CP1140',
        'ISO2022JP2' => 'ISO2022JP2',
        'ISO8859112001' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISOIR166' => 'TIS620',
        'JOHAB' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'MS1361' => 'CP1361',
        'MS949' => 'CP949',
        'PTCP154' => 'PT154',
        'R8' => 'ROMAN8',
        'ROMAN8' => 'ROMAN8',
        'THAI' => 'ISO885911',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'TIS620' => 'TIS620',
        'UHC' => 'CP949',
        'WINDOWS1250' => 'CP1250',
        'WINDOWS1253' => 'CP1253',
        'WINDOWS1256' => 'CP1256',
        'WINDOWS1257' => 'CP1257',
        'WINDOWS1258' => 'CP1258',
        '037' => 'CP037',
        '424' => 'CP424',
        '437' => 'CP437',
        '500' => 'CP500',
        '775' => 'CP775',
        '860' => 'CP860',
        '861' => 'CP861',
        '862' => 'CP862',
        '863' => 'CP863',
        '864' => 'CP864',
        '865' => 'CP865',
        '869' => 'CP869',
        '949' => 'CP949',
        '1026' => 'CP1026',
        '1140' => 'CP1140',
        '1250' => 'CP1250',
        '1253' => 'CP1253',
        '1256' => 'CP1256',
        '1257' => 'CP1257',
        '1258' => 'CP1258',
    ];

    /**
     * @var string[] cached lookups for quicker retrieval, keyed by normalized
     *      charset name.
     *      NOTE(review): no method in this class reads this property; it is
     *      kept because it is protected and a subclass may rely on it.
     */
    protected $mappedMbCharsets = [
        'UTF8' => 'UTF-8',
        'USASCII' => 'US-ASCII',
        'ISO88591' => 'ISO-8859-1',
    ];

    /**
     * @var string[]|null An array of encodings supported by the mb_* extension,
     *      as returned by mb_list_encodings(), with the key set to the
     *      charset's name after normalization (uppercased, non-alphanumeric
     *      characters removed).  Null until the first instance is constructed.
     */
    private static $mbListedEncodings;

    /**
     * Initializes the static mb_* encoding array.
     *
     * The array is shared by all instances and is only built by the first
     * instance constructed (guarded by the null check).
     */
    public function __construct()
    {
        if (self::$mbListedEncodings === null) {
            $cs = \mb_list_encodings();
            $keys = $this->getNormalizedCharset($cs);
            self::$mbListedEncodings = \array_combine($keys, $cs);
        }
    }

    /**
     * The passed charset is uppercased, and stripped of non-alphanumeric
     * characters before being returned.
     *
     * An array of charsets may be passed, in which case each element is
     * normalized and an array is returned.
     *
     * @param string|string[] $charset
     * @return string|string[]
     */
    private function getNormalizedCharset($charset)
    {
        $upper = null;
        if (\is_array($charset)) {
            $upper = \array_map('strtoupper', $charset);
        } else {
            $upper = \strtoupper($charset);
        }
        return \preg_replace('/[^A-Z0-9]+/', '', $upper);
    }

    /**
     * Converts $str from $fromCharset to $toCharset with iconv, requesting
     * transliteration and skipping of unconvertible characters.
     *
     * @throws UnsupportedCharsetException if iconv returns false
     */
    private function iconv(string $fromCharset, string $toCharset, string $str) : string
    {
        // warnings are suppressed; failure is reported through the exception
        $ret = @\iconv($fromCharset, $toCharset . '//TRANSLIT//IGNORE', $str);
        if ($ret === false) {
            throw new UnsupportedCharsetException("Unable to convert from charsets: $fromCharset to $toCharset");
        }
        return $ret;
    }

    /**
     * Returns the number of characters in $str using iconv_strlen.
     *
     * @throws UnsupportedCharsetException if iconv_strlen returns false
     */
    private function iconvStrlen(string $str, string $charset) : int
    {
        $ret = @\iconv_strlen($str, $charset . '//TRANSLIT//IGNORE');
        if ($ret === false) {
            throw new UnsupportedCharsetException("Charset $charset is not supported");
        }
        return $ret;
    }

    /**
     * Returns a substring of $str using iconv_substr.
     *
     * If iconv_substr fails and $start lies beyond the end of the string, an
     * empty string is returned instead of raising an error.
     *
     * @throws UnsupportedCharsetException if iconv_substr returns false for any
     *         other reason, or if the length of $str can't be determined
     */
    private function iconvSubstr(string $str, string $charset, int $start, ?int $length = null) : string
    {
        $ret = @\iconv_substr($str, $start, $length, $charset . '//TRANSLIT//IGNORE');
        if ($ret === false) {
            $strLength = $this->iconvStrlen($str, $charset);
            if ($start > $strLength) {
                // returns empty to keep in line with mb_substr functionality
                return '';
            }
            throw new UnsupportedCharsetException("Charset $charset is not supported");
        }
        return $ret;
    }

    /**
     * Converts the passed string's charset from the passed $fromCharset to the
     * passed $toCharset
     *
     * The function attempts to use mb_convert_encoding if possible, and falls
     * back to iconv if not.  When only one of the two charsets is known to
     * mb_*, the conversion is done in two steps with UTF-8 as an intermediary.
     *
     * An empty $str is returned as-is without any charset lookup being
     * enforced.
     *
     * @throws UnsupportedCharsetException if iconv fails
     */
    public function convert(string $str, string $fromCharset, string $toCharset) : string
    {
        // there may be some mb-supported encodings not supported by iconv (on my libiconv for instance
        // HZ isn't supported), and so it may happen that failing an mb_convert_encoding, an iconv
        // may also fail even though both support an encoding separately.
        // For cases like that, a two-way encoding is done with UTF-8 as an intermediary.

        $from = $this->getMbCharset($fromCharset);
        $to = $this->getMbCharset($toCharset);

        if ($str !== '') {
            if ($from !== false && $to === false) {
                // mb_* understands the source only: mb to UTF-8, then iconv to the target
                $str = \mb_convert_encoding($str, 'UTF-8', $from);
                return $this->iconv('UTF-8', $this->getIconvAlias($toCharset), $str);
            } elseif ($from === false && $to !== false) {
                // mb_* understands the target only: iconv to UTF-8, then mb to the target
                $str = $this->iconv($this->getIconvAlias($fromCharset), 'UTF-8', $str);
                return \mb_convert_encoding($str, $to, 'UTF-8');
            } elseif ($from !== false && $to !== false) {
                return \mb_convert_encoding($str, $to, $from);
            }
            // neither charset is known to mb_*: iconv handles both ends
            return $this->iconv(
                $this->getIconvAlias($fromCharset),
                $this->getIconvAlias($toCharset),
                $str
            );
        }
        return $str;
    }

    /**
     * Returns true if the passed string is valid in the $charset encoding.
     *
     * Either uses mb_check_encoding, or iconv if it's not a supported mb
     * encoding.
     */
    public function checkEncoding(string $str, string $charset) : bool
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_check_encoding($str, $mb);
        }
        $ic = $this->getIconvAlias($charset);
        // converting the charset to itself: a false return is treated as invalid
        return (@\iconv($ic, $ic . '//TRANSLIT//IGNORE', $str) !== false);
    }

    /**
     * Uses either mb_strlen or iconv_strlen to return the number of characters
     * in the passed $str for the given $charset
     *
     * @throws UnsupportedCharsetException if iconv fails
     */
    public function getLength(string $str, string $charset) : int
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_strlen($str, $mb);
        }
        return $this->iconvStrlen($str, $this->getIconvAlias($charset));
    }

    /**
     * Uses either mb_substr or iconv_substr to create and return a substring of
     * the passed $str.
     *
     * If the offset provided in $start is greater than the length of the
     * string, an empty string is returned.
     *
     * @throws UnsupportedCharsetException if iconv fails
     */
    public function getSubstr(string $str, string $charset, int $start, ?int $length = null) : string
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_substr($str, $start, $length, $mb);
        }
        $ic = $this->getIconvAlias($charset);
        if ($ic === 'CP1258') {
            // iconv_substr fails with CP1258 for some reason, and returns only
            // a subset of characters (e.g. the first 5, instead of $length)
            $str = $this->convert($str, $ic, 'UTF-8');
            return $this->convert($this->getSubstr($str, 'UTF-8', $start, $length), 'UTF-8', $ic);
        }
        return $this->iconvSubstr($str, $ic, $start, $length);
    }

    /**
     * Looks up a charset from mb_list_encodings and identified aliases.
     *
     * The normalized charset is first looked up in the list reported by
     * mb_list_encodings(), then in self::$mbAliases.
     *
     * If the encoding is not listed, the method will return false.
     *
     * On success, the method will return the charset name as accepted by mb_*.
     *
     * @return string|bool
     */
    private function getMbCharset(string $cs)
    {
        $normalized = $this->getNormalizedCharset($cs);
        if (\array_key_exists($normalized, self::$mbListedEncodings)) {
            return self::$mbListedEncodings[$normalized];
        } elseif (\array_key_exists($normalized, self::$mbAliases)) {
            return self::$mbAliases[$normalized];
        }
        return false;
    }

    /**
     * Looks up the passed charset in self::$iconvAliases, returning the mapped
     * charset if applicable.  Otherwise returns charset.
     *
     * @return string the mapped charset (if mapped) or $cs otherwise
     */
    private function getIconvAlias(string $cs) : string
    {
        $normalized = $this->getNormalizedCharset($cs);
        // the existence check and the read must use the same table
        if (\array_key_exists($normalized, self::$iconvAliases)) {
            return self::$iconvAliases[$normalized];
        }
        return $cs;
    }
}
490