Passed
Push — 0.4 ( 390ba6...d33bff )
by Zaahid
03:25
created

CharsetConverter   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 405
Duplicated Lines 0 %

Test Coverage

Coverage 97.14%

Importance

Changes 0
Metric Value
wmc 14
eloc 286
dl 0
loc 405
ccs 34
cts 35
cp 0.9714
rs 10
c 0
b 0
f 0

5 Methods

Rating   Name   Duplication   Size   Complexity  
A findAliasedIconvCharset() 0 8 3
A convert() 0 15 4
A findAliasedCharset() 0 9 3
A findSupportedCharset() 0 15 3
A __construct() 0 4 1
1
<?php
2
/**
3
 * This file is part of the ZBateson\MailMimeParser project.
4
 *
5
 * @license http://opensource.org/licenses/bsd-license.php BSD
6
 */
7
namespace ZBateson\MailMimeParser\Stream\Helper;
8
9
/**
10
 * Helper class for converting strings between charsets.
11
 * 
12
 * CharsetConverter tries to convert using mb_convert_encoding when possible,
13
 * defining as many aliases as possible for supported encodings.  If not
14
 * supported, iconv is attempted.
15
 *
16
 * @author Zaahid Bateson
17
 */
18
class CharsetConverter
{
    /**
     * @var array aliased charsets supported by mb_convert_encoding.
     *      Keys are compared against the upper-cased charset name, both as
     *      given and with every non-alphanumeric character stripped (so
     *      CP367 is equal to CP-367).  Because of that, every key must
     *      consist of upper-case letters and digits only, otherwise it can
     *      never match.
     *      Some of these translations are already supported by
     *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
     *      other implementations or versions since they're not part of
     *      documented support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB2312',
        // aliases
        '646' => 'ASCII',
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'CP367' => 'ASCII',
        'CSASCII' => 'ASCII',
        'IBM367' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO646IRV1991' => 'ASCII',
        'ISOIR6' => 'ASCII',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CSBIG5' => 'BIG-5',
        '1251' => 'WINDOWS-1251',
        'CP1251' => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        'CP1252' => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'WE8MSWIN1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        'CP1254' => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        '1255' => 'ISO-8859-8',
        'CP1255' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'WINDOWS1255' => 'ISO-8859-8',
        '850' => 'CP850',
        'CSPC850MULTILINGUAL' => 'CP850',
        'IBM850' => 'CP850',
        '866' => 'CP866',
        'CSIBM866' => 'CP866',
        'IBM866' => 'CP866',
        '932' => 'CP932',
        'MS932' => 'CP932',
        'MSKANJI' => 'CP932',
        '950' => 'CP950',
        'MS950' => 'CP950',
        'EUCJP' => 'EUC-JP',
        'UJIS' => 'EUC-JP',
        'EUCKR' => 'EUC-KR',
        'KOREAN' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'GB180302000' => 'GB18030',
        // GB2312 not listed but supported
        'CHINESE' => 'GB2312',
        'CSISO58GB231280' => 'GB2312',
        'EUCCN' => 'GB2312',
        'EUCGB2312CN' => 'GB2312',
        'GB23121980' => 'GB2312',
        'GB231280' => 'GB2312',
        'ISOIR58' => 'GB2312',
        'GBK' => 'CP936',
        '936' => 'CP936',
        // upper-case on purpose: lookups are always done on the upper-cased name
        'MS936' => 'CP936',
        'HZGB' => 'HZ',
        'HZGB2312' => 'HZ',
        'CSISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'CSISO2022KR' => 'ISO-2022-KR',
        'ISO2022KR' => 'ISO-2022-KR',
        'CSISOLATIN6' => 'ISO-8859-10',
        'ISO885910' => 'ISO-8859-10',
        'ISO8859101992' => 'ISO-8859-10',
        'ISOIR157' => 'ISO-8859-10',
        'L6' => 'ISO-8859-10',
        'LATIN6' => 'ISO-8859-10',
        'ISO885913' => 'ISO-8859-13',
        'ISO885914' => 'ISO-8859-14',
        'ISO8859141998' => 'ISO-8859-14',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR199' => 'ISO-8859-14',
        'L8' => 'ISO-8859-14',
        'LATIN8' => 'ISO-8859-14',
        'ISO885915' => 'ISO-8859-15',
        'ISO885916' => 'ISO-8859-16',
        'ISO8859162001' => 'ISO-8859-16',
        'ISOIR226' => 'ISO-8859-16',
        'L10' => 'ISO-8859-16',
        'LATIN10' => 'ISO-8859-16',
        'CSISOLATIN2' => 'ISO-8859-2',
        'ISO88592' => 'ISO-8859-2',
        'ISO885921987' => 'ISO-8859-2',
        'ISOIR101' => 'ISO-8859-2',
        'L2' => 'ISO-8859-2',
        'LATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'ISO88593' => 'ISO-8859-3',
        'ISO885931988' => 'ISO-8859-3',
        'ISOIR109' => 'ISO-8859-3',
        'L3' => 'ISO-8859-3',
        'LATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'ISO88594' => 'ISO-8859-4',
        'ISO885941988' => 'ISO-8859-4',
        'ISOIR110' => 'ISO-8859-4',
        'L4' => 'ISO-8859-4',
        'LATIN4' => 'ISO-8859-4',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CYRILLIC' => 'ISO-8859-5',
        'ISO88595' => 'ISO-8859-5',
        'ISO885951988' => 'ISO-8859-5',
        'ISOIR144' => 'ISO-8859-5',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'ECMA114' => 'ISO-8859-6',
        'ISO88596' => 'ISO-8859-6',
        'ISO885961987' => 'ISO-8859-6',
        'ISOIR127' => 'ISO-8859-6',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'GREEK8' => 'ISO-8859-7',
        'ISO88597' => 'ISO-8859-7',
        'ISO885971987' => 'ISO-8859-7',
        'ISOIR126' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'HEBREW' => 'ISO-8859-8',
        'ISO88598' => 'ISO-8859-8',
        'ISO885981988' => 'ISO-8859-8',
        'ISOIR138' => 'ISO-8859-8',
        'CSISOLATIN5' => 'ISO-8859-9',
        'ISO88599' => 'ISO-8859-9',
        'ISO885991989' => 'ISO-8859-9',
        'ISOIR148' => 'ISO-8859-9',
        'L5' => 'ISO-8859-9',
        'LATIN5' => 'ISO-8859-9',
        'CSKOI8R' => 'KOI8-R',
        'KOI8R' => 'KOI8-R',
        '8859' => 'ISO-8859-1',
        'CP819' => 'ISO-8859-1',
        'CSISOLATIN1' => 'ISO-8859-1',
        'IBM819' => 'ISO-8859-1',
        'ISO8859' => 'ISO-8859-1',
        'ISO88591' => 'ISO-8859-1',
        'ISO885911987' => 'ISO-8859-1',
        'ISOIR100' => 'ISO-8859-1',
        'L1' => 'ISO-8859-1',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'CSSHIFTJIS' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'SHIFTJIS2004' => 'SJIS-2004',
        'SJIS2004' => 'SJIS-2004',
        // Microsoft charset values
        '0' => 'WINDOWS-1252',
        '128' => 'SJIS',
        '129' => 'EUC-KR',
        '134' => 'GB2312',
        '136' => 'BIG-5',
        '161' => 'WINDOWS-1253',
        '162' => 'WINDOWS-1254',
        '177' => 'WINDOWS-1255',
        '178' => 'WINDOWS-1256',
        '186' => 'WINDOWS-1257',
        '204' => 'WINDOWS-1251',
        '222' => 'WINDOWS-874',
        '238' => 'WINDOWS-1250',
    ];

    /**
     * @var array aliased charsets supported by iconv.  Keys follow the same
     *      convention as self::$mbAliases: upper-case, alphanumeric only.
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'BIG5HKSCS' => 'BIG5HKSCS',
        'HKSCS' => 'BIG5HKSCS',
        '037' => 'CP037',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'CSIBM037' => 'CP037',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        '1026' => 'CP1026',
        'CSIBM1026' => 'CP1026',
        'IBM1026' => 'CP1026',
        '1140' => 'CP1140',
        'IBM1140' => 'CP1140',
        '1250' => 'CP1250',
        'WINDOWS1250' => 'CP1250',
        '1253' => 'CP1253',
        'WINDOWS1253' => 'CP1253',
        '1256' => 'CP1256',
        'WINDOWS1256' => 'CP1256',
        '1257' => 'CP1257',
        'WINDOWS1257' => 'CP1257',
        '1258' => 'CP1258',
        'WINDOWS1258' => 'CP1258',
        '424' => 'CP424',
        'CSIBM424' => 'CP424',
        'EBCDICCPHE' => 'CP424',
        'IBM424' => 'CP424',
        '437' => 'CP437',
        'CSPC8CODEPAGE437' => 'CP437',
        'IBM437' => 'CP437',
        '500' => 'CP500',
        'CSIBM500' => 'CP500',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCH' => 'CP500',
        'IBM500' => 'CP500',
        '775' => 'CP775',
        'CSPC775BALTIC' => 'CP775',
        'IBM775' => 'CP775',
        '860' => 'CP860',
        'CSIBM860' => 'CP860',
        'IBM860' => 'CP860',
        '861' => 'CP861',
        'CPIS' => 'CP861',
        'CSIBM861' => 'CP861',
        'IBM861' => 'CP861',
        '862' => 'CP862',
        'CSPC862LATINHEBREW' => 'CP862',
        'IBM862' => 'CP862',
        '863' => 'CP863',
        'CSIBM863' => 'CP863',
        'IBM863' => 'CP863',
        '864' => 'CP864',
        'CSIBM864' => 'CP864',
        'IBM864' => 'CP864',
        '865' => 'CP865',
        'CSIBM865' => 'CP865',
        'IBM865' => 'CP865',
        '869' => 'CP869',
        'CPGR' => 'CP869',
        'CSIBM869' => 'CP869',
        'IBM869' => 'CP869',
        '949' => 'CP949',
        'MS949' => 'CP949',
        'UHC' => 'CP949',
        'ROMAN8' => 'ROMAN8',
        'HPROMAN8' => 'ROMAN8',
        'R8' => 'ROMAN8',
        'CSHPROMAN8' => 'ROMAN8',
        'ISO2022JP2' => 'ISO2022JP2',
        'THAI' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISO8859112001' => 'ISO885911',
        'JOHAB' => 'CP1361',
        'MS1361' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'CSPTCP154' => 'PT154',
        'PTCP154' => 'PT154',
        'CP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'TIS620' => 'TIS620',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'ISOIR166' => 'TIS620',
    ];

    /**
     * @var string charset to convert from
     */
    protected $fromCharset;

    /**
     * @var string charset to convert to
     */
    protected $toCharset;

    /**
     * @var boolean indicates if $fromCharset is supported by
     * mb_convert_encoding
     */
    protected $fromCharsetMbSupported = true;

    /**
     * @var boolean indicates if $toCharset is supported by mb_convert_encoding
     */
    protected $toCharsetMbSupported = true;

    /**
     * Constructs the charset converter with source/destination charsets.
     *
     * Both names are resolved once, up front, to the name that will be
     * handed to mb_convert_encoding or iconv, and the matching
     * "mb supported" flag is recorded for each.
     *
     * @param string $fromCharset
     * @param string $toCharset
     */
    public function __construct($fromCharset, $toCharset)
    {
        $this->fromCharset = $this->findSupportedCharset($fromCharset, $this->fromCharsetMbSupported);
        $this->toCharset = $this->findSupportedCharset($toCharset, $this->toCharsetMbSupported);
    }

    /**
     * Converts the passed string's charset from $this->fromCharset to
     * $this->toCharset.
     *
     * The function uses mb_convert_encoding when both charsets were resolved
     * as mb-supported, and falls back to iconv otherwise.  If the conversion
     * function reports failure by returning false (for instance because the
     * source or destination character set isn't supported), a blank string
     * is returned.
     *
     * @param string $str
     * @return string the converted string, or '' if the conversion failed
     */
    public function convert($str)
    {
        // nothing to convert, hand the empty string straight back
        if ($str === '') {
            return $str;
        }

        // There may be some mb-supported encodings not supported by iconv (HZ
        // isn't supported on some libiconv builds, for instance), so a charset
        // pair where only one side is mb-supported may fail on iconv as well.
        // There's no great way of testing what charsets are available on
        // iconv, so the result of the call is the only signal available.
        if ($this->fromCharsetMbSupported && $this->toCharsetMbSupported) {
            $converted = mb_convert_encoding($str, $this->toCharset, $this->fromCharset);
        } else {
            $converted = iconv($this->fromCharset, $this->toCharset . '//TRANSLIT//IGNORE', $str);
        }

        // both functions can return false on failure; honour the documented
        // "blank string" contract instead of leaking a boolean to the caller
        return ($converted === false) ? '' : $converted;
    }

    /**
     * Looks up the passed $cs in mb_list_encodings, then strips non
     * alpha-numeric characters and tries again, then failing that calls
     * findAliasedCharset.  The method returns the charset name that should be
     * used in calls to mb_convert_encoding or iconv.
     *
     * $mbSupported is set to true unless the lookup ends up outside of
     * mb_list_encodings and self::$mbAliases.
     *
     * @param string $cs
     * @param boolean $mbSupported
     * @return string the final charset name to use
     */
    private function findSupportedCharset($cs, &$mbSupported)
    {
        $mbSupported = true;
        $comp = strtoupper($cs);
        $available = array_map('strtoupper', mb_list_encodings());
        if (in_array($comp, $available, true)) {
            return $comp;
        }

        // compare again ignoring punctuation, so e.g. UTF8 matches UTF-8;
        // preg_replace keeps the array keys, so $index is valid in $available
        $stripped = preg_replace('/[^A-Z0-9]+/', '', $comp);
        $amb = preg_replace('/[^A-Z0-9]+/', '', $available);
        $index = array_search($stripped, $amb, true);
        if ($index !== false) {
            return $available[$index];
        }
        return $this->findAliasedCharset($comp, $stripped, $mbSupported);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$mbAliases, and
     * returns the mapped charset if applicable.  Otherwise calls
     * $this->findAliasedIconvCharset.
     *
     * $mbSupported is set to false if the charset is not located in
     * self::$mbAliases.
     *
     * @param string $comp
     * @param string $stripped
     * @param boolean $mbSupported
     * @return string the mapped charset
     */
    private function findAliasedCharset($comp, $stripped, &$mbSupported)
    {
        if (array_key_exists($comp, self::$mbAliases)) {
            return self::$mbAliases[$comp];
        }
        if (array_key_exists($stripped, self::$mbAliases)) {
            return self::$mbAliases[$stripped];
        }
        $mbSupported = false;
        return $this->findAliasedIconvCharset($comp, $stripped);
    }

    /**
     * Looks up the passed $comp and $stripped strings in the iconv alias
     * table, and returns the mapped charset if applicable.  Otherwise returns
     * $comp.
     *
     * @param string $comp
     * @param string $stripped
     * @return string the mapped charset (if mapped) or $comp otherwise
     */
    private function findAliasedIconvCharset($comp, $stripped)
    {
        // the existence check and the read must consult the same array, so
        // both go through static:: rather than mixing self:: and static::
        if (array_key_exists($comp, static::$iconvAliases)) {
            return static::$iconvAliases[$comp];
        }
        if (array_key_exists($stripped, static::$iconvAliases)) {
            return static::$iconvAliases[$stripped];
        }
        return $comp;
    }
}
425