Completed
Pull Request — master (#54)
by
unknown
20:41 queued 16:09
created

CharsetConverter::findAliasedCharset()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 3

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 7
cts 7
cp 1
rs 9.4285
c 0
b 0
f 0
cc 3
eloc 7
nc 3
nop 3
crap 3
1
<?php
2
/**
3
 * This file is part of the ZBateson\MailMimeParser project.
4
 *
5
 * @license http://opensource.org/licenses/bsd-license.php BSD
6
 */
7
namespace ZBateson\MailMimeParser\Stream\Helper;
8
9
/**
 * Helper class for converting strings between charsets.
 *
 * CharsetConverter tries to convert using mb_convert_encoding when possible,
 * defining as many aliases as possible for supported encodings.  If not
 * supported, iconv is attempted.
 *
 * @author Zaahid Bateson
 */
class CharsetConverter
{
    /**
     * @var array aliased charsets supported by mb_convert_encoding.
     *      The alias is stripped of any non-alphanumeric characters (so CP367
     *      is equal to CP-367) when comparing.  Lookups are performed with
     *      upper-cased names, so every key in this table must be upper-case.
     *      Some of these translations are already supported by
     *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
     *      other implementations or versions since they're not part of
     *      documented support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB2312',
        // aliases
        '646' => 'ASCII',
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'CP367' => 'ASCII',
        'CSASCII' => 'ASCII',
        'IBM367' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO646IRV1991' => 'ASCII',
        'ISOIR6' => 'ASCII',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CSBIG5' => 'BIG-5',
        '1251' => 'WINDOWS-1251',
        'CP1251' => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        'CP1252' => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'WE8MSWIN1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        'CP1254' => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        '1255' => 'ISO-8859-8',
        'CP1255' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'WINDOWS1255' => 'ISO-8859-8',
        '850' => 'CP850',
        'CSPC850MULTILINGUAL' => 'CP850',
        'IBM850' => 'CP850',
        '866' => 'CP866',
        'CSIBM866' => 'CP866',
        'IBM866' => 'CP866',
        '932' => 'CP932',
        'MS932' => 'CP932',
        'MSKANJI' => 'CP932',
        '950' => 'CP950',
        'MS950' => 'CP950',
        'EUCJP' => 'EUC-JP',
        'UJIS' => 'EUC-JP',
        'EUCKR' => 'EUC-KR',
        'KOREAN' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'GB180302000' => 'GB18030',
        // GB2312 not listed but supported
        'CHINESE' => 'GB2312',
        'CSISO58GB231280' => 'GB2312',
        'EUCCN' => 'GB2312',
        'EUCGB2312CN' => 'GB2312',
        'GB23121980' => 'GB2312',
        'GB231280' => 'GB2312',
        'ISOIR58' => 'GB2312',
        'GBK' => 'CP936',
        '936' => 'CP936',
        // key must be upper-case to be reachable: lookups are upper-cased
        'MS936' => 'CP936',
        'HZGB' => 'HZ',
        'HZGB2312' => 'HZ',
        'CSISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'CSISO2022KR' => 'ISO-2022-KR',
        'ISO2022KR' => 'ISO-2022-KR',
        'CSISOLATIN6' => 'ISO-8859-10',
        'ISO885910' => 'ISO-8859-10',
        'ISO8859101992' => 'ISO-8859-10',
        'ISOIR157' => 'ISO-8859-10',
        'L6' => 'ISO-8859-10',
        'LATIN6' => 'ISO-8859-10',
        'ISO885913' => 'ISO-8859-13',
        'ISO885914' => 'ISO-8859-14',
        'ISO8859141998' => 'ISO-8859-14',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR199' => 'ISO-8859-14',
        'L8' => 'ISO-8859-14',
        'LATIN8' => 'ISO-8859-14',
        'ISO885915' => 'ISO-8859-15',
        'ISO885916' => 'ISO-8859-16',
        'ISO8859162001' => 'ISO-8859-16',
        'ISOIR226' => 'ISO-8859-16',
        'L10' => 'ISO-8859-16',
        'LATIN10' => 'ISO-8859-16',
        'CSISOLATIN2' => 'ISO-8859-2',
        'ISO88592' => 'ISO-8859-2',
        'ISO885921987' => 'ISO-8859-2',
        'ISOIR101' => 'ISO-8859-2',
        'L2' => 'ISO-8859-2',
        'LATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'ISO88593' => 'ISO-8859-3',
        'ISO885931988' => 'ISO-8859-3',
        'ISOIR109' => 'ISO-8859-3',
        'L3' => 'ISO-8859-3',
        'LATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'ISO88594' => 'ISO-8859-4',
        'ISO885941988' => 'ISO-8859-4',
        'ISOIR110' => 'ISO-8859-4',
        'L4' => 'ISO-8859-4',
        'LATIN4' => 'ISO-8859-4',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CYRILLIC' => 'ISO-8859-5',
        'ISO88595' => 'ISO-8859-5',
        'ISO885951988' => 'ISO-8859-5',
        'ISOIR144' => 'ISO-8859-5',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'ECMA114' => 'ISO-8859-6',
        'ISO88596' => 'ISO-8859-6',
        'ISO885961987' => 'ISO-8859-6',
        'ISOIR127' => 'ISO-8859-6',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'GREEK8' => 'ISO-8859-7',
        'ISO88597' => 'ISO-8859-7',
        'ISO885971987' => 'ISO-8859-7',
        'ISOIR126' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'HEBREW' => 'ISO-8859-8',
        'ISO88598' => 'ISO-8859-8',
        'ISO885981988' => 'ISO-8859-8',
        'ISOIR138' => 'ISO-8859-8',
        'CSISOLATIN5' => 'ISO-8859-9',
        'ISO88599' => 'ISO-8859-9',
        'ISO885991989' => 'ISO-8859-9',
        'ISOIR148' => 'ISO-8859-9',
        'L5' => 'ISO-8859-9',
        'LATIN5' => 'ISO-8859-9',
        'CSKOI8R' => 'KOI8-R',
        'KOI8R' => 'KOI8-R',
        '8859' => 'ISO-8859-1',
        'CP819' => 'ISO-8859-1',
        'CSISOLATIN1' => 'ISO-8859-1',
        'IBM819' => 'ISO-8859-1',
        'ISO8859' => 'ISO-8859-1',
        'ISO88591' => 'ISO-8859-1',
        'ISO885911987' => 'ISO-8859-1',
        'ISOIR100' => 'ISO-8859-1',
        'L1' => 'ISO-8859-1',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'CSSHIFTJIS' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'SHIFTJIS2004' => 'SJIS-2004',
        'SJIS2004' => 'SJIS-2004',
    ];

    /**
     * @var array aliased charsets supported by iconv.  Keys are compared
     *      against upper-cased names, so they must be upper-case.
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'BIG5HKSCS' => 'BIG5HKSCS',
        'HKSCS' => 'BIG5HKSCS',
        '037' => 'CP037',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'CSIBM037' => 'CP037',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        '1026' => 'CP1026',
        'CSIBM1026' => 'CP1026',
        'IBM1026' => 'CP1026',
        '1140' => 'CP1140',
        'IBM1140' => 'CP1140',
        '1250' => 'CP1250',
        'WINDOWS1250' => 'CP1250',
        '1253' => 'CP1253',
        'WINDOWS1253' => 'CP1253',
        '1256' => 'CP1256',
        'WINDOWS1256' => 'CP1256',
        '1257' => 'CP1257',
        'WINDOWS1257' => 'CP1257',
        '1258' => 'CP1258',
        'WINDOWS1258' => 'CP1258',
        '424' => 'CP424',
        'CSIBM424' => 'CP424',
        'EBCDICCPHE' => 'CP424',
        'IBM424' => 'CP424',
        '437' => 'CP437',
        'CSPC8CODEPAGE437' => 'CP437',
        'IBM437' => 'CP437',
        '500' => 'CP500',
        'CSIBM500' => 'CP500',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCH' => 'CP500',
        'IBM500' => 'CP500',
        '775' => 'CP775',
        'CSPC775BALTIC' => 'CP775',
        'IBM775' => 'CP775',
        '860' => 'CP860',
        'CSIBM860' => 'CP860',
        'IBM860' => 'CP860',
        '861' => 'CP861',
        'CPIS' => 'CP861',
        'CSIBM861' => 'CP861',
        'IBM861' => 'CP861',
        '862' => 'CP862',
        'CSPC862LATINHEBREW' => 'CP862',
        'IBM862' => 'CP862',
        '863' => 'CP863',
        'CSIBM863' => 'CP863',
        'IBM863' => 'CP863',
        '864' => 'CP864',
        'CSIBM864' => 'CP864',
        'IBM864' => 'CP864',
        '865' => 'CP865',
        'CSIBM865' => 'CP865',
        'IBM865' => 'CP865',
        '869' => 'CP869',
        'CPGR' => 'CP869',
        'CSIBM869' => 'CP869',
        'IBM869' => 'CP869',
        '949' => 'CP949',
        'MS949' => 'CP949',
        'UHC' => 'CP949',
        'ROMAN8' => 'ROMAN8',
        'HPROMAN8' => 'ROMAN8',
        'R8' => 'ROMAN8',
        'CSHPROMAN8' => 'ROMAN8',
        'ISO2022JP2' => 'ISO2022JP2',
        'THAI' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISO8859112001' => 'ISO885911',
        'JOHAB' => 'CP1361',
        'MS1361' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'CSPTCP154' => 'PT154',
        'PTCP154' => 'PT154',
        'CP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'TIS620' => 'TIS620',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'ISOIR166' => 'TIS620',
    ];

    /**
     * @var array|null lazily-built lookup whose keys are the upper-cased
     *      names returned by mb_list_encodings(); null until first use.
     */
    private static $mbEncodingLookup = null;

    /**
     * @var string charset to convert from
     */
    protected $fromCharset;

    /**
     * @var string charset to convert to
     */
    protected $toCharset;

    /**
     * @var boolean indicates if $fromCharset is supported by
     * mb_convert_encoding
     */
    protected $fromCharsetMbSupported = true;

    /**
     * @var boolean indicates if $toCharset is supported by mb_convert_encoding
     */
    protected $toCharsetMbSupported = true;

    /**
     * Constructs the charset converter with source/destination charsets.
     *
     * Both names are resolved once, up front, to the name that will be handed
     * to mb_convert_encoding or iconv, and the matching *MbSupported flag is
     * set as a side effect of that resolution.
     *
     * @param string $fromCharset
     * @param string $toCharset
     */
    public function __construct($fromCharset, $toCharset)
    {
        $this->fromCharset = $this->findSupportedCharset($fromCharset, $this->fromCharsetMbSupported);
        $this->toCharset = $this->findSupportedCharset($toCharset, $this->toCharsetMbSupported);
    }

    /**
     * Converts the passed string's charset from $this->fromCharset to
     * $this->toCharset.
     *
     * The function attempts to use mb_convert_encoding if both charsets were
     * resolved as mb-supported, and falls back to iconv if not.  If the
     * underlying conversion function reports failure by returning false, a
     * blank string is returned instead.
     *
     * @param string $str
     * @return string
     */
    public function convert($str)
    {
        // there may be some mb-supported encodings not supported by iconv (on my libiconv for instance
        // HZ isn't supported), and so it may happen that failing an mb_convert_encoding, an iconv
        // may also fail even though both support an encoding separately.
        // Unfortunately there's no great way of testing what charsets are available on iconv, and
        // attempting to blindly convert the string may be too costly, as could converting first
        // to an intermediate (ASSUMPTION: may be worth testing converting to an intermediate)
        if ($str === '') {
            return $str;
        }
        if ($this->fromCharsetMbSupported && $this->toCharsetMbSupported) {
            $converted = mb_convert_encoding($str, $this->toCharset, $this->fromCharset);
        } else {
            $converted = iconv($this->fromCharset, $this->toCharset . '//TRANSLIT//IGNORE', $str);
        }
        // honour the documented contract: callers get a string, never false
        return ($converted === false) ? '' : $converted;
    }

    /**
     * Returns a lookup keyed by the upper-cased mb_list_encodings() names.
     *
     * The list is built on first use and then reused for later lookups.
     *
     * @return array
     */
    private static function getMbEncodingLookup()
    {
        if (self::$mbEncodingLookup === null) {
            self::$mbEncodingLookup = array_flip(array_map('strtoupper', mb_list_encodings()));
        }
        return self::$mbEncodingLookup;
    }

    /**
     * Looks up the passed $cs in mb_list_encodings, then strips non
     * alpha-numeric characters and tries again, then failing that calls
     * findAliasedCharset.  The method returns the charset name that should be
     * used in calls to mb_convert_encoding or iconv.
     *
     * $mbSupported is set to true on entry and is only reset to false by
     * findAliasedCharset when no mb name or mb alias matches.
     *
     * @param string $cs
     * @param boolean $mbSupported
     * @return string the final charset name to use
     */
    private function findSupportedCharset($cs, &$mbSupported)
    {
        /** @see https://github.com/zbateson/MailMimeParser/issues/53 */
        // case-insensitive so decorated lower-case values (e.g. "utf-8;")
        // are normalised the same way as their upper-case equivalents
        if (preg_match('/UTF\-8/i', $cs)) {
            $cs = 'UTF-8';
        }

        $mbSupported = true;
        $comp = strtoupper($cs);
        $available = self::getMbEncodingLookup();
        if (isset($available[$comp])) {
            return $comp;
        }
        $stripped = preg_replace('/[^A-Z0-9]+/', '', $comp);
        if (isset($available[$stripped])) {
            return $stripped;
        }
        return $this->findAliasedCharset($comp, $stripped, $mbSupported);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$mbAliases, and
     * returns the mapped charset if applicable.  Otherwise calls
     * $this->findAliasedIconvCharset.
     *
     * $mbSupported is set to false if the charset is not located in
     * self::$mbAliases.
     *
     * @param string $comp
     * @param string $stripped
     * @param boolean $mbSupported
     * @return string the mapped charset
     */
    private function findAliasedCharset($comp, $stripped, &$mbSupported)
    {
        if (array_key_exists($comp, self::$mbAliases)) {
            return self::$mbAliases[$comp];
        }
        if (array_key_exists($stripped, self::$mbAliases)) {
            return self::$mbAliases[$stripped];
        }
        $mbSupported = false;
        return $this->findAliasedIconvCharset($comp, $stripped);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$iconvAliases,
     * and returns the mapped charset if applicable.  Otherwise returns $comp.
     *
     * @param string $comp
     * @param string $stripped
     * @return string the mapped charset (if mapped) or $comp otherwise
     */
    private function findAliasedIconvCharset($comp, $stripped)
    {
        // check and read through the same (self::) table so the key that was
        // tested is guaranteed to exist in the array that is read
        if (array_key_exists($comp, self::$iconvAliases)) {
            return self::$iconvAliases[$comp];
        }
        if (array_key_exists($stripped, self::$iconvAliases)) {
            return self::$iconvAliases[$stripped];
        }
        return $comp;
    }
}