Passed
Push — 1.0.0 ( fcaf32...d3c3e6 )
by Zaahid
03:21
created

CharsetConverter::convert()   C

Complexity

Conditions 8
Paths 5

Size

Total Lines 24
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 8.0155

Importance

Changes 0
Metric Value
dl 0
loc 24
ccs 15
cts 16
cp 0.9375
rs 5.7377
c 0
b 0
f 0
cc 8
eloc 16
nc 5
nop 3
crap 8.0155
1
<?php
2
/**
3
 * This file is part of the ZBateson\MailMimeParser project.
4
 *
5
 * @license http://opensource.org/licenses/bsd-license.php BSD
6
 */
7
namespace ZBateson\MailMimeParser\Util;
8
9
/**
 * Helper class for converting strings between charsets.
 *
 * CharsetConverter tries to convert using mb_convert_encoding when possible,
 * defining as many aliases as possible for supported encodings.  If not
 * supported, iconv is attempted.
 *
 * Charset names are matched case-insensitively, first verbatim and then with
 * every non-alphanumeric character stripped (so "CP-367" matches "CP367").
 *
 * @author Zaahid Bateson
 */
class CharsetConverter
{
    /**
     * @var array aliased charsets supported by mb_convert_encoding.
     *      The alias is stripped of any non-alphanumeric characters (so CP367
     *      is equal to CP-367) when comparing.
     *      Keys must be upper-case: lookups are always performed with an
     *      upper-cased name, so a lower-case key can never match.
     *      Some of these translations are already supported by
     *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
     *      other implementations or versions since they're not part of
     *      documented support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB2312',
        // aliases
        '646' => 'ASCII',
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'CP367' => 'ASCII',
        'CSASCII' => 'ASCII',
        'IBM367' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO646IRV1991' => 'ASCII',
        'ISOIR6' => 'ASCII',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CSBIG5' => 'BIG-5',
        '1251' => 'WINDOWS-1251',
        'CP1251' => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        'CP1252' => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        'WE8MSWIN1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        'CP1254' => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        // NOTE(review): Windows-1255 is mapped onto ISO-8859-8 here although
        // the two code pages are not identical -- confirm this is intended.
        '1255' => 'ISO-8859-8',
        'CP1255' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'WINDOWS1255' => 'ISO-8859-8',
        '850' => 'CP850',
        'CSPC850MULTILINGUAL' => 'CP850',
        'IBM850' => 'CP850',
        '866' => 'CP866',
        'CSIBM866' => 'CP866',
        'IBM866' => 'CP866',
        '932' => 'CP932',
        'MS932' => 'CP932',
        'MSKANJI' => 'CP932',
        '950' => 'CP950',
        'MS950' => 'CP950',
        'EUCJP' => 'EUC-JP',
        'UJIS' => 'EUC-JP',
        'EUCKR' => 'EUC-KR',
        'KOREAN' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'GB180302000' => 'GB18030',
        // GB2312 not listed but supported
        'CHINESE' => 'GB2312',
        'CSISO58GB231280' => 'GB2312',
        'EUCCN' => 'GB2312',
        'EUCGB2312CN' => 'GB2312',
        'GB23121980' => 'GB2312',
        'GB231280' => 'GB2312',
        'ISOIR58' => 'GB2312',
        'GBK' => 'CP936',
        '936' => 'CP936',
        'MS936' => 'CP936',
        'HZGB' => 'HZ',
        'HZGB2312' => 'HZ',
        'CSISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'CSISO2022KR' => 'ISO-2022-KR',
        'ISO2022KR' => 'ISO-2022-KR',
        'CSISOLATIN6' => 'ISO-8859-10',
        'ISO885910' => 'ISO-8859-10',
        'ISO8859101992' => 'ISO-8859-10',
        'ISOIR157' => 'ISO-8859-10',
        'L6' => 'ISO-8859-10',
        'LATIN6' => 'ISO-8859-10',
        'ISO885913' => 'ISO-8859-13',
        'ISO885914' => 'ISO-8859-14',
        'ISO8859141998' => 'ISO-8859-14',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR199' => 'ISO-8859-14',
        'L8' => 'ISO-8859-14',
        'LATIN8' => 'ISO-8859-14',
        'ISO885915' => 'ISO-8859-15',
        'ISO885916' => 'ISO-8859-16',
        'ISO8859162001' => 'ISO-8859-16',
        'ISOIR226' => 'ISO-8859-16',
        'L10' => 'ISO-8859-16',
        'LATIN10' => 'ISO-8859-16',
        'CSISOLATIN2' => 'ISO-8859-2',
        'ISO88592' => 'ISO-8859-2',
        'ISO885921987' => 'ISO-8859-2',
        'ISOIR101' => 'ISO-8859-2',
        'L2' => 'ISO-8859-2',
        'LATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'ISO88593' => 'ISO-8859-3',
        'ISO885931988' => 'ISO-8859-3',
        'ISOIR109' => 'ISO-8859-3',
        'L3' => 'ISO-8859-3',
        'LATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'ISO88594' => 'ISO-8859-4',
        'ISO885941988' => 'ISO-8859-4',
        'ISOIR110' => 'ISO-8859-4',
        'L4' => 'ISO-8859-4',
        'LATIN4' => 'ISO-8859-4',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CYRILLIC' => 'ISO-8859-5',
        'ISO88595' => 'ISO-8859-5',
        'ISO885951988' => 'ISO-8859-5',
        'ISOIR144' => 'ISO-8859-5',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'ECMA114' => 'ISO-8859-6',
        'ISO88596' => 'ISO-8859-6',
        'ISO885961987' => 'ISO-8859-6',
        'ISOIR127' => 'ISO-8859-6',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'GREEK8' => 'ISO-8859-7',
        'ISO88597' => 'ISO-8859-7',
        'ISO885971987' => 'ISO-8859-7',
        'ISOIR126' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'HEBREW' => 'ISO-8859-8',
        'ISO88598' => 'ISO-8859-8',
        'ISO885981988' => 'ISO-8859-8',
        'ISOIR138' => 'ISO-8859-8',
        'CSISOLATIN5' => 'ISO-8859-9',
        'ISO88599' => 'ISO-8859-9',
        'ISO885991989' => 'ISO-8859-9',
        'ISOIR148' => 'ISO-8859-9',
        'L5' => 'ISO-8859-9',
        'LATIN5' => 'ISO-8859-9',
        'CSKOI8R' => 'KOI8-R',
        'KOI8R' => 'KOI8-R',
        '8859' => 'ISO-8859-1',
        'CP819' => 'ISO-8859-1',
        'CSISOLATIN1' => 'ISO-8859-1',
        'IBM819' => 'ISO-8859-1',
        'ISO8859' => 'ISO-8859-1',
        'ISO88591' => 'ISO-8859-1',
        'ISO885911987' => 'ISO-8859-1',
        'ISOIR100' => 'ISO-8859-1',
        'L1' => 'ISO-8859-1',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'CSSHIFTJIS' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'SHIFTJIS2004' => 'SJIS-2004',
        'SJIS2004' => 'SJIS-2004',
    ];

    /**
     * @var array aliased charsets supported by iconv.  Keys follow the same
     *      upper-case, alphanumeric-only convention as self::$mbAliases.
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'BIG5HKSCS' => 'BIG5HKSCS',
        'HKSCS' => 'BIG5HKSCS',
        '037' => 'CP037',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'CSIBM037' => 'CP037',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        '1026' => 'CP1026',
        'CSIBM1026' => 'CP1026',
        'IBM1026' => 'CP1026',
        '1140' => 'CP1140',
        'IBM1140' => 'CP1140',
        '1250' => 'CP1250',
        'WINDOWS1250' => 'CP1250',
        '1253' => 'CP1253',
        'WINDOWS1253' => 'CP1253',
        '1256' => 'CP1256',
        'WINDOWS1256' => 'CP1256',
        '1257' => 'CP1257',
        'WINDOWS1257' => 'CP1257',
        '1258' => 'CP1258',
        'WINDOWS1258' => 'CP1258',
        '424' => 'CP424',
        'CSIBM424' => 'CP424',
        'EBCDICCPHE' => 'CP424',
        'IBM424' => 'CP424',
        '437' => 'CP437',
        'CSPC8CODEPAGE437' => 'CP437',
        'IBM437' => 'CP437',
        '500' => 'CP500',
        'CSIBM500' => 'CP500',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCH' => 'CP500',
        'IBM500' => 'CP500',
        '775' => 'CP775',
        'CSPC775BALTIC' => 'CP775',
        'IBM775' => 'CP775',
        '860' => 'CP860',
        'CSIBM860' => 'CP860',
        'IBM860' => 'CP860',
        '861' => 'CP861',
        'CPIS' => 'CP861',
        'CSIBM861' => 'CP861',
        'IBM861' => 'CP861',
        '862' => 'CP862',
        'CSPC862LATINHEBREW' => 'CP862',
        'IBM862' => 'CP862',
        '863' => 'CP863',
        'CSIBM863' => 'CP863',
        'IBM863' => 'CP863',
        '864' => 'CP864',
        'CSIBM864' => 'CP864',
        'IBM864' => 'CP864',
        '865' => 'CP865',
        'CSIBM865' => 'CP865',
        'IBM865' => 'CP865',
        '869' => 'CP869',
        'CPGR' => 'CP869',
        'CSIBM869' => 'CP869',
        'IBM869' => 'CP869',
        '949' => 'CP949',
        'MS949' => 'CP949',
        'UHC' => 'CP949',
        'ROMAN8' => 'ROMAN8',
        'HPROMAN8' => 'ROMAN8',
        'R8' => 'ROMAN8',
        'CSHPROMAN8' => 'ROMAN8',
        'ISO2022JP2' => 'ISO2022JP2',
        'THAI' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISO8859112001' => 'ISO885911',
        'JOHAB' => 'CP1361',
        'MS1361' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'CSPTCP154' => 'PT154',
        'PTCP154' => 'PT154',
        'CP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'TIS620' => 'TIS620',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'ISOIR166' => 'TIS620',
    ];

    /**
     * @var array per-instance cache of already resolved charsets, keyed by the
     *      upper-cased requested name.  Each value is a two element array:
     *      [ (bool) usable with mb_convert_encoding, (string) resolved name ].
     *      Pre-seeded with a few common charsets so they skip the lookup.
     */
    protected $mappedRequestedCharsets = [
        'UTF-8' => [ true, 'UTF-8' ],
        'US-ASCII' => [ true, 'US-ASCII' ],
        'ISO-8859-1' => [ true, 'ISO-8859-1' ],
    ];

    /**
     * Converts the passed string's charset from the passed $fromCharset to the
     * passed $toCharset
     *
     * The function attempts to use mb_convert_encoding if possible, and falls
     * back to iconv if not.  If the source or destination character sets aren't
     * supported, a blank string is returned.
     *
     * There may be some mb-supported encodings not supported by iconv (HZ on
     * some libiconv builds for instance), so when only one side is
     * mb-supported the conversion is done in two steps with UTF-8 as the
     * intermediary: mb handles the side it knows, iconv handles the other.
     *
     * @param string $str the string to convert
     * @param string $fromCharset the charset $str is currently encoded in
     * @param string $toCharset the charset to convert $str to
     * @return string the converted string; '' when $str is empty or when
     *         iconv could not perform the conversion
     */
    public function convert($str, $fromCharset, $toCharset)
    {
        // nothing to convert -- skip the charset lookups entirely
        if ($str === '') {
            return $str;
        }

        $fromMbSupported = true;
        $toMbSupported = true;
        $from = $this->getRealCharset($fromCharset, $fromMbSupported);
        $to = $this->getRealCharset($toCharset, $toMbSupported);

        if ($fromMbSupported && $toMbSupported) {
            return mb_convert_encoding($str, $to, $from);
        }
        if ($fromMbSupported) {
            // mb understands the source only: mb -> UTF-8 -> iconv
            $utf8 = mb_convert_encoding($str, 'UTF-8', $from);
            return $this->iconvOrBlank('UTF-8', $to . '//TRANSLIT//IGNORE', $utf8);
        }
        if ($toMbSupported) {
            // mb understands the target only: iconv -> UTF-8 -> mb
            $utf8 = $this->iconvOrBlank($from, 'UTF-8//TRANSLIT//IGNORE', $str);
            return mb_convert_encoding($utf8, $to, 'UTF-8');
        }
        return $this->iconvOrBlank($from, $to . '//TRANSLIT//IGNORE', $str);
    }

    /**
     * Calls iconv and maps its failure value (false) to a blank string so
     * convert() always returns a string as documented.
     *
     * @param string $from source charset as understood by iconv
     * @param string $to target charset (including any //TRANSLIT//IGNORE suffix)
     * @param string $str the string to convert
     * @return string the converted string, or '' if iconv failed
     */
    private function iconvOrBlank($from, $to, $str)
    {
        $converted = iconv($from, $to, $str);
        return ($converted === false) ? '' : $converted;
    }

    /**
     * Resolves the requested charset name to the name that should be handed
     * to mb_convert_encoding or iconv, caching the result for this instance.
     *
     * @param string $cs the charset name as requested by the caller
     * @param boolean $mbSupported set to true if the returned name is to be
     *        used with mb_convert_encoding, false if it is to be used with
     *        iconv
     * @return string the resolved charset name
     */
    private function getRealCharset($cs, &$mbSupported)
    {
        $csu = strtoupper($cs);
        if (!isset($this->mappedRequestedCharsets[$csu])) {
            $ret = $this->findSupportedCharset($csu, $mbSupported);
            $this->mappedRequestedCharsets[$csu] = [
                $mbSupported,
                $ret
            ];
        }
        $mbSupported = $this->mappedRequestedCharsets[$csu][0];
        return $this->mappedRequestedCharsets[$csu][1];
    }

    /**
     * Looks up the passed $cs in mb_list_encodings, then strips non
     * alpha-numeric characters and tries again, then failing that calls
     * findAliasedCharset.  The method returns the charset name that should be
     * used in calls to mb_convert_encoding or iconv.
     *
     * If the charset is part of mb_list_encodings, $mbSupported is set to true.
     *
     * @param string $cs
     * @param boolean $mbSupported
     * @return string the final charset name to use
     */
    private function findSupportedCharset($cs, &$mbSupported)
    {
        $mbSupported = true;
        $comp = strtoupper($cs);
        $available = array_map('strtoupper', mb_list_encodings());
        // strict comparison so numeric-looking names are compared as strings
        if (in_array($comp, $available, true)) {
            return $comp;
        }
        $stripped = preg_replace('/[^A-Z0-9]+/', '', $comp);
        if (in_array($stripped, $available, true)) {
            return $stripped;
        }
        return $this->findAliasedCharset($comp, $stripped, $mbSupported);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$mbAliases, and
     * returns the mapped charset if applicable.  Otherwise calls
     * $this->findAliasedIconvCharset.
     *
     * $mbSupported is set to false if the charset is not located in
     * self::$mbAliases.
     *
     * @param string $comp
     * @param string $stripped
     * @param boolean $mbSupported
     * @return string the mapped charset
     */
    private function findAliasedCharset($comp, $stripped, &$mbSupported)
    {
        if (array_key_exists($comp, self::$mbAliases)) {
            return self::$mbAliases[$comp];
        } elseif (array_key_exists($stripped, self::$mbAliases)) {
            return self::$mbAliases[$stripped];
        }
        $mbSupported = false;
        return $this->findAliasedIconvCharset($comp, $stripped);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$iconvAliases,
     * and returns the mapped charset if applicable.  Otherwise returns $comp.
     *
     * The existence check and the read both go through self:: so they always
     * consult the same array.
     *
     * @param string $comp
     * @param string $stripped
     * @return string the mapped charset (if mapped) or $comp otherwise
     */
    private function findAliasedIconvCharset($comp, $stripped)
    {
        if (array_key_exists($comp, self::$iconvAliases)) {
            return self::$iconvAliases[$comp];
        } elseif (array_key_exists($stripped, self::$iconvAliases)) {
            return self::$iconvAliases[$stripped];
        }
        return $comp;
    }
}
405