Completed
Push — master ( 07a33d...f690d1 )
by Zaahid
03:53
created

CharsetConverter::convert()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 16
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 4.074

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 16
ccs 5
cts 6
cp 0.8333
rs 9.2
cc 4
eloc 6
nc 3
nop 1
crap 4.074
1
<?php
2
/**
3
 * This file is part of the ZBateson\MailMimeParser project.
4
 *
5
 * @license http://opensource.org/licenses/bsd-license.php BSD
6
 */
7
namespace ZBateson\MailMimeParser\Stream\Helper;
8
9
/**
10
 * Helper class for converting strings between charsets.
11
 * 
12
 * CharsetConverter tries to convert using mb_convert_encoding when possible,
13
 * defining as many aliases as possible for supported encodings.  If not
14
 * supported, iconv is attempted.
15
 *
16
 * @author Zaahid Bateson
17
 */
18
class CharsetConverter
{
    /**
     * @var array aliased charsets supported by mb_convert_encoding.
     *      Keys are upper-cased and stripped of any non-alphanumeric
     *      characters (so CP367 is equal to CP-367) because that is the form
     *      the lookup methods of this class compare against.
     *      Some of these translations are already supported by
     *      mb_convert_encoding on "my" PHP 5.5.9, but may not be supported in
     *      other implementations or versions since they're not part of
     *      documented support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB2312',
        // aliases
        '646' => 'ASCII',
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'CP367' => 'ASCII',
        'CSASCII' => 'ASCII',
        'IBM367' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO646IRV1991' => 'ASCII',
        'ISOIR6' => 'ASCII',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CSBIG5' => 'BIG-5',
        '1251' => 'WINDOWS-1251',
        'CP1251' => 'WINDOWS-1251',
        'WINDOWS1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        'CP1252' => 'WINDOWS-1252',
        'WINDOWS1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        'CP1254' => 'WINDOWS-1254',
        'WINDOWS1254' => 'WINDOWS-1254',
        '1255' => 'ISO-8859-8',
        'CP1255' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'WINDOWS1255' => 'ISO-8859-8',
        '850' => 'CP850',
        'CSPC850MULTILINGUAL' => 'CP850',
        'IBM850' => 'CP850',
        '866' => 'CP866',
        'CSIBM866' => 'CP866',
        'IBM866' => 'CP866',
        '932' => 'CP932',
        'MS932' => 'CP932',
        'MSKANJI' => 'CP932',
        '950' => 'CP950',
        'MS950' => 'CP950',
        'EUCJP' => 'EUC-JP',
        'UJIS' => 'EUC-JP',
        'EUCKR' => 'EUC-KR',
        'KOREAN' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'GB180302000' => 'GB18030',
        // GB2312 not listed but supported
        'CHINESE' => 'GB2312',
        'CSISO58GB231280' => 'GB2312',
        'EUCCN' => 'GB2312',
        'EUCGB2312CN' => 'GB2312',
        'GB23121980' => 'GB2312',
        'GB231280' => 'GB2312',
        'ISOIR58' => 'GB2312',
        'GBK' => 'CP936',
        '936' => 'CP936',
        // key must be upper-case: lookups are always done on strtoupper'd names
        'MS936' => 'CP936',
        'HZGB' => 'HZ',
        'HZGB2312' => 'HZ',
        'CSISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP' => 'ISO-2022-JP',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'CSISO2022KR' => 'ISO-2022-KR',
        'ISO2022KR' => 'ISO-2022-KR',
        'CSISOLATIN6' => 'ISO-8859-10',
        'ISO885910' => 'ISO-8859-10',
        'ISO8859101992' => 'ISO-8859-10',
        'ISOIR157' => 'ISO-8859-10',
        'L6' => 'ISO-8859-10',
        'LATIN6' => 'ISO-8859-10',
        'ISO885913' => 'ISO-8859-13',
        'ISO885914' => 'ISO-8859-14',
        'ISO8859141998' => 'ISO-8859-14',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR199' => 'ISO-8859-14',
        'L8' => 'ISO-8859-14',
        'LATIN8' => 'ISO-8859-14',
        'ISO885915' => 'ISO-8859-15',
        'ISO885916' => 'ISO-8859-16',
        'ISO8859162001' => 'ISO-8859-16',
        'ISOIR226' => 'ISO-8859-16',
        'L10' => 'ISO-8859-16',
        'LATIN10' => 'ISO-8859-16',
        'CSISOLATIN2' => 'ISO-8859-2',
        'ISO88592' => 'ISO-8859-2',
        'ISO885921987' => 'ISO-8859-2',
        'ISOIR101' => 'ISO-8859-2',
        'L2' => 'ISO-8859-2',
        'LATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'ISO88593' => 'ISO-8859-3',
        'ISO885931988' => 'ISO-8859-3',
        'ISOIR109' => 'ISO-8859-3',
        'L3' => 'ISO-8859-3',
        'LATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'ISO88594' => 'ISO-8859-4',
        'ISO885941988' => 'ISO-8859-4',
        'ISOIR110' => 'ISO-8859-4',
        'L4' => 'ISO-8859-4',
        'LATIN4' => 'ISO-8859-4',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CYRILLIC' => 'ISO-8859-5',
        'ISO88595' => 'ISO-8859-5',
        'ISO885951988' => 'ISO-8859-5',
        'ISOIR144' => 'ISO-8859-5',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'ECMA114' => 'ISO-8859-6',
        'ISO88596' => 'ISO-8859-6',
        'ISO885961987' => 'ISO-8859-6',
        'ISOIR127' => 'ISO-8859-6',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'GREEK8' => 'ISO-8859-7',
        'ISO88597' => 'ISO-8859-7',
        'ISO885971987' => 'ISO-8859-7',
        'ISOIR126' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'HEBREW' => 'ISO-8859-8',
        'ISO88598' => 'ISO-8859-8',
        'ISO885981988' => 'ISO-8859-8',
        'ISOIR138' => 'ISO-8859-8',
        'CSISOLATIN5' => 'ISO-8859-9',
        'ISO88599' => 'ISO-8859-9',
        'ISO885991989' => 'ISO-8859-9',
        'ISOIR148' => 'ISO-8859-9',
        'L5' => 'ISO-8859-9',
        'LATIN5' => 'ISO-8859-9',
        'CSKOI8R' => 'KOI8-R',
        'KOI8R' => 'KOI8-R',
        '8859' => 'ISO-8859-1',
        'CP819' => 'ISO-8859-1',
        'CSISOLATIN1' => 'ISO-8859-1',
        'IBM819' => 'ISO-8859-1',
        'ISO8859' => 'ISO-8859-1',
        'ISO88591' => 'ISO-8859-1',
        'ISO885911987' => 'ISO-8859-1',
        'ISOIR100' => 'ISO-8859-1',
        'L1' => 'ISO-8859-1',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'CSSHIFTJIS' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'SHIFTJIS2004' => 'SJIS-2004',
        'SJIS2004' => 'SJIS-2004',
    ];

    /**
     * @var array aliased charsets supported by iconv.  Keys follow the same
     *      upper-cased, alphanumeric-only convention as self::$mbAliases.
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'BIG5HKSCS' => 'BIG5HKSCS',
        'HKSCS' => 'BIG5HKSCS',
        '037' => 'CP037',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'CSIBM037' => 'CP037',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        '1026' => 'CP1026',
        'CSIBM1026' => 'CP1026',
        'IBM1026' => 'CP1026',
        '1140' => 'CP1140',
        'IBM1140' => 'CP1140',
        '1250' => 'CP1250',
        'WINDOWS1250' => 'CP1250',
        '1253' => 'CP1253',
        'WINDOWS1253' => 'CP1253',
        '1256' => 'CP1256',
        'WINDOWS1256' => 'CP1256',
        '1257' => 'CP1257',
        'WINDOWS1257' => 'CP1257',
        '1258' => 'CP1258',
        'WINDOWS1258' => 'CP1258',
        '424' => 'CP424',
        'CSIBM424' => 'CP424',
        'EBCDICCPHE' => 'CP424',
        'IBM424' => 'CP424',
        '437' => 'CP437',
        'CSPC8CODEPAGE437' => 'CP437',
        'IBM437' => 'CP437',
        '500' => 'CP500',
        'CSIBM500' => 'CP500',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCH' => 'CP500',
        'IBM500' => 'CP500',
        '775' => 'CP775',
        'CSPC775BALTIC' => 'CP775',
        'IBM775' => 'CP775',
        '860' => 'CP860',
        'CSIBM860' => 'CP860',
        'IBM860' => 'CP860',
        '861' => 'CP861',
        'CPIS' => 'CP861',
        'CSIBM861' => 'CP861',
        'IBM861' => 'CP861',
        '862' => 'CP862',
        'CSPC862LATINHEBREW' => 'CP862',
        'IBM862' => 'CP862',
        '863' => 'CP863',
        'CSIBM863' => 'CP863',
        'IBM863' => 'CP863',
        '864' => 'CP864',
        'CSIBM864' => 'CP864',
        'IBM864' => 'CP864',
        '865' => 'CP865',
        'CSIBM865' => 'CP865',
        'IBM865' => 'CP865',
        '869' => 'CP869',
        'CPGR' => 'CP869',
        'CSIBM869' => 'CP869',
        'IBM869' => 'CP869',
        '949' => 'CP949',
        'MS949' => 'CP949',
        'UHC' => 'CP949',
        'ROMAN8' => 'ROMAN8',
        'HPROMAN8' => 'ROMAN8',
        'R8' => 'ROMAN8',
        'CSHPROMAN8' => 'ROMAN8',
        'ISO2022JP2' => 'ISO2022JP2',
        'THAI' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISO8859112001' => 'ISO885911',
        'JOHAB' => 'CP1361',
        'MS1361' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'CSPTCP154' => 'PT154',
        'PTCP154' => 'PT154',
        'CP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'TIS620' => 'TIS620',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'ISOIR166' => 'TIS620',
    ];

    /**
     * @var string charset to convert from
     */
    protected $fromCharset;

    /**
     * @var string charset to convert to
     */
    protected $toCharset;

    /**
     * @var boolean true if $fromCharset was resolved to a name usable with
     *      mb_convert_encoding (listed by mb_list_encodings or mapped through
     *      self::$mbAliases)
     */
    protected $fromCharsetMbSupported = true;

    /**
     * @var boolean true if $toCharset was resolved to a name usable with
     *      mb_convert_encoding (listed by mb_list_encodings or mapped through
     *      self::$mbAliases)
     */
    protected $toCharsetMbSupported = true;

    /**
     * Constructs the charset converter with source/destination charsets.
     *
     * Both names are resolved once, up front, to the name that will be handed
     * to mb_convert_encoding or iconv.
     *
     * @param string $fromCharset
     * @param string $toCharset
     */
    public function __construct($fromCharset, $toCharset)
    {
        $this->fromCharset = $this->findSupportedCharset($fromCharset, $this->fromCharsetMbSupported);
        $this->toCharset = $this->findSupportedCharset($toCharset, $this->toCharsetMbSupported);
    }

    /**
     * Converts the passed string's charset from $this->fromCharset to
     * $this->toCharset.
     *
     * The function attempts to use mb_convert_encoding if possible, and falls
     * back to iconv if not.  If the conversion function reports failure by
     * returning false (for instance because the source or destination
     * character set isn't supported), a blank string is returned.
     *
     * @param string $str
     * @return string
     */
    public function convert($str)
    {
        // nothing to convert, and no point paying for a library call
        if ($str === '') {
            return $str;
        }

        // there may be some mb-supported encodings not supported by iconv (on my libiconv for instance
        // HZ isn't supported), and so it may happen that failing an mb_convert_encoding, an iconv
        // may also fail even though both support an encoding separately.
        // Unfortunately there's no great way of testing what charsets are available on iconv, and
        // attempting to blindly convert the string may be too costly, as could converting first
        // to an intermediate (ASSUMPTION: may be worth testing converting to an intermediate)
        if ($this->fromCharsetMbSupported && $this->toCharsetMbSupported) {
            $converted = mb_convert_encoding($str, $this->toCharset, $this->fromCharset);
        } else {
            $converted = iconv($this->fromCharset, $this->toCharset . '//TRANSLIT//IGNORE', $str);
        }

        // both functions can signal failure with false; honour the documented
        // string return type instead of leaking a boolean to callers
        return ($converted === false) ? '' : $converted;
    }

    /**
     * Looks up the passed $cs in mb_list_encodings, then strips non
     * alpha-numeric characters and tries again, then failing that calls
     * findAliasedCharset.  The method returns the charset name that should be
     * used in calls to mb_convert_encoding or iconv.
     *
     * $mbSupported is set to true unless the charset could only be resolved
     * through the iconv fallback.
     *
     * @param string $cs
     * @param boolean $mbSupported
     * @return string the final charset name to use
     */
    private function findSupportedCharset($cs, &$mbSupported)
    {
        // the list of mb encodings is fixed for the life of the process, so
        // upper-case it once rather than on every lookup
        static $available = null;
        if ($available === null) {
            $available = array_map('strtoupper', mb_list_encodings());
        }

        $mbSupported = true;
        $comp = strtoupper($cs);
        if (in_array($comp, $available, true)) {
            return $comp;
        }
        $stripped = preg_replace('/[^A-Z0-9]+/', '', $comp);
        if (in_array($stripped, $available, true)) {
            return $stripped;
        }
        return $this->findAliasedCharset($comp, $stripped, $mbSupported);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$mbAliases, and
     * returns the mapped charset if applicable.  Otherwise calls
     * $this->findAliasedIconvCharset.
     *
     * $mbSupported is set to false if the charset is not located in
     * self::$mbAliases.
     *
     * @param string $comp
     * @param string $stripped
     * @param boolean $mbSupported
     * @return string the mapped charset
     */
    private function findAliasedCharset($comp, $stripped, &$mbSupported)
    {
        $mapped = self::lookupAlias(self::$mbAliases, $comp, $stripped);
        if ($mapped !== null) {
            return $mapped;
        }
        $mbSupported = false;
        return $this->findAliasedIconvCharset($comp, $stripped);
    }

    /**
     * Looks up the passed $comp and $stripped strings in self::$iconvAliases,
     * and returns the mapped charset if applicable.  Otherwise returns $comp.
     *
     * @param string $comp
     * @param string $stripped
     * @return string the mapped charset (if mapped) or $comp otherwise
     */
    private function findAliasedIconvCharset($comp, $stripped)
    {
        $mapped = self::lookupAlias(self::$iconvAliases, $comp, $stripped);
        return ($mapped !== null) ? $mapped : $comp;
    }

    /**
     * Returns the value mapped to $comp in $aliases, or failing that the
     * value mapped to $stripped, or null if neither is a key of $aliases.
     *
     * @param array $aliases alias table to search
     * @param string $comp upper-cased charset name
     * @param string $stripped $comp with non-alphanumeric characters removed
     * @return string|null the mapped charset or null if not mapped
     */
    private static function lookupAlias(array $aliases, $comp, $stripped)
    {
        foreach ([$comp, $stripped] as $candidate) {
            if (array_key_exists($candidate, $aliases)) {
                return $aliases[$candidate];
            }
        }
        return null;
    }
}
400