1 | <?php |
||
18 | class CharsetConverter |
||
19 | { |
||
/**
 * Aliases for charsets that mb_convert_encoding can handle.
 *
 * Every key is an alias written in upper case with all non-alphanumeric
 * characters removed (so "CP-367" is compared as "CP367"); every value is
 * the charset name that alias maps to. Note that PHP stores purely numeric
 * keys such as '646' as integers.
 *
 * Some of these translations were already accepted by mb_convert_encoding
 * on the original author's PHP 5.5.9, but they are not part of its
 * documented support and so may be missing from other implementations or
 * versions.
 *
 * @var array<int|string, string>
 */
public static $mbAliases = [
    // Identity entries: supported by mb_convert_encoding but, for some
    // reason, not included in mb_list_encodings.
    'CP850' => 'CP850',
    'GB2312' => 'GB2312',
    // aliases
    '646' => 'ASCII',
    'ANSIX341968' => 'ASCII',
    'ANSIX341986' => 'ASCII',
    'CP367' => 'ASCII',
    'CSASCII' => 'ASCII',
    'IBM367' => 'ASCII',
    'ISO646US' => 'ASCII',
    'ISO646IRV1991' => 'ASCII',
    'ISOIR6' => 'ASCII',
    'US' => 'ASCII',
    'USASCII' => 'ASCII',
    'BIG5' => 'BIG-5',
    'BIG5TW' => 'BIG-5',
    'CSBIG5' => 'BIG-5',
    '1251' => 'WINDOWS-1251',
    'CP1251' => 'WINDOWS-1251',
    'WINDOWS1251' => 'WINDOWS-1251',
    '1252' => 'WINDOWS-1252',
    'CP1252' => 'WINDOWS-1252',
    'WINDOWS1252' => 'WINDOWS-1252',
    'WE8MSWIN1252' => 'WINDOWS-1252',
    '1254' => 'WINDOWS-1254',
    'CP1254' => 'WINDOWS-1254',
    'WINDOWS1254' => 'WINDOWS-1254',
    // NOTE(review): the Windows-1255 aliases below resolve to ISO-8859-8
    // rather than to a Windows-1255 charset -- presumably the closest
    // encoding available to mb_convert_encoding; confirm this is intended.
    '1255' => 'ISO-8859-8',
    'CP1255' => 'ISO-8859-8',
    'ISO88598I' => 'ISO-8859-8',
    'WINDOWS1255' => 'ISO-8859-8',
    '850' => 'CP850',
    'CSPC850MULTILINGUAL' => 'CP850',
    'IBM850' => 'CP850',
    '866' => 'CP866',
    'CSIBM866' => 'CP866',
    'IBM866' => 'CP866',
    '932' => 'CP932',
    'MS932' => 'CP932',
    'MSKANJI' => 'CP932',
    '950' => 'CP950',
    'MS950' => 'CP950',
    'EUCJP' => 'EUC-JP',
    'UJIS' => 'EUC-JP',
    'EUCKR' => 'EUC-KR',
    'KOREAN' => 'EUC-KR',
    'KSC5601' => 'EUC-KR',
    'KSC56011987' => 'EUC-KR',
    'KSX1001' => 'EUC-KR',
    'GB180302000' => 'GB18030',
    // GB2312 not listed but supported
    'CHINESE' => 'GB2312',
    'CSISO58GB231280' => 'GB2312',
    'EUCCN' => 'GB2312',
    'EUCGB2312CN' => 'GB2312',
    'GB23121980' => 'GB2312',
    'GB231280' => 'GB2312',
    'ISOIR58' => 'GB2312',
    'GBK' => 'CP936',
    '936' => 'CP936',
    // Upper case like every other key: array keys are case-sensitive.
    'MS936' => 'CP936',
    'HZGB' => 'HZ',
    'HZGB2312' => 'HZ',
    'CSISO2022JP' => 'ISO-2022-JP',
    'ISO2022JP' => 'ISO-2022-JP',
    'ISO2022JP2004' => 'ISO-2022-JP-2004',
    'CSISO2022KR' => 'ISO-2022-KR',
    'ISO2022KR' => 'ISO-2022-KR',
    'CSISOLATIN6' => 'ISO-8859-10',
    'ISO885910' => 'ISO-8859-10',
    'ISO8859101992' => 'ISO-8859-10',
    'ISOIR157' => 'ISO-8859-10',
    'L6' => 'ISO-8859-10',
    'LATIN6' => 'ISO-8859-10',
    'ISO885913' => 'ISO-8859-13',
    'ISO885914' => 'ISO-8859-14',
    'ISO8859141998' => 'ISO-8859-14',
    'ISOCELTIC' => 'ISO-8859-14',
    'ISOIR199' => 'ISO-8859-14',
    'L8' => 'ISO-8859-14',
    'LATIN8' => 'ISO-8859-14',
    'ISO885915' => 'ISO-8859-15',
    'ISO885916' => 'ISO-8859-16',
    'ISO8859162001' => 'ISO-8859-16',
    'ISOIR226' => 'ISO-8859-16',
    'L10' => 'ISO-8859-16',
    'LATIN10' => 'ISO-8859-16',
    'CSISOLATIN2' => 'ISO-8859-2',
    'ISO88592' => 'ISO-8859-2',
    'ISO885921987' => 'ISO-8859-2',
    'ISOIR101' => 'ISO-8859-2',
    'L2' => 'ISO-8859-2',
    'LATIN2' => 'ISO-8859-2',
    'CSISOLATIN3' => 'ISO-8859-3',
    'ISO88593' => 'ISO-8859-3',
    'ISO885931988' => 'ISO-8859-3',
    'ISOIR109' => 'ISO-8859-3',
    'L3' => 'ISO-8859-3',
    'LATIN3' => 'ISO-8859-3',
    'CSISOLATIN4' => 'ISO-8859-4',
    'ISO88594' => 'ISO-8859-4',
    'ISO885941988' => 'ISO-8859-4',
    'ISOIR110' => 'ISO-8859-4',
    'L4' => 'ISO-8859-4',
    'LATIN4' => 'ISO-8859-4',
    'CSISOLATINCYRILLIC' => 'ISO-8859-5',
    'CYRILLIC' => 'ISO-8859-5',
    'ISO88595' => 'ISO-8859-5',
    'ISO885951988' => 'ISO-8859-5',
    'ISOIR144' => 'ISO-8859-5',
    'ARABIC' => 'ISO-8859-6',
    'ASMO708' => 'ISO-8859-6',
    'CSISOLATINARABIC' => 'ISO-8859-6',
    'ECMA114' => 'ISO-8859-6',
    'ISO88596' => 'ISO-8859-6',
    'ISO885961987' => 'ISO-8859-6',
    'ISOIR127' => 'ISO-8859-6',
    'CSISOLATINGREEK' => 'ISO-8859-7',
    'ECMA118' => 'ISO-8859-7',
    'ELOT928' => 'ISO-8859-7',
    'GREEK' => 'ISO-8859-7',
    'GREEK8' => 'ISO-8859-7',
    'ISO88597' => 'ISO-8859-7',
    'ISO885971987' => 'ISO-8859-7',
    'ISOIR126' => 'ISO-8859-7',
    'CSISOLATINHEBREW' => 'ISO-8859-8',
    'HEBREW' => 'ISO-8859-8',
    'ISO88598' => 'ISO-8859-8',
    'ISO885981988' => 'ISO-8859-8',
    'ISOIR138' => 'ISO-8859-8',
    'CSISOLATIN5' => 'ISO-8859-9',
    'ISO88599' => 'ISO-8859-9',
    'ISO885991989' => 'ISO-8859-9',
    'ISOIR148' => 'ISO-8859-9',
    'L5' => 'ISO-8859-9',
    'LATIN5' => 'ISO-8859-9',
    'CSKOI8R' => 'KOI8-R',
    'KOI8R' => 'KOI8-R',
    '8859' => 'ISO-8859-1',
    'CP819' => 'ISO-8859-1',
    'CSISOLATIN1' => 'ISO-8859-1',
    'IBM819' => 'ISO-8859-1',
    'ISO8859' => 'ISO-8859-1',
    'ISO88591' => 'ISO-8859-1',
    'ISO885911987' => 'ISO-8859-1',
    'ISOIR100' => 'ISO-8859-1',
    'L1' => 'ISO-8859-1',
    'LATIN' => 'ISO-8859-1',
    'LATIN1' => 'ISO-8859-1',
    'CSSHIFTJIS' => 'SJIS',
    'SHIFTJIS' => 'SJIS',
    'SHIFTJIS2004' => 'SJIS-2004',
    'SJIS2004' => 'SJIS-2004',
];
||
185 | |||
/**
 * Aliases for charsets that iconv can handle.
 *
 * Keys are alias names and values are the charset names they map to. A lot
 * of these aliases may already be supported by iconv itself. Entries are
 * grouped below by the charset they resolve to.
 *
 * @var array<int|string, string>
 */
public static $iconvAliases = [
    // -> BIG5HKSCS
    'BIG5HKSCS' => 'BIG5HKSCS',
    'HKSCS'     => 'BIG5HKSCS',

    // -> CP037
    '037'        => 'CP037',
    'EBCDICCPCA' => 'CP037',
    'EBCDICCPNL' => 'CP037',
    'EBCDICCPUS' => 'CP037',
    'EBCDICCPWT' => 'CP037',
    'CSIBM037'   => 'CP037',
    'IBM037'     => 'CP037',
    'IBM039'     => 'CP037',

    // -> CP1026
    '1026'      => 'CP1026',
    'CSIBM1026' => 'CP1026',
    'IBM1026'   => 'CP1026',

    // -> CP1140
    '1140'    => 'CP1140',
    'IBM1140' => 'CP1140',

    // -> CP1250, CP1253, CP1256, CP1257, CP1258
    '1250'        => 'CP1250',
    'WINDOWS1250' => 'CP1250',
    '1253'        => 'CP1253',
    'WINDOWS1253' => 'CP1253',
    '1256'        => 'CP1256',
    'WINDOWS1256' => 'CP1256',
    '1257'        => 'CP1257',
    'WINDOWS1257' => 'CP1257',
    '1258'        => 'CP1258',
    'WINDOWS1258' => 'CP1258',

    // -> CP424
    '424'        => 'CP424',
    'CSIBM424'   => 'CP424',
    'EBCDICCPHE' => 'CP424',
    'IBM424'     => 'CP424',

    // -> CP437
    '437'              => 'CP437',
    'CSPC8CODEPAGE437' => 'CP437',
    'IBM437'           => 'CP437',

    // -> CP500
    '500'        => 'CP500',
    'CSIBM500'   => 'CP500',
    'EBCDICCPBE' => 'CP500',
    'EBCDICCPCH' => 'CP500',
    'IBM500'     => 'CP500',

    // -> CP775
    '775'           => 'CP775',
    'CSPC775BALTIC' => 'CP775',
    'IBM775'        => 'CP775',

    // -> CP860
    '860'      => 'CP860',
    'CSIBM860' => 'CP860',
    'IBM860'   => 'CP860',

    // -> CP861
    '861'      => 'CP861',
    'CPIS'     => 'CP861',
    'CSIBM861' => 'CP861',
    'IBM861'   => 'CP861',

    // -> CP862
    '862'                => 'CP862',
    'CSPC862LATINHEBREW' => 'CP862',
    'IBM862'             => 'CP862',

    // -> CP863
    '863'      => 'CP863',
    'CSIBM863' => 'CP863',
    'IBM863'   => 'CP863',

    // -> CP864
    '864'      => 'CP864',
    'CSIBM864' => 'CP864',
    'IBM864'   => 'CP864',

    // -> CP865
    '865'      => 'CP865',
    'CSIBM865' => 'CP865',
    'IBM865'   => 'CP865',

    // -> CP869
    '869'      => 'CP869',
    'CPGR'     => 'CP869',
    'CSIBM869' => 'CP869',
    'IBM869'   => 'CP869',

    // -> CP949
    '949'   => 'CP949',
    'MS949' => 'CP949',
    'UHC'   => 'CP949',

    // -> ROMAN8
    'ROMAN8'     => 'ROMAN8',
    'HPROMAN8'   => 'ROMAN8',
    'R8'         => 'ROMAN8',
    'CSHPROMAN8' => 'ROMAN8',

    // -> ISO2022JP2
    'ISO2022JP2' => 'ISO2022JP2',

    // -> ISO885911
    'THAI'          => 'ISO885911',
    'ISO885911'     => 'ISO885911',
    'ISO8859112001' => 'ISO885911',

    // -> CP1361
    'JOHAB'  => 'CP1361',
    'MS1361' => 'CP1361',

    // -> MACCYRILLIC
    'MACCYRILLIC' => 'MACCYRILLIC',

    // -> PT154
    'CSPTCP154'     => 'PT154',
    'PTCP154'       => 'PT154',
    'CP154'         => 'PT154',
    'CYRILLICASIAN' => 'PT154',

    // -> TIS620
    'TIS620'      => 'TIS620',
    'TIS6200'     => 'TIS620',
    'TIS62025290' => 'TIS620',
    'TIS62025291' => 'TIS620',
    'ISOIR166'    => 'TIS620',
];
||
278 | |||
/** @var string charset that strings are converted from */
protected $fromCharset;

/** @var string charset that strings are converted to */
protected $toCharset;

/** @var boolean whether $fromCharset is supported by mb_convert_encoding */
protected $fromCharsetMbSupported = true;

/** @var boolean whether $toCharset is supported by mb_convert_encoding */
protected $toCharsetMbSupported = true;
||
299 | |||
300 | /** |
||
301 | * Constructs the charset converter with source/destination charsets. |
||
302 | * |
||
303 | * @param string $fromCharset |
||
304 | * @param string $toCharset |
||
305 | */ |
||
306 | 3 | public function __construct($fromCharset, $toCharset) |
|
311 | |||
312 | /** |
||
313 | * Converts the passed string's charset from $this->fromCharset to |
||
314 | * $this->toCharset. |
||
315 | * |
||
316 | * The function attempts to use mb_convert_encoding if possible, and falls |
||
317 | * back to iconv if not. If the source or destination character sets aren't |
||
318 | * supported, a blank string is returned. |
||
319 | * |
||
320 | * @param string $str |
||
321 | * @return string |
||
322 | */ |
||
323 | 3 | public function convert($str) |
|
339 | |||
340 | /** |
||
341 | * Looks up the passed $cs in mb_list_encodings, then strips non |
||
342 | * alpha-numeric characters and tries again, then failing that calls |
||
343 | * findAliasedCharset. The method returns the charset name that should be |
||
344 | * used in calls to mb_convert_encoding or iconv. |
||
345 | * |
||
346 | * If the charset is part of mb_list_encodings, $mbSupported is set to true. |
||
347 | * |
||
348 | * @param string $cs |
||
349 | * @param boolean $mbSupported |
||
350 | * @return string the final charset name to use |
||
351 | */ |
||
352 | 3 | private function findSupportedCharset($cs, &$mbSupported) |
|
371 | |||
372 | /** |
||
373 | * Looks up the passed $comp and $stripped strings in self::$mbAliases, and |
||
374 | * returns the mapped charset if applicable. Otherwise calls |
||
375 | * $this->findAliasedIconvCharset. |
||
376 | * |
||
377 | * $mbSupported is set to false if the charset is not located in |
||
378 | * self::$mbAliases. |
||
379 | * |
||
380 | * @param string $comp |
||
381 | * @param string $stripped |
||
382 | * @param boolean $mbSupported |
||
383 | * @return string the mapped charset |
||
384 | */ |
||
385 | 3 | private function findAliasedCharset($comp, $stripped, &$mbSupported) |
|
395 | |||
396 | /** |
||
397 | * Looks up the passed $comp and $stripped strings in self::$iconvAliases, |
||
398 | * and returns the mapped charset if applicable. Otherwise returns $comp. |
||
399 | * |
||
400 | * @param string $comp |
||
401 | * @param string $stripped |
||
402 | * @return string the mapped charset (if mapped) or $comp otherwise |
||
403 | */ |
||
404 | 1 | private function findAliasedIconvCharset($comp, $stripped) |
|
413 | } |
||
414 |