1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PhpXmlRpc\Helper; |
4
|
|
|
|
5
|
|
|
use PhpXmlRpc\PhpXmlRpc; |
6
|
|
|
|
7
|
|
|
/**
 * Helper for transcoding strings between charsets and escaping them for inclusion in XML.
 *
 * Supported source charsets are ISO-8859-1, US-ASCII and UTF-8; see encodeEntities() for the exact list of
 * supported source/destination combinations.
 */
class Charset
{
    // tables used for transcoding different charsets into us-ascii xml
    // "in" holds the raw single-byte characters, "out" the matching numeric character references (same index)
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
    // NB: never filled in nor read by this class; kept because it is protected and subclasses might reference it
    protected $xml_iso88591_utf8 = array("in" => array(), "out" => array());

    /// @todo add to iso table the characters from cp_1252 range, i.e. 128 to 159?
    ///       These will NOT be present in true ISO-8859-1, but will save the unwary
    ///       windows user from sending junk (though no luck when receiving them...)
    /*
    protected $xml_cp1252_Entities = array('in' => array(), 'out' => array(
        '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
        '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
        '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
        '&#x0152;', '?',        '&#x017D;', '?',
        '?',        '&#x2018;', '&#x2019;', '&#x201C;',
        '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
        '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
        '&#x0153;', '?',        '&#x017E;', '&#x0178;'
    ));
    */

    // Maps a charset name (uppercase) to the list of charsets which are a superset of it
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Builds the ISO-8859-1 entity table: bytes 0-31 and 160-255 are mapped to their numeric character reference.
     * Bytes 32-127 need no entity; bytes 128-159 are deliberately left out (see the cp1252 todo above).
     */
    private function __construct()
    {
        foreach (array_merge(range(0, 31), range(160, 255)) as $i) {
            $this->xml_iso88591_Entities["in"][] = chr($i);
            $this->xml_iso88591_Entities["out"][] = "&#{$i};";
        }

        /*for ($i = 128; $i < 160; $i++)
        {
            $this->xml_cp1252_Entities['in'][] = chr($i);
        }*/
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are
     * bound by RFC 3023 to emit strict us-ascii.
     *
     * In all supported conversions the five characters & " ' < > are replaced by the predefined XML entities.
     * For an unsupported source/destination pair an empty string is returned and a message is sent to error_log().
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or recode_string() or mb_string() where available
     *
     * @param string $data the string to be escaped; non-string scalars are accepted
     * @param string $srcEncoding charset of $data. When empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding charset of the output. When empty, the output is plain us-ascii
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        // loose comparison on purpose: callers passing null get the default as well
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->latin1ToUtf8($this->escapeXmlSpecialChars($data));
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
            //case 'CP1252_CP1252':
                // no transcoding needed: only the xml special chars have to be escaped
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToEntities((string)$data);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                // (the first 32 table entries are the control chars 0-31, which stay as entities)
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            /*
            case 'CP1252_':
            case 'CP1252_US-ASCII':
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = $this->latin1ToUtf8($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $escapedData = $this->escapeXmlSpecialChars($data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                $escapedData = '';
                error_log('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the five characters which are special in XML with the corresponding predefined entities.
     *
     * @param string $data
     *
     * @return string
     */
    private function escapeXmlSpecialChars($data)
    {
        // '&' has to come first, otherwise the ampersands of the other entities would be escaped again
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Transcodes an ISO-8859-1 string to UTF-8: every byte in the range 128-255 becomes a two-byte sequence.
     * Produces the same output as utf8_encode(), which is deprecated since PHP 8.2.
     *
     * @param string $data
     *
     * @return string
     */
    private function latin1ToUtf8($data)
    {
        // no /u modifier: the pattern has to match single bytes, not UTF-8 characters
        return preg_replace_callback('/[\x80-\xFF]/', function ($match) {
            $byte = ord($match[0]);

            return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
        }, (string)$data);
    }

    /**
     * Converts a UTF-8 string to us-ascii xml: the xml special chars are replaced by the predefined entities and
     * every multi-byte character by a decimal numeric character reference.
     *
     * Bytes which can not start a UTF-8 sequence are silently dropped. A multi-byte sequence which is cut short by
     * the end of the string is dropped as well, instead of being decoded by reading past the end of the string.
     * Continuation bytes are not validated.
     *
     * @param string $data
     *
     * @return string
     */
    private function utf8ToEntities($data)
    {
        $escapedData = '';
        $length = strlen($data);
        for ($pos = 0; $pos < $length; $pos++) {
            $char = $data[$pos];
            $byte = ord($char);

            // 7 bits: 0bbbbbbb (127)
            if ($byte < 128) {
                /// @todo shall we replace this with a (supposedly) faster str_replace?
                switch ($byte) {
                    case 34:
                        $escapedData .= '&quot;';
                        break;
                    case 38:
                        $escapedData .= '&amp;';
                        break;
                    case 39:
                        $escapedData .= '&apos;';
                        break;
                    case 60:
                        $escapedData .= '&lt;';
                        break;
                    case 62:
                        $escapedData .= '&gt;';
                        break;
                    default:
                        $escapedData .= $char;
                }
                continue;
            }

            if ($byte >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $byte & 31;
            } elseif ($byte >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $byte & 15;
            } elseif ($byte >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $byte & 7;
            } else {
                // stray continuation byte or invalid lead byte: dropped
                continue;
            }

            if ($pos + $extraBytes >= $length) {
                // the sequence is truncated by the end of the string: nothing sensible can be decoded
                break;
            }

            // each continuation byte contributes its lower 6 bits
            for ($k = 1; $k <= $extraBytes; $k++) {
                $codePoint = ($codePoint * 64) + (ord($data[$pos + $k]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $pos += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or
     * if it is a valid subset of any encoding in the list.
     *
     * Charset names are compared case-insensitively, and whitespace around the list entries is ignored.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     *
     * @return bool false also when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        // normalize both sides once, so that the direct match and the superset lookup agree on casing
        $encoding = strtoupper(trim((string)$encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim((string)$allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        if (array_key_exists($encoding, $this->charset_supersets)) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     *
     * @return array the entity table, with the raw characters in "in" and the matching entities in "out"
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        switch ($charset)
        {
            case 'iso88591':
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }

}
274
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.