1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PhpXmlRpc\Helper; |
4
|
|
|
|
5
|
|
|
use PhpXmlRpc\PhpXmlRpc; |
6
|
|
|
|
7
|
|
|
/**
 * Helper for charset-aware escaping of strings that end up inside XML-RPC payloads.
 *
 * Only ISO-8859-1, UTF-8 and US-ASCII are handled. Access the shared object via Charset::instance().
 */
class Charset
{
    // Tables used for transcoding different charsets into us-ascii xml.
    // 'in' holds raw single bytes, 'out' the matching numeric character references, index by index.
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
    protected $xml_iso88591_utf8 = array("in" => array(), "out" => array());

    /// @todo add to iso table the characters from cp_1252 range, i.e. 128 to 159?
    ///       These will NOT be present in true ISO-8859-1, but will save the unwary
    ///       windows user from sending junk (though no luck when receiving them...).
    ///       Doing so needs a table mapping bytes 128-159 to numeric entities, plus matching
    ///       'CP1252_*' cases in encodeEntities().

    // For each charset (key), the list of charsets which contain it as a subset.
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     *
     * @return Charset the shared instance, created on first call
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Fills the ISO-8859-1 table: bytes 0-31 and 160-255 are mapped to their numeric character references.
     */
    private function __construct()
    {
        // Order matters: encodeEntities() relies on the first 32 entries being the control characters
        // (it slices them off when converting UTF-8 to ISO-8859-1).
        foreach (array_merge(range(0, 31), range(160, 255)) as $code) {
            $this->xml_iso88591_Entities["in"][] = chr($code);
            $this->xml_iso88591_Entities["out"][] = "&#{$code};";
        }
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are
     * bound by RFC 3023 to emit strict us-ascii.
     *
     * The five XML special characters (& " ' < >) are always replaced by their predefined entities.
     * When the source is UTF-8 and the target is not, a multi-byte sequence cut short by the end of the string
     * is discarded instead of being read past the end of the buffer.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or recode_string() or mb_string() where available
     * @todo utf8_encode() is deprecated in recent php versions: find a replacement for the ISO-8859-1 to UTF-8 case
     *
     * @param string $data the string to escape
     * @param string $srcEncoding charset of $data; when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding charset of the document being produced; empty is treated like US-ASCII
     *
     * @return string the escaped string, or an empty string (plus a line in the error log) when the
     *                conversion is not supported
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = utf8_encode($escapedData);
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
                // source is representable as-is in the target: only the XML special chars need escaping
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                $escapedData = $this->utf8ToNumericEntities($data);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            default:
                $escapedData = '';
                error_log('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the five XML special characters with their predefined entities.
     *
     * The ampersand is first in the list on purpose: str_replace processes the pairs in order, so the
     * ampersands introduced by the other entities are never escaped a second time.
     *
     * @param string $data
     *
     * @return string
     */
    private function escapeXmlSpecialChars($data)
    {
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Escapes a UTF-8 string into pure us-ascii: XML special characters become predefined entities, every
     * multi-byte sequence becomes a decimal numeric character reference, other 7-bit bytes are copied as they are.
     *
     * Bytes which can not start a sequence of 1 to 4 bytes (stray continuation bytes, 0xF8 and above) are dropped.
     * An incomplete sequence at the end of the string is dropped too.
     *
     * @param mixed $data cast to string, to be kind to users creating string xmlrpc values out of different php types
     *
     * @return string
     */
    private function utf8ToNumericEntities($data)
    {
        $specialChars = array(
            '"' => '&quot;',
            '&' => '&amp;',
            "'" => '&apos;',
            '<' => '&lt;',
            '>' => '&gt;',
        );

        $escaped = '';
        $data = (string)$data;
        $length = strlen($data);
        for ($pos = 0; $pos < $length; $pos++) {
            $char = $data[$pos];
            $lead = ord($char);

            // 7 bits: 0bbbbbbb (127)
            if ($lead < 128) {
                $escaped .= isset($specialChars[$char]) ? $specialChars[$char] : $char;
                continue;
            }

            if ($lead >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $trailing = 1;
                $codePoint = $lead & 31;
            } elseif ($lead >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $trailing = 2;
                $codePoint = $lead & 15;
            } elseif ($lead >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $trailing = 3;
                $codePoint = $lead & 7;
            } else {
                // not a valid lead byte: skipped
                continue;
            }

            // truncated sequence: stop here rather than reading beyond the end of the string
            if ($pos + $trailing >= $length) {
                break;
            }

            // each continuation byte contributes its 6 low bits
            for ($k = 1; $k <= $trailing; $k++) {
                $codePoint = ($codePoint * 64) + (ord($data[$pos + $k]) & 63);
            }
            $escaped .= sprintf('&#%d;', $codePoint);
            $pos += $trailing;
        }

        return $escaped;
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or
     * if it is a valid subset of any encoding in the list.
     *
     * Charset names are compared case-insensitively and with surrounding whitespace ignored.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     *
     * @return bool false also when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        $encoding = strtoupper(trim($encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim($allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        // not listed verbatim: accept it if any of the listed charsets is a superset of it
        if (isset($this->charset_supersets[$encoding])) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset only 'iso88591' is accepted
     *
     * @return array the table with 'in' and 'out' lists for the given charset
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        switch ($charset) {
            case 'iso88591':
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
}
274
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.