Passed
Push — master ( da2c1e...110169 )
by Gaetano
04:55 queued 01:13
created

Charset::instance()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 7
rs 10
ccs 4
cts 4
cp 1
crap 2
1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
 * Helper for transcoding strings between charsets while escaping them for inclusion in XML.
 *
 * The class is used as a singleton: obtain the shared object via Charset::instance().
 */
class Charset
{
    /**
     * Table used for transcoding ISO-8859-1 into us-ascii xml: 'in' holds the raw single-byte characters, 'out' the
     * numeric character reference for the character found at the same index in 'in'.
     * It is filled lazily by buildConversionTable().
     *
     * @var array
     */
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    /// @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
    ///       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
    ///       (though no luck when receiving them...)
    ///       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
    ///       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
    ///       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
    ///       Check what mbstring/iconv do by default with those?
    //
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    /**
     * For each charset used as key, the list of charsets which are considered supersets of it.
     * Used by isValidCharset().
     * NOTE(review): the 'EUC-' entry looks truncated (possibly meant to be 'EUC-TW') - to be confirmed.
     *
     * @var array
     */
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    /**
     * The shared instance, created on first call to instance().
     *
     * @var Charset|null
     */
    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     * @todo can't we just make $xml_iso88591_Entities a static variable instead ?
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Force usage as singleton
     */
    protected function __construct()
    {
    }

    /**
     * Fills in the named conversion table, unless it has been built already.
     *
     * For 'xml_iso88591_Entities' the table maps bytes 0-31 and 160-255 to their numeric character references
     * (bytes 32-127 need no conversion, bytes 128-159 are left out - see the todo on the class members).
     *
     * @param string $tableName
     * @throws \Exception for unsupported $tableName
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in'])) {
                    // already built
                    return;
                }
                // NB: the order matters: encodeEntities() relies on the first 32 entries being the control chars
                foreach (array_merge(range(0, 31), range(160, 255)) as $code) {
                    $this->xml_iso88591_Entities["in"][] = chr($code);
                    $this->xml_iso88591_Entities["out"][] = "&#{$code};";
                }
                break;
            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/
            default:
                throw new \Exception('Unsupported table: ' . $tableName);
        }
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are
     * bound by RFC 3023 to emit strict us-ascii.
     *
     * Supported source charsets are ISO-8859-1, US-ASCII and UTF-8; supported target charsets are the same ones,
     * plus '' (treated as US-ASCII). Charset names are matched case-insensitively. For any unsupported combination
     * an error is logged and an empty string is returned.
     *
     * When the source is UTF-8 and the target is not, bytes which can not start a UTF-8 sequence are dropped, and so
     * is a multi-byte sequence which is cut short by the end of the string.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or recode_string() or mb_string() where available
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->latin1ToUtf8($this->escapeXmlSpecialChars($data));
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
            //case 'CP1252_CP1252':
                // no transcoding needed: only the characters which are special in xml get escaped
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToXmlEntities((string)$data);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 entries of the table are the control characters 0-31: leave those as entities
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            /*
            case 'CP1252_':
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                $escapedData = '';
                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the 5 characters which are special in xml (& " ' < >) with the corresponding predefined entities.
     *
     * @param string $data
     *
     * @return string
     */
    private function escapeXmlSpecialChars($data)
    {
        return str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
    }

    /**
     * Transcodes a string from ISO-8859-1 to UTF-8.
     *
     * mb_convert_encoding() is preferred when the mbstring extension is loaded, since utf8_encode() is deprecated as
     * of PHP 8.2; utf8_encode() is kept as fallback for installations without mbstring.
     *
     * @param string $data
     *
     * @return string
     */
    private function latin1ToUtf8($data)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');
        }

        return utf8_encode($data);
    }

    /**
     * Converts a UTF-8 string into us-ascii xml: xml special characters are replaced by the predefined entities, and
     * every multi-byte character by the decimal numeric character reference of its code point.
     *
     * No validation of the continuation bytes is done. Bytes which can not start a sequence (10xxxxxx, 11111xxx) are
     * skipped. A sequence cut short by the end of the string is dropped, instead of being read beyond the end of the
     * string.
     *
     * @param string $data
     *
     * @return string
     */
    private function utf8ToXmlEntities($data)
    {
        $specialChars = array('"' => '&quot;', '&' => '&amp;', "'" => '&apos;', '<' => '&lt;', '>' => '&gt;');

        $escapedData = '';
        $length = strlen($data);
        for ($pos = 0; $pos < $length; $pos++) {
            $char = $data[$pos];
            $byte = ord($char);

            // 7 bits: 0bbbbbbb (127)
            if ($byte < 128) {
                /// @todo shall we replace this with a (supposedly) faster str_replace?
                $escapedData .= isset($specialChars[$char]) ? $specialChars[$char] : $char;
                continue;
            }

            if ($byte >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $continuationBytes = 1;
                $codePoint = $byte & 31;
            } elseif ($byte >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $continuationBytes = 2;
                $codePoint = $byte & 15;
            } elseif ($byte >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $continuationBytes = 3;
                $codePoint = $byte & 7;
            } else {
                // not a valid lead byte: nothing is emitted for it
                continue;
            }

            if ($pos + $continuationBytes >= $length) {
                // truncated sequence at the end of the string: stop here rather than reading past the last byte
                break;
            }

            // each continuation byte contributes its 6 low bits
            for ($offset = 1; $offset <= $continuationBytes; $offset++) {
                $codePoint = ($codePoint * 64) + (ord($data[$pos + $offset]) & 63);
            }

            $escapedData .= sprintf('&#%d;', $codePoint);
            $pos += $continuationBytes;
        }

        return $escapedData;
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     *
     * Charset names are compared case-insensitively, and whitespace around them is ignored (so that a list such as
     * 'ISO-8859-1, UTF-8' is accepted).
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     *
     * @return bool false also when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        // normalize once, so that both the direct match and the superset lookup use the same spelling
        $encoding = strtoupper(trim((string)$encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim((string)$allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        if (isset($this->charset_supersets[$encoding])) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     *
     * @return array the conversion table, with keys 'in' and 'out'
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is built lazily: make sure it is filled in before handing it out
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }

}
310