Passed
Push — master ( 8cdf44...424db6 )
by Gaetano
08:49
created

Charset::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 2
Code Lines 0

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 0
nc 1
nop 0
dl 0
loc 2
ccs 2
cts 2
cp 1
crap 1
rs 10
c 1
b 0
f 0
1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
 * Helper used to escape strings for inclusion in XML payloads, optionally transcoding them between a small set of
 * charsets (UTF-8, ISO-8859-1 and US-ASCII), and to check charset names against a list of accepted charsets.
 *
 * Obtain the shared instance via Charset::instance(); the constructor is not public.
 *
 * @todo implement an interface
 */
class Charset
{
    protected static $logger;

    // tables used for transcoding different charsets into us-ascii xml.
    // Built lazily by buildConversionTable(): 'in' holds raw bytes, 'out' the matching numeric character references
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    // For each key, the list of charsets which can represent every character of the key charset
    // NOTE(review): the 'EUC-' entry looks like a truncated name (possibly 'EUC-TW') - confirm before changing it
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    /** @var Charset $instance */
    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     *
     * @return Charset
     *
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new static();
        }

        return self::$instance;
    }

    /**
     * Returns the logger shared by all instances, falling back to Logger::instance() when none has been injected.
     *
     * @return mixed whatever was passed to setLogger(), or the result of Logger::instance()
     */
    public function getLogger()
    {
        if (self::$logger === null) {
            self::$logger = Logger::instance();
        }
        return self::$logger;
    }

    /**
     * Injects the logger used to report unsupported conversions.
     *
     * @param $logger an object exposing an errorLog($message) method (that is the only method called by this class)
     * @return void
     */
    public static function setLogger($logger)
    {
        self::$logger = $logger;
    }

    /**
     * Force usage as singleton.
     */
    protected function __construct()
    {
    }

    /**
     * Fills in the named conversion table, unless it has been built already.
     *
     * @param string $tableName
     * @return void
     *
     * @throws \Exception for unsupported $tableName
     *
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     *       Optimization creep: instead of building all those tables on load, keep them ready-made php files
     *       which are not even included until needed
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes
     *       from ISO/IEC 6429." Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in'])) {
                    // already built
                    return;
                }
                // entries 0..31 of the table: the control characters
                for ($i = 0; $i < 32; $i++) {
                    $this->xml_iso88591_Entities["in"][] = chr($i);
                    $this->xml_iso88591_Entities["out"][] = "&#{$i};";
                }

                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?

                // entries 32..127 of the table: bytes 160 to 255. Bytes 128 to 159 are deliberately left out
                for ($i = 160; $i < 256; $i++) {
                    $this->xml_iso88591_Entities["in"][] = chr($i);
                    $this->xml_iso88591_Entities["out"][] = "&#{$i};";
                }
                break;

            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/

            default:
                throw new \Exception('Unsupported table: ' . $tableName);
        }
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside of the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * When the source/destination pair is not supported, an error is logged and an empty string is returned.
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     * @return string
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or mb_string() where available
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie. isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                // no transcoding needed: only the xml special characters get escaped
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToEntities((string)$data, $conversion == 'UTF-8_US-ASCII');

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 table entries are the control chars, which were left untouched: skip them
                    $escapedData = str_replace(
                        array_slice($this->xml_iso88591_Entities['out'], 32),
                        array_slice($this->xml_iso88591_Entities['in'], 32),
                        $escapedData
                    );
                }
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->latin1ToUtf8($this->escapeXmlSpecialChars($data));
                break;

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                // the special chars have to be escaped first, as the replacements done next introduce '&' characters
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                $escapedData = '';
                /// @todo allow usage of a custom Logger via the DIC(ish) pattern we use in other classes
                $this->getLogger()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the five characters which have a special meaning in xml with the corresponding predefined entities.
     *
     * @param string $data
     * @return string
     */
    protected function escapeXmlSpecialChars($data)
    {
        return str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
    }

    /**
     * Converts a UTF-8 string to us-ascii xml: the xml special characters are replaced with the predefined entities,
     * and every multi-byte sequence with the decimal character reference of the code point it encodes.
     *
     * No validation of the UTF-8 is done, except that:
     * - bytes which can not start a sequence (10xxxxxx and 11111xxx) are dropped
     * - a multi-byte sequence cut short by the end of the string is dropped, instead of being decoded by reading
     *   past the end of the string
     *
     * @param string $data
     * @param bool $escapeControlChars when true, bytes 0 to 31 are emitted as character references; when false
     *                                 they are copied verbatim
     * @return string
     */
    protected function utf8ToEntities($data, $escapeControlChars)
    {
        /// @todo shall we replace this with a (supposedly) faster str_replace?
        $specialChars = array(34 => '&quot;', 38 => '&amp;', 39 => '&apos;', 60 => '&lt;', 62 => '&gt;');

        $escapedData = '';
        $ns = strlen($data);
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);

            // 7 bits in 1 byte: 0bbbbbbb (127)
            if ($ii < 32) {
                $escapedData .= $escapeControlChars ? sprintf('&#%d;', $ii) : $ch;
                continue;
            }
            if ($ii < 128) {
                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                $escapedData .= isset($specialChars[$ii]) ? $specialChars[$ii] : $ch;
                continue;
            }

            if ($ii >> 5 == 6) {
                // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $ii & 31;
            } elseif ($ii >> 4 == 14) {
                // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $ii & 15;
            } elseif ($ii >> 3 == 30) {
                // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $ii & 7;
            } else {
                // not a valid lead byte: skip it
                continue;
            }

            if ($nn + $extraBytes >= $ns) {
                // truncated sequence at the end of the string: nothing more can be decoded
                break;
            }

            // each continuation byte contributes its 6 low bits
            for ($k = 1; $k <= $extraBytes; $k++) {
                $codePoint = ($codePoint * 64) + (ord($data[$nn + $k]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $nn += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Transcodes a string from ISO-8859-1 to UTF-8.
     * mbstring is used when available; otherwise every byte in the range 128-255 is expanded by hand into the
     * two-byte UTF-8 sequence for the code point of the same value. utf8_encode() is avoided, as it is deprecated
     * since php 8.2.
     *
     * @param string $data
     * @return string
     */
    protected function latin1ToUtf8($data)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');
        }

        return preg_replace_callback(
            '/[\x80-\xFF]/',
            function ($match) {
                $byte = ord($match[0]);
                // 110000bb 10bbbbbb
                return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
            },
            $data
        );
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     * Charset names are compared case-insensitively, ignoring leading and trailing whitespace.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     * @return bool false is returned when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        $encoding = strtoupper(trim((string)$encoding));

        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim((string)$allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        // not listed verbatim: accept it if any of the allowed charsets is a superset of it
        if (array_key_exists($encoding, $this->charset_supersets)) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility.
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     * @return array the conversion table: an array with keys 'in' (raw characters) and 'out' (matching entities)
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is built lazily: make sure it is populated before handing it out
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
}
350