Passed
Push — master ( feb128...404971 )
by Gaetano
09:34
created

Charset::isValidCharset()   A

Complexity

Conditions 6
Paths 8

Size

Total Lines 19
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 0
Metric Value
cc 6
eloc 10
c 0
b 0
f 0
nc 8
nop 2
dl 0
loc 19
rs 9.2222
ccs 0
cts 0
cp 0
crap 42
1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
8
 * @todo implement an interface
9
 */
10
class Charset
11
{
12
    protected static $logger;
13
14
    // tables used for transcoding different charsets into us-ascii xml
15
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
16
17
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
18
19
    protected $charset_supersets = array(
20
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
21
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
22
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
23
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
24
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
25
    );
26
27
    /** @var Charset $instance */
28
    protected static $instance = null;
29
30
    /**
     * Returns the shared Charset object, creating it the first time it is asked for.
     *
     * The class is used as a singleton for performance reasons: the object is kept in the static $instance
     * property and handed out again on every later call.
     *
     * @return Charset
     *
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     */
    public static function instance()
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        // late static binding: a subclass calling instance() first gets an object of its own type
        self::$instance = new static();

        return self::$instance;
    }
45
46 443
    /**
     * Returns the logger used by this class.
     *
     * The logger lives in a static property. When none has been stored yet (see setLogger()), the result of
     * Logger::instance() is stored and returned.
     *
     * @return mixed the value passed to setLogger(), or the result of Logger::instance()
     */
    public function getLogger()
    {
        if (self::$logger !== null) {
            return self::$logger;
        }

        self::$logger = Logger::instance();

        return self::$logger;
    }
53
54
    /**
     * Stores the logger which getLogger() will hand out from now on.
     *
     * The value is kept in a static property, so it applies to the whole class and not to a single object.
     *
     * @param mixed $logger NB: encodeEntities() calls errorLog() on the object returned by getLogger()
     * @return void
     */
    public static function setLogger($logger)
    {
        self::$logger = $logger;
    }
62
63
    /**
     * Not public on purpose, to force usage as singleton: objects are to be obtained via instance().
     */
    protected function __construct()
    {
    }
69 588
70
    /**
     * Fills in, on demand, one of the 'character => xml entity' tables held by this object.
     *
     * For 'xml_iso88591_Entities', the 'in' list receives the single-byte characters 0-31 and 160-255 (in that
     * order) and the 'out' list the matching numeric character references ("&#N;"). Bytes 32-159 are not added.
     * Nothing is done when the table already has entries.
     *
     * @param string $tableName the only supported value is 'xml_iso88591_Entities'
     * @return void
     *
     * @throws \Exception for unsupported $tableName
     *
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     *       Optimization creep: instead of building all those tables on load, keep them ready-made php files
     *       which are not even included until needed
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes
     *       from ISO/IEC 6429." Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        // loose comparison on purpose: same matching rules as a switch/case on $tableName
        if ($tableName != 'xml_iso88591_Entities') {
            /* Not enabled (yet): table for cp1252
            case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;
            */

            throw new \Exception('Unsupported table: ' . $tableName);
        }

        // already built: nothing to do
        if (count($this->xml_iso88591_Entities['in']) > 0) {
            return;
        }

        /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
        $byteValues = array_merge(range(0, 31), range(160, 255));
        foreach ($byteValues as $byteValue) {
            $this->xml_iso88591_Entities['in'][] = chr($byteValue);
            $this->xml_iso88591_Entities['out'][] = "&#{$byteValue};";
        }
    }
132
133
    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * Handling of malformed UTF-8 input (when converting from UTF-8 to US-ASCII or ISO-8859-1):
     * - a multi-byte sequence cut short by the end of the string is decoded as if the missing bytes carried no bits;
     *   no read past the end of the string takes place
     * - bytes which can not start a sequence (0x80-0xBF, 0xF8-0xFF) are skipped
     * - continuation bytes are not validated
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     * @return string an empty string is returned (and an error is logged) when the conversion is not supported or
     *                mbstring fails at carrying it out
     *
     * @todo do a bit of basic benchmarking: strtr vs. str_replace, str_replace vs htmlspecialchars, hand-coded conversion
     *       vs mbstring when that is enabled
     * @todo make use of iconv when it is available and mbstring is not
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie. isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        // in case there is transcoding going on, let's upscale to UTF8
        /// @todo we should do this as well when $srcEncoding == $destEncoding and the encoding is not supported by
        ///       htmlspecialchars
        if (!in_array($srcEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')) && $srcEncoding != $destEncoding &&
            function_exists('mb_convert_encoding')) {
            $data = mb_convert_encoding($data, 'UTF-8', str_replace('US-ASCII', 'ASCII', $srcEncoding));
            $srcEncoding = 'UTF-8';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                break;

            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                $escapedData = '';
                // be kind to users creating string xmlrpc values out of different php types
                $data = (string)$data;
                $ns = strlen($data);
                // Continuation bytes are read from a copy padded with 3 NUL bytes (the longest possible lookahead),
                // so that a sequence truncated by the end of the string never triggers an out-of-range offset read.
                // A NUL byte contributes no bits once masked with 63
                $padded = $data . "\0\0\0";
                for ($nn = 0; $nn < $ns; $nn++) {
                    $ch = $data[$nn];
                    $ii = ord($ch);
                    // 7 bits in 1 byte: 0bbbbbbb (127)
                    if ($ii < 32) {
                        if ($conversion == 'UTF-8_US-ASCII') {
                            $escapedData .= sprintf('&#%d;', $ii);
                        } else {
                            $escapedData .= $ch;
                        }
                    }
                    else if ($ii < 128) {
                        /// @todo shall we replace this with a (supposedly) faster str_replace?
                        /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                        switch ($ii) {
                            case 34:
                                $escapedData .= '&quot;';
                                break;
                            case 38:
                                $escapedData .= '&amp;';
                                break;
                            case 39:
                                $escapedData .= '&apos;';
                                break;
                            case 60:
                                $escapedData .= '&lt;';
                                break;
                            case 62:
                                $escapedData .= '&gt;';
                                break;
                            default:
                                $escapedData .= $ch;
                        } // switch
                    } // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                    elseif ($ii >> 5 == 6) {
                        $b1 = ($ii & 31);
                        $b2 = (ord($padded[$nn + 1]) & 63);
                        $ii = ($b1 * 64) + $b2;
                        $escapedData .= sprintf('&#%d;', $ii);
                        $nn += 1;
                    } // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                    elseif ($ii >> 4 == 14) {
                        $b1 = ($ii & 15);
                        $b2 = (ord($padded[$nn + 1]) & 63);
                        $b3 = (ord($padded[$nn + 2]) & 63);
                        $ii = ((($b1 * 64) + $b2) * 64) + $b3;
                        $escapedData .= sprintf('&#%d;', $ii);
                        $nn += 2;
                    } // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                    elseif ($ii >> 3 == 30) {
                        $b1 = ($ii & 7);
                        $b2 = (ord($padded[$nn + 1]) & 63);
                        $b3 = (ord($padded[$nn + 2]) & 63);
                        $b4 = (ord($padded[$nn + 3]) & 63);
                        $ii = ((((($b1 * 64) + $b2) * 64) + $b3) * 64) + $b4;
                        $escapedData .= sprintf('&#%d;', $ii);
                        $nn += 3;
                    }
                    // any other byte (0x80-0xBF, 0xF8-0xFF) can not start a UTF-8 sequence, and is skipped
                }

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 elements of the table are for characters 0-31, the remaining ones for 160-255
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // utf8_encode is deprecated as of php 8.2: prefer mbstring whenever it is available
                /// @todo make use of iconv when it is available and mbstring is not
                if (function_exists('mb_convert_encoding')) {
                    $escapedData = mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');
                } else {
                    $escapedData = utf8_encode($escapedData);
                }
                break;

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                if (function_exists('mb_convert_encoding')) {
                    // If reaching here, there are only 2 cases possible: UTF8->XXX or XXX->XXX
                    // If src is UTF8, we run htmlspecialchars before converting to the target charset, as
                    // htmlspecialchars has limited charset support, but it groks utf8
                    if ($srcEncoding === 'UTF-8') {
                        $data = htmlspecialchars($data,  defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES, 'UTF-8');
                    }
                    if ($srcEncoding !== $destEncoding) {
                        $data = mb_convert_encoding($data, str_replace('US-ASCII', 'ASCII', $destEncoding), str_replace('US-ASCII', 'ASCII', $srcEncoding));
                    }
                    if ($data === false) {
                        $escapedData = '';
                        $this->getLogger()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding via mbstring: failed...");
                    } else {
                        if ($srcEncoding === 'UTF-8') {
                            // escaping was already taken care of, before the charset conversion
                            $escapedData = $data;
                        } else {
                            $escapedData = htmlspecialchars($data, defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES, $destEncoding);
                        }
                    }
                } else {
                    $escapedData = '';
                    $this->getLogger()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
                }
        }

        return $escapedData;
    }
331
332
    /**
     * Lists the charsets known to this class.
     *
     * UTF-8, ISO-8859-1 and US-ASCII are always part of the list. When mb_list_encodings() is available, the
     * encodings it reports are added as well, except for a fixed set of names which are not found in the IANA
     * registry at http://www.iana.org/assignments/character-sets/character-sets.xhtml
     *
     * @return string[] NB: duplicates are removed via array_unique, which preserves keys - do not rely on the
     *                  keys of the returned array being consecutive
     */
    public function knownCharsets()
    {
        $charsets = array('UTF-8', 'ISO-8859-1', 'US-ASCII');

        if (!function_exists('mb_list_encodings')) {
            return $charsets;
        }

        // names which mbstring can handle but which are not in the IANA registry
        $notInIanaRegistry = array(
            'pass', 'auto', 'wchar', 'BASE64', 'UUENCODE', 'ASCII', 'HTML-ENTITIES', 'Quoted-Printable',
            '7bit','8bit', 'byte2be', 'byte2le', 'byte4be', 'byte4le'
        );
        $fromMbstring = array_diff(mb_list_encodings(), $notInIanaRegistry);

        return array_unique(array_merge($charsets, $fromMbstring));
    }
348
349
    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     *
     * Charset names are compared case-insensitively, and whitespace around them is ignored, so that both
     * 'utf-8' and lists such as 'UTF-8, ISO-8859-1' work as expected.
     *
     * @deprecated kept around for BC, as it is not in use by the lib
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }

        // normalize all names the same way before comparing them
        $encoding = strtoupper(trim((string)$encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim((string)$allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        // not listed verbatim: accept it if any of its known supersets is listed
        foreach ($this->charset_supersets as $subset => $supersets) {
            if (strtoupper($subset) !== $encoding) {
                continue;
            }
            foreach ($supersets as $superset) {
                if (in_array(strtoupper($superset), $allowedCharsets, true)) {
                    return true;
                }
            }
        }

        return false;
    }
379
380
    /**
     * Returns the 'character => xml entity' conversion table for a charset.
     * Used only for backwards compatibility.
     * @deprecated
     *
     * @param string $charset the only supported value is 'iso88591'
     * @return array with keys 'in' (list of characters) and 'out' (list of the matching xml entities)
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is only filled in on demand: make sure that callers never get it empty
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
400
}
401