Passed
Push — master ( feb128...404971 )
by Gaetano
09:34
created

Charset::encodeEntities()   F

Complexity

Conditions 35
Paths 192

Size

Total Lines 169
Code Lines 101

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 57
CRAP Score 69.7222

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 35
eloc 101
c 1
b 0
f 0
nc 192
nop 3
dl 0
loc 169
rs 2.72
ccs 57
cts 82
cp 0.6951
crap 69.7222

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
8
 * @todo implement an interface
9
 */
10
class Charset
11
{
12
    protected static $logger;
13
14
    // tables used for transcoding different charsets into us-ascii xml
15
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
16
17
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
18
19
    protected $charset_supersets = array(
20
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
21
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
22
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
23
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
24
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
25
    );
26
27
    /** @var Charset $instance */
28
    protected static $instance = null;
29
30
    /**
     * Returns the shared Charset object, creating it on first use.
     *
     * This class is a singleton for performance reasons. The object is created with late static binding and stored
     * in a single static slot, so whichever class first calls this method determines the concrete type that every
     * later call gets back.
     *
     * @return Charset
     *
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     */
    public static function instance()
    {
        if (null !== self::$instance) {
            return self::$instance;
        }

        self::$instance = new static();

        return self::$instance;
    }
45
46 443
    /**
     * Returns the logger used to report conversion errors.
     *
     * When no logger has been injected via setLogger(), the default one obtained from Logger::instance() is stored
     * and reused for all later calls.
     *
     * @return mixed whatever was passed to setLogger(), or the result of Logger::instance()
     */
    public function getLogger()
    {
        if (null !== self::$logger) {
            return self::$logger;
        }

        self::$logger = Logger::instance();

        return self::$logger;
    }
53
54
    /**
     * Injects the logger which getLogger() will hand out from now on.
     *
     * The logger is stored in a static property, hence it is shared by every user of this class.
     *
     * @param $logger object to use for logging; passing null makes getLogger() fall back to the default logger
     * @return void
     */
    public static function setLogger($logger)
    {
        self::$logger = $logger;
    }
62
63
    /**
     * Not callable from outside the class hierarchy: use instance() to obtain the object.
     */
    protected function __construct()
    {
        // intentionally empty
    }
69 588
70
    /**
     * Fills, on first request, one of the character => xml-entity lookup tables of this object.
     *
     * The only table currently supported is 'xml_iso88591_Entities': its 'in' list holds the single-byte characters
     * 0-31 and 160-255, and its 'out' list the matching decimal character references ("&#0;" ... "&#255;").
     * Calling the method again once the table is filled is a no-op.
     *
     * @param string $tableName
     * @return void
     *
     * @throws \Exception for unsupported $tableName
     *
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     *       Optimization creep: instead of building all those tables on load, keep them ready-made php files
     *       which are not even included until needed
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes
     *       from ISO/IEC 6429." Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        // NB: loose comparison is used on purpose, as that is how a `switch` statement would match the name
        if ($tableName != 'xml_iso88591_Entities') {
            throw new \Exception('Unsupported table: ' . $tableName);
        }

        if (count($this->xml_iso88591_Entities['in']) > 0) {
            // already built
            return;
        }

        /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
        foreach (array_merge(range(0, 31), range(160, 255)) as $code) {
            $this->xml_iso88591_Entities['in'][] = chr($code);
            $this->xml_iso88591_Entities['out'][] = '&#' . $code . ';';
        }

        /* Disabled code, kept as reference for future cp1252 support:
        case 'xml_cp1252_Entities':
            if (count($this->xml_cp1252_Entities['in'])) {
                return;
            }
            for ($i = 128; $i < 160; $i++)
            {
                $this->xml_cp1252_Entities['in'][] = chr($i);
            }
            $this->xml_cp1252_Entities['out'] = array(
                '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                '&#x0152;', '?',        '&#x017D;', '?',
                '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                '&#x0153;', '?',        '&#x017E;', '&#x0178;'
            );
            $this->buildConversionTable('xml_iso88591_Entities');
            break;*/
    }
132
133
    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * When a conversion is not supported, or the mbstring conversion reports a failure, an error is logged and an
     * empty string is returned.
     *
     * @param string $data the text to encode. NB: static analysis notes that non-string values (eg. arrays) can
     *                     reach this method; only the conversions starting from UTF-8 and going through the
     *                     hand-coded decoder cast it to string
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, US-ASCII is used
     * @return string
     *
     * @todo do a bit of basic benchmarking: strtr vs. str_replace, str_replace vs htmlspecialchars, hand-coded conversion
     *       vs mbstring when that is enabled
     * @todo make use of iconv when it is available and mbstring is not
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie. isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        // in case there is transcoding going on, let's upscale to UTF8
        /// @todo we should do this as well when $srcEncoding == $destEncoding and the encoding is not supported by
        ///       htmlspecialchars
        if (!in_array($srcEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')) && $srcEncoding != $destEncoding &&
            function_exists('mb_convert_encoding')) {
            $data = mb_convert_encoding($data, 'UTF-8', str_replace('US-ASCII', 'ASCII', $srcEncoding));
            $srcEncoding = 'UTF-8';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                // no character needs transcoding: only the xml markup characters get escaped
                return $this->escapeXmlMarkupChars($data);

            case 'UTF-8_US-ASCII':
                return $this->utf8ToCharacterReferences($data, true);

            case 'UTF-8_ISO-8859-1':
                $escapedData = $this->utf8ToCharacterReferences($data, false);
                // when converting to latin-1, do not be so eager with using entities for characters 160-255:
                // turn them back into the corresponding single bytes (the first 32 table entries are chars 0-31)
                $this->buildConversionTable('xml_iso88591_Entities');

                return str_replace(
                    array_slice($this->xml_iso88591_Entities['out'], 32),
                    array_slice($this->xml_iso88591_Entities['in'], 32),
                    $escapedData
                );

            case 'ISO-8859-1_UTF-8':
                return $this->latin1ToUtf8($this->escapeXmlMarkupChars($data));

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');

                return str_replace(
                    $this->xml_iso88591_Entities['in'],
                    $this->xml_iso88591_Entities['out'],
                    $this->escapeXmlMarkupChars($data)
                );

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                if (!function_exists('mb_convert_encoding')) {
                    $this->getLogger()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");

                    return '';
                }

                $flags = defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES;

                // If reaching here, there are only 2 cases possible: UTF8->XXX or XXX->XXX
                // If src is UTF8, we run htmlspecialchars before converting to the target charset, as
                // htmlspecialchars has limited charset support, but it groks utf8
                $sourceIsUtf8 = ($srcEncoding === 'UTF-8');
                if ($sourceIsUtf8) {
                    $data = htmlspecialchars($data, $flags, 'UTF-8');
                }
                if ($srcEncoding !== $destEncoding) {
                    $data = mb_convert_encoding($data, str_replace('US-ASCII', 'ASCII', $destEncoding), str_replace('US-ASCII', 'ASCII', $srcEncoding));
                }
                // NB: static analysis considers this check always false; it is kept in case mbstring signals
                // a failed conversion by returning false instead of raising an error
                if ($data === false) {
                    $this->getLogger()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding via mbstring: failed...");

                    return '';
                }

                return $sourceIsUtf8 ? $data : htmlspecialchars($data, $flags, $destEncoding);
        }
    }

    /**
     * Replaces the five characters which have a special meaning in xml (& " ' < >) with the matching named entities.
     *
     * @param string $data
     * @return string
     */
    private function escapeXmlMarkupChars($data)
    {
        return str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
    }

    /**
     * Walks a UTF-8 string byte by byte, escaping xml markup characters and replacing every multi-byte sequence with
     * a decimal character reference ("&#NNN;").
     *
     * The input is not validated: a byte which can not start a sequence (stray continuation bytes, 0xF8-0xFF) is
     * silently dropped, and the bytes missing from a sequence truncated by the end of the string count as zero.
     *
     * @param string $data cast to string, to be kind to users creating string xmlrpc values out of different php types
     * @param bool $escapeControlChars when true, characters 0-31 are emitted as character references too
     * @return string
     */
    private function utf8ToCharacterReferences($data, $escapeControlChars)
    {
        $markup = array(34 => '&quot;', 38 => '&amp;', 39 => '&apos;', 60 => '&lt;', 62 => '&gt;');

        $data = (string)$data;
        $ns = strlen($data);
        // Padding makes the lookahead below safe on a sequence truncated by the end of the string: a NUL byte
        // contributes the same zero bits that an out-of-range read would, but without raising a warning.
        // The loop bound stays at the original length, so the padding itself is never emitted
        $data .= "\0\0\0";

        $escapedData = '';
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);
            if ($ii < 32) {
                // control characters
                $escapedData .= $escapeControlChars ? sprintf('&#%d;', $ii) : $ch;
            } elseif ($ii < 128) {
                // 7 bits in 1 byte: 0bbbbbbb (127)
                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                $escapedData .= isset($markup[$ii]) ? $markup[$ii] : $ch;
            } elseif ($ii >> 5 == 6) {
                // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                $b1 = ($ii & 31);
                $b2 = (ord($data[$nn + 1]) & 63);
                $escapedData .= sprintf('&#%d;', ($b1 * 64) + $b2);
                $nn += 1;
            } elseif ($ii >> 4 == 14) {
                // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                $b1 = ($ii & 15);
                $b2 = (ord($data[$nn + 1]) & 63);
                $b3 = (ord($data[$nn + 2]) & 63);
                $escapedData .= sprintf('&#%d;', ((($b1 * 64) + $b2) * 64) + $b3);
                $nn += 2;
            } elseif ($ii >> 3 == 30) {
                // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $b1 = ($ii & 7);
                $b2 = (ord($data[$nn + 1]) & 63);
                $b3 = (ord($data[$nn + 2]) & 63);
                $b4 = (ord($data[$nn + 3]) & 63);
                $escapedData .= sprintf('&#%d;', ((((($b1 * 64) + $b2) * 64) + $b3) * 64) + $b4);
                $nn += 3;
            }
        }

        return $escapedData;
    }

    /**
     * Transcodes a latin-1 string to UTF-8.
     *
     * mbstring is preferred when available, as utf8_encode() triggers a deprecation notice from php 8.2 onwards.
     *
     * @param string $data
     * @return string
     */
    private function latin1ToUtf8($data)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');
        }

        return utf8_encode($data);
    }
331
332
    /**
     * Lists the charsets handled natively (UTF-8, ISO-8859-1, US-ASCII) plus, when the mbstring extension is
     * available, every encoding it reports except for a fixed set of names excluded below.
     *
     * @return string[]
     */
    public function knownCharsets()
    {
        $nativeCharsets = array('UTF-8', 'ISO-8859-1', 'US-ASCII');

        if (!function_exists('mb_list_encodings')) {
            return $nativeCharsets;
        }

        // Junk which mbstring lists as encodings but which is not found in the IANA registry at
        // http://www.iana.org/assignments/character-sets/character-sets.xhtml
        $notInIanaRegistry = array(
            'pass', 'auto', 'wchar', 'BASE64', 'UUENCODE', 'ASCII', 'HTML-ENTITIES', 'Quoted-Printable',
            '7bit','8bit', 'byte2be', 'byte2le', 'byte4be', 'byte4le'
        );
        $mbstringCharsets = array_diff(mb_list_encodings(), $notInIanaRegistry);

        return array_unique(array_merge($nativeCharsets, $mbstringCharsets));
    }
348
349
    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     *
     * Charset names are compared case-insensitively, and whitespace surrounding the names is ignored (so that a list
     * such as 'UTF-8, ISO-8859-1' works as expected).
     *
     * @deprecated kept around for BC, as it is not in use by the lib
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }

        // Normalize once, and use the same form both for the direct lookup and as key into the supersets table
        // (whose keys and values are all uppercase)
        $encoding = strtoupper(trim($encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            $allowedCharsets[] = strtoupper(trim($allowed));
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        if (!array_key_exists($encoding, $this->charset_supersets)) {
            return false;
        }

        // the tested charset is acceptable if any of the allowed ones is a superset of it
        foreach ($allowedCharsets as $allowed) {
            if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                return true;
            }
        }

        return false;
    }
379
380
    /**
     * Used only for backwards compatibility.
     *
     * Returns the character => xml-entity conversion table for the given charset, as an array with two parallel
     * lists: 'in' (the raw characters) and 'out' (the matching character references). The table gets built on demand,
     * so that the result does not depend on a conversion having been carried out beforehand.
     *
     * @deprecated
     *
     * @param string $charset the only supported value is 'iso88591'
     * @return array
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table starts out empty and is only filled lazily: make sure it is there before handing it out
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
400
}
401