Charset::encodeEntities()   F
last analyzed

Complexity

Conditions 37
Paths 248

Size

Total Lines 178
Code Lines 107

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 68
CRAP Score 59.0911

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 37
eloc 107
c 2
b 0
f 0
nc 248
nop 3
dl 0
loc 178
ccs 68
cts 91
cp 0.7473
crap 59.0911
rs 2.1866

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\Exception\ValueErrorException;
6
use PhpXmlRpc\PhpXmlRpc;
7
use PhpXmlRpc\Traits\DeprecationLogger;
8
9
/**
10
 * @todo implement an interface
11
 */
12
class Charset
13
{
14
    use DeprecationLogger;
15
16
    // Lookup table used for transcoding ISO-8859-1 text into us-ascii xml: 'in' holds the raw single-byte characters,
    // 'out' the matching numeric character references. It starts empty and is filled by buildConversionTable().
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    // For each charset used as key, the list of charsets which are supersets of it. Used by isValidCharset().
    // NB: 'EUC-' (not a charset name, most likely a truncated 'EUC-TW') and 'ISO-8859-12' (a part of ISO-8859 which
    // was never published) are kept only for backwards compatibility; 'EUC-TW' and 'ISO-8859-16' were added, as
    // both are ASCII-compatible.
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-TW', 'EUC-KR', 'EUC-CN',),
    );

    /** @var Charset|null $instance the singleton, null until instance() is called for the first time */
    protected static $instance = null;
31
32
    /**
     * Returns the shared Charset object, creating it the first time it is asked for.
     *
     * The class is used as a singleton for performance reasons. Late static binding decides which class gets
     * instantiated by the very first call; every later call returns that same object.
     *
     * @return Charset
     *
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     */
    public static function instance()
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new static();

        return self::$instance;
    }
47
48 443
    /**
     * Not publicly callable, to force usage of this class as a singleton: get hold of the object via instance().
     */
    protected function __construct()
    {
        // intentionally empty
    }
54
55
    /**
     * Fills in, on demand, one of the character-to-entity lookup tables kept by this object.
     *
     * For 'xml_iso88591_Entities' the table gets one entry for each of the bytes 0-31 and then 160-255 (in that
     * order), mapping the raw byte ('in') to its decimal numeric character reference ('out'). Calling this again
     * once the table has been filled does nothing.
     *
     * @param string $tableName the only table supported at the moment is 'xml_iso88591_Entities'
     * @return void
     * @throws ValueErrorException for unsupported $tableName
     *
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     *       Optimization creep: instead of building all those tables on load, keep them ready-made php files
     *       which are not even included until needed
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes
     *       from ISO/IEC 6429." Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in']) > 0) {
                    // already built
                    return;
                }

                // NB: the order matters, as encodeEntities() skips the first 32 entries (the control characters)
                // when it only needs the mapping for bytes 160-255
                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                $byteValues = array_merge(range(0, 31), range(160, 255));
                foreach ($byteValues as $byteValue) {
                    $this->xml_iso88591_Entities['in'][] = chr($byteValue);
                    $this->xml_iso88591_Entities['out'][] = '&#' . $byteValue . ';';
                }
                break;

            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/

            default:
                throw new ValueErrorException('Unsupported table: ' . $tableName);
        }
    }
116
117
    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * When the source is UTF-8 and the target is US-ASCII or ISO-8859-1, invalid input is handled leniently: bytes
     * which can not start a character are dropped, and so is a multibyte sequence which is cut short by the end of
     * the string (no attempt is made to read past the last byte). Continuation bytes are not validated.
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     * @return string the escaped text. An empty string is returned, and an error logged, when the conversion is not
     *                supported or when it fails
     *
     * @todo do a bit of basic benchmarking: strtr vs. str_replace, str_replace vs htmlspecialchars, hand-coded conversion
     *       vs mbstring when that is enabled
     * @todo make use of iconv when it is available and mbstring is not
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie. isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        // in case there is transcoding going on, let's upscale to UTF8
        /// @todo we should do this as well when $srcEncoding == $destEncoding and the encoding is not supported by
        ///       htmlspecialchars
        if (!in_array($srcEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')) && $srcEncoding != $destEncoding &&
            function_exists('mb_convert_encoding')) {
            $data = mb_convert_encoding($data, 'UTF-8', str_replace('US-ASCII', 'ASCII', $srcEncoding));
            $srcEncoding = 'UTF-8';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                // every source character exists, with the same byte representation, in the target charset
                return $this->escapeXmlSpecialChars($data);

            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // the cast is to be kind to users creating string xml-rpc values out of different php types
                $escapedData = $this->utf8ToEntities((string)$data, $conversion == 'UTF-8_US-ASCII');

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 entries of the table are the control characters, which were left alone above
                    $escapedData = str_replace(
                        array_slice($this->xml_iso88591_Entities['out'], 32),
                        array_slice($this->xml_iso88591_Entities['in'], 32),
                        $escapedData
                    );
                }

                return $escapedData;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                /// @todo if on php >= 8.2, prefer using mbstring or iconv. Also: suppress the warning!
                if (function_exists('mb_convert_encoding')) {
                    return mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');
                }

                return utf8_encode($escapedData);

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = $this->escapeXmlSpecialChars($data);

                return str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                // If reaching here, there are only 2 cases possible: UTF8->XXX or XXX->XXX
                return $this->encodeEntitiesViaMbstring($data, $srcEncoding, $destEncoding);
        }
    }

    /**
     * Replaces the 5 characters which have a special meaning in XML with the corresponding predefined entity.
     * No charset conversion takes place.
     *
     * @param string $data
     * @return string
     */
    protected function escapeXmlSpecialChars($data)
    {
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Walks a UTF-8 string byte by byte, replacing every multibyte character with its decimal numeric character
     * reference, and the 5 XML special characters with the corresponding predefined entity.
     *
     * @param string $data UTF-8 text. Bytes which can not start a character, and a sequence truncated by the end of
     *                     the string, are dropped
     * @param bool $escapeControlChars when true, bytes 0-31 are turned into numeric character references as well;
     *                                 when false they are copied over unchanged
     * @return string
     */
    protected function utf8ToEntities($data, $escapeControlChars)
    {
        $xmlSpecialChars = array('"' => '&quot;', '&' => '&amp;', "'" => '&apos;', '<' => '&lt;', '>' => '&gt;');

        $escapedData = '';
        $ns = strlen($data);
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);

            // 7 bits in 1 byte: 0bbbbbbb (127)
            if ($ii < 32) {
                $escapedData .= $escapeControlChars ? sprintf('&#%d;', $ii) : $ch;
                continue;
            }
            if ($ii < 128) {
                /// @todo shall we replace this with a (supposedly) faster str_replace?
                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                $escapedData .= isset($xmlSpecialChars[$ch]) ? $xmlSpecialChars[$ch] : $ch;
                continue;
            }

            // Multibyte character: the lead byte tells how many continuation bytes (10bbbbbb) follow, and carries
            // the most significant bits of the code point
            if ($ii >> 5 == 6) {
                // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $ii & 31;
            } elseif ($ii >> 4 == 14) {
                // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $ii & 15;
            } elseif ($ii >> 3 == 30) {
                // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $ii & 7;
            } else {
                // a continuation byte found where a character should start, or a byte >= 0xF8: skip it
                continue;
            }

            if ($nn + $extraBytes >= $ns) {
                // the string ends in the middle of a character: drop the incomplete tail instead of reading
                // beyond the end of the string (which would trigger php warnings and generate a bogus entity)
                break;
            }

            for ($kk = 1; $kk <= $extraBytes; $kk++) {
                $codePoint = ($codePoint * 64) + (ord($data[$nn + $kk]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $nn += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Escapes and transcodes text using mbstring, for the charset pairs which have no dedicated handling in
     * encodeEntities(). At this point the source is either UTF-8, or identical to the target charset.
     *
     * @param string $data
     * @param string $srcEncoding
     * @param string $destEncoding
     * @return string an empty string when mbstring is not available or the conversion fails (an error is logged)
     */
    protected function encodeEntitiesViaMbstring($data, $srcEncoding, $destEncoding)
    {
        // log messages keep naming the public method the caller invoked
        $caller = __CLASS__ . '::encodeEntities';

        if (!function_exists('mb_convert_encoding')) {
            $this->getLogger()->error('XML-RPC: ' . $caller . ": Converting from $srcEncoding to $destEncoding: not supported...");

            return '';
        }

        $flags = defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES;
        $isUtf8Source = ($srcEncoding === 'UTF-8');

        // If src is UTF8, we run htmlspecialchars before converting to the target charset, as
        // htmlspecialchars has limited charset support, but it groks utf8
        if ($isUtf8Source) {
            $data = htmlspecialchars($data, $flags, 'UTF-8');
        }

        if ($srcEncoding !== $destEncoding) {
            try {
                // php 7.4 and lower: a warning is generated. php 8.0 and up: an Error is thrown. So much for BC...
                $data = @mb_convert_encoding($data, str_replace('US-ASCII', 'ASCII', $destEncoding), str_replace('US-ASCII', 'ASCII', $srcEncoding));
            } catch (\ValueError $e) {
                $data = false;
            }
        }

        if ($data === false) {
            $this->getLogger()->error('XML-RPC: ' . $caller . ": Converting from $srcEncoding to $destEncoding via mbstring: failed...");

            return '';
        }

        return $isUtf8Source ? $data : htmlspecialchars($data, $flags, $destEncoding);
    }
324
325
    /**
     * Lists the charsets this class knows about: UTF-8, ISO-8859-1 and US-ASCII, plus, when the mbstring extension
     * is available, the encodings it reports (minus a fixed list of names which are not IANA charsets).
     *
     * @return string[]
     */
    public function knownCharsets()
    {
        $builtinCharsets = array('UTF-8', 'ISO-8859-1', 'US-ASCII');

        if (!function_exists('mb_list_encodings')) {
            return $builtinCharsets;
        }

        // Add all charsets which mbstring can handle, but remove junk not found in IANA registry at
        // http://www.iana.org/assignments/character-sets/character-sets.xhtml
        $notInIanaRegistry = array(
            'pass', 'auto', 'wchar', 'BASE64', 'UUENCODE', 'ASCII', 'HTML-ENTITIES', 'Quoted-Printable',
            '7bit','8bit', 'byte2be', 'byte2le', 'byte4be', 'byte4le'
        );
        $mbstringCharsets = array_diff(mb_list_encodings(), $notInIanaRegistry);

        return array_unique(array_merge($builtinCharsets, $mbstringCharsets));
    }
341
342
    // *** BC layer ***
343
344
    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     * Charset names are compared case-insensitively, and whitespace around them is ignored (so that a list such as
     * 'UTF-8, ISO-8859-1' works as expected).
     * @deprecated kept around for BC, as it is not in use by the lib
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        $this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');

        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }

        // Normalize both sides once: previously the charset was uppercased for the lookup in the list but not for
        // the lookup in the supersets table, and the elements of the list were never normalized at all
        $encoding = strtoupper(trim($encoding));
        $validList = array_map('strtoupper', array_map('trim', $validList));

        if (in_array($encoding, $validList, true)) {
            return true;
        }

        if (!array_key_exists($encoding, $this->charset_supersets)) {
            return false;
        }

        // the charset is acceptable if any of the allowed ones is a superset of it
        return count(array_intersect($validList, $this->charset_supersets[$encoding])) > 0;
    }
374
375
    /**
     * Returns the lookup table used to transcode a charset into us-ascii xml.
     * Used only for backwards compatibility (the .inc shims).
     * @deprecated
     *
     * @param string $charset the only supported value is 'iso88591'
     * @return array with two elements, 'in' and 'out': the list of raw characters and the list of the matching
     *               numeric character references
     * @throws ValueErrorException for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        $this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');

        switch ($charset)
        {
            case 'iso88591':
                // The table is built lazily: make sure it has been filled in before handing it out, otherwise the
                // caller gets two empty arrays unless an encoding operation happened to need the table earlier on
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new ValueErrorException('Unsupported charset: ' . $charset);
        }
    }
395
}
396