Passed
Push — master ( ca0b51...526b0d )
by Gaetano
11:52 queued 09:18
created

Charset::encodeEntities()   D

Complexity

Conditions 25
Paths 48

Size

Total Lines 136
Code Lines 83

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 70
CRAP Score 26.5652

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 83
c 1
b 0
f 0
nc 48
nop 3
dl 0
loc 136
ccs 70
cts 81
cp 0.8642
crap 26.5652
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
8
 * @todo implement an interface
9
 */
10
class Charset
11
{
12
    // Tables used for transcoding different charsets into us-ascii xml.
    // 'in' holds raw single-byte characters and 'out' the matching numeric character references, index-aligned.
    // Both lists start out empty and are filled on demand by buildConversionTable().
13
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
14
15
    // Disabled placeholder for a cp1252 table (cp1252 support is still a @todo of buildConversionTable())
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
16
17
    // Maps a charset name to the list of charsets it is a valid subset of; read by isValidCharset().
    // NOTE(review): 'EUC-' looks like a truncated charset name, and 'ISO-8859-12' was never published as a
    // standard - confirm whether these entries are intentional.
    protected $charset_supersets = array(
18
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
19
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
20
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
21
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
22
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
23
    );
24
25
    /** @var Charset $instance the shared instance handed out by instance(); null until first requested */
26
    protected static $instance = null;
27
28
    /**
     * Returns the shared Charset object, creating it the first time it is requested.
     *
     * A single instance is kept around for performance reasons.
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        // `new static()` uses late static binding: the object is of the class instance() was called on
        self::$instance = new static();

        return self::$instance;
    }
42
43
    /**
     * Not public, to force usage as singleton: outside code has to go through instance().
     */
    protected function __construct()
    {
        // nothing to initialize: the conversion tables are built lazily by buildConversionTable()
    }
49
50
    /**
     * Lazily fills one of the character => xml entity lookup tables.
     *
     * For 'xml_iso88591_Entities' the table gets one entry per byte value in the ranges 0-31 and 160-255: the raw
     * character goes into the 'in' list and its decimal character reference ('&#NNN;') into the 'out' list, at the
     * same index. Calling the method again once the table is filled does nothing.
     *
     * @param string $tableName only 'xml_iso88591_Entities' is supported at the moment
     * @throws \Exception for unsupported $tableName
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
     *       Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        if ($tableName != 'xml_iso88591_Entities') {
            throw new \Exception('Unsupported table: ' . $tableName);
        }

        if (!empty($this->xml_iso88591_Entities['in'])) {
            // already built by a previous call
            return;
        }

        /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
        $byteValues = array_merge(range(0, 31), range(160, 255));
        foreach ($byteValues as $byteValue) {
            $this->xml_iso88591_Entities['in'][] = chr($byteValue);
            $this->xml_iso88591_Entities['out'][] = '&#' . $byteValue . ';';
        }

        /* Sketch of the (currently disabled) cp1252 table, kept for when cp1252 support is added:
        case 'xml_cp1252_Entities':
            if (count($this->xml_cp1252_Entities['in'])) {
                return;
            }
            for ($i = 128; $i < 160; $i++)
            {
                $this->xml_cp1252_Entities['in'][] = chr($i);
            }
            $this->xml_cp1252_Entities['out'] = array(
                '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                '&#x0152;', '?',        '&#x017D;', '?',
                '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                '&#x0153;', '?',        '&#x017E;', '&#x0178;'
            );
            $this->buildConversionTable('xml_iso88591_Entities');
            break;*/
    }
107
108
    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside of the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * Supported conversions: any of UTF-8, ISO-8859-1, US-ASCII to itself; US-ASCII to UTF-8 or ISO-8859-1;
     * UTF-8 to US-ASCII or ISO-8859-1; ISO-8859-1 to UTF-8 or US-ASCII. Encoding names are matched
     * case-insensitively. For any other combination an error is logged and an empty string is returned.
     *
     * Malformed UTF-8 input is not fully validated: bytes which can not start a sequence are skipped, and a
     * multi-byte sequence which is cut short by the end of the string is dropped.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or mb_string() where available
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie.isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for CP1252 as source charset (needs a cp1252 table in buildConversionTable)
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        // NB: loose comparisons are kept here, so that null is accepted as well as ''
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
                // every source character exists unchanged in the target charset: only xml escaping is needed
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToXmlEntities((string)$data, $conversion == 'UTF-8_US-ASCII');

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 table entries are the control chars 0-31, which are left alone here
                    $escapedData = str_replace(
                        array_slice($this->xml_iso88591_Entities['out'], 32),
                        array_slice($this->xml_iso88591_Entities['in'], 32),
                        $escapedData
                    );
                }
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->latin1ToUtf8($this->escapeXmlSpecialChars($data));
                break;

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            default:
                $escapedData = '';
                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the 5 characters which have a predefined entity in xml (& " ' < >) with that entity.
     *
     * @param string $data
     * @return string
     */
    private function escapeXmlSpecialChars($data)
    {
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Turns a UTF-8 string into pure ascii xml text: xml special characters become named entities and every
     * multi-byte character becomes a decimal character reference ('&#NNN;').
     *
     * @param string $data UTF-8 encoded text
     * @param bool $escapeControlChars when true, bytes 0-31 are emitted as character references too; when false
     *                                 they are copied verbatim
     * @return string
     */
    private function utf8ToXmlEntities($data, $escapeControlChars)
    {
        $xmlSpecials = array(34 => '&quot;', 38 => '&amp;', 39 => '&apos;', 60 => '&lt;', 62 => '&gt;');

        $escapedData = '';
        $ns = strlen($data);
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);

            if ($ii < 32) {
                $escapedData .= $escapeControlChars ? sprintf('&#%d;', $ii) : $ch;
                continue;
            }

            // 7 bits in 1 byte: 0bbbbbbb (127)
            if ($ii < 128) {
                $escapedData .= isset($xmlSpecials[$ii]) ? $xmlSpecials[$ii] : $ch;
                continue;
            }

            if ($ii >> 5 == 6) {
                // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $ii & 31;
            } elseif ($ii >> 4 == 14) {
                // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $ii & 15;
            } elseif ($ii >> 3 == 30) {
                // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $ii & 7;
            } else {
                // a byte which can not start a sequence (stray continuation byte, or 0xF8 and up): skip it
                continue;
            }

            if ($nn + $extraBytes >= $ns) {
                // the sequence is cut short by the end of the string: drop it instead of reading past the end
                break;
            }

            // each continuation byte contributes its 6 lower bits
            for ($kk = 1; $kk <= $extraBytes; $kk++) {
                $codePoint = ($codePoint * 64) + (ord($data[$nn + $kk]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $nn += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Converts ISO-8859-1 text to UTF-8 without relying on utf8_encode(), which is deprecated as of php 8.2.
     *
     * @param string $data ISO-8859-1 encoded text
     * @return string
     */
    private function latin1ToUtf8($data)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');
        }

        // no mbstring: every byte N >= 128 is code point N, which takes 2 bytes in UTF-8 (110000bb 10bbbbbb)
        $data = (string)$data;
        $converted = '';
        $length = strlen($data);
        for ($i = 0; $i < $length; $i++) {
            $byte = ord($data[$i]);
            $converted .= ($byte < 128) ? $data[$i] : chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
        }

        return $converted;
    }
272
273
    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     *
     * Charset names are compared case-insensitively, and blanks around the names in $validList are ignored.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets). Any other type
     *                                is treated as an empty list
     *
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        // normalize both sides once, so that the direct match and the superset lookup agree on casing
        $encoding = strtoupper($encoding);
        $validList = array_map('strtoupper', array_map('trim', $validList));

        if (in_array($encoding, $validList, true)) {
            return true;
        }

        if (!array_key_exists($encoding, $this->charset_supersets)) {
            return false;
        }

        // $encoding is acceptable if at least one of the allowed charsets is a superset of it
        foreach ($validList as $allowed) {
            if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                return true;
            }
        }

        return false;
    }
301
302
    /**
     * Returns the character => xml entity conversion table for a charset.
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     *
     * @return array with two index-aligned lists: 'in' (raw characters) and 'out' (the matching xml entities)
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is built lazily: make sure it is populated, otherwise callers get two empty lists
                // unless some conversion happened to run before
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
324
}
325