Passed
Push — master ( 5d63c2...ca0b51 )
by Gaetano
10:02
created

Charset::encodeEntities()   D

Complexity

Conditions 25
Paths 48

Size

Total Lines 136
Code Lines 83

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 69
CRAP Score 26.2681

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 83
c 1
b 0
f 0
nc 48
nop 3
dl 0
loc 136
ccs 69
cts 79
cp 0.8734
crap 26.2681
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the long method into a new, well-named method.

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
 * Helper used to escape strings for inclusion in XML payloads, transcoding them between a small set of
 * supported charsets (US-ASCII, ISO-8859-1, UTF-8) along the way.
 *
 * @todo implement an interface
 */
class Charset
{
    /**
     * Table used for transcoding ISO-8859-1 into us-ascii xml.
     * 'in' holds the raw single-byte characters, 'out' the numeric character entity for the same index.
     * It is filled lazily by buildConversionTable(): bytes 0-31 first, then bytes 160-255.
     *
     * @var array
     */
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    /**
     * Map of charset name => list of charsets which are considered supersets of it.
     * NOTE(review): 'EUC-' looks like a truncated charset name - confirm the intended value before changing it.
     *
     * @var array
     */
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    /** @var Charset $instance */
    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new static();
        }

        return self::$instance;
    }

    /**
     * Force usage as singleton
     */
    protected function __construct()
    {
    }

    /**
     * Fills in (once) the conversion table identified by $tableName. Calling it again for a table which has already
     * been built is a no-op.
     *
     * @param string $tableName
     * @throws \Exception for unsupported $tableName
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
     *       Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in'])) {
                    // already built
                    return;
                }
                // control characters: indexes 0-31 of the table
                for ($i = 0; $i < 32; $i++) {
                    $this->xml_iso88591_Entities["in"][] = chr($i);
                    $this->xml_iso88591_Entities["out"][] = "&#{$i};";
                }

                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?

                // upper half of latin-1: indexes 32-127 of the table
                for ($i = 160; $i < 256; $i++) {
                    $this->xml_iso88591_Entities["in"][] = chr($i);
                    $this->xml_iso88591_Entities["out"][] = "&#{$i};";
                }
                break;

            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/

            default:
                throw new \Exception('Unsupported table: ' . $tableName);
        }
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside of the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * Charset names are matched case-insensitively. An unsupported source/destination pair is logged via Logger and
     * results in an empty string being returned.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or mb_string() where available
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie.isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                // no transcoding needed: the source is the same as, or a subset of, the destination
                return $this->escapeXmlSpecialChars($data);

            case 'UTF-8_US-ASCII':
                // be kind to users creating string xmlrpc values out of different php types
                return $this->utf8ToXmlEntities((string)$data, true);

            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToXmlEntities((string)$data, false);
                // when converting to latin-1, do not be so eager with using entities for characters 160-255:
                // turn those entities back into the corresponding single byte (table indexes 0-31 are control chars)
                $this->buildConversionTable('xml_iso88591_Entities');

                return str_replace(
                    array_slice($this->xml_iso88591_Entities['out'], 32),
                    array_slice($this->xml_iso88591_Entities['in'], 32),
                    $escapedData
                );

            case 'ISO-8859-1_UTF-8':
                return $this->latin1ToUtf8($this->escapeXmlSpecialChars($data));

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');

                // escape the xml special chars first, so that the '&' of the entities added next is left alone
                return str_replace(
                    $this->xml_iso88591_Entities['in'],
                    $this->xml_iso88591_Entities['out'],
                    $this->escapeXmlSpecialChars($data)
                );

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");

                return '';
        }
    }

    /**
     * Replaces the five characters which are special in XML (& " ' < >) with the corresponding predefined entity.
     *
     * @param string $data
     *
     * @return string
     */
    protected function escapeXmlSpecialChars($data)
    {
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Escapes a UTF-8 string for XML, turning every multi-byte sequence into a decimal numeric character entity.
     *
     * Bytes which can not start a UTF-8 sequence, as well as lead bytes which are not followed by enough bytes to
     * complete their sequence, are dropped from the output: the string is never read past its end.
     * Continuation bytes are not checked for well-formedness.
     *
     * @param string $data UTF-8 encoded text
     * @param bool $escapeControlChars when true, bytes 0-31 are emitted as numeric entities; when false, verbatim
     *
     * @return string
     */
    protected function utf8ToXmlEntities($data, $escapeControlChars)
    {
        // ascii code => predefined xml entity
        static $xmlSpecials = array(34 => '&quot;', 38 => '&amp;', 39 => '&apos;', 60 => '&lt;', 62 => '&gt;');

        $escapedData = '';
        $length = strlen($data);

        for ($pos = 0; $pos < $length; $pos++) {
            $char = $data[$pos];
            $byte = ord($char);

            // 7 bits in 1 byte: 0bbbbbbb (127)
            if ($byte < 32) {
                $escapedData .= $escapeControlChars ? sprintf('&#%d;', $byte) : $char;
                continue;
            }
            if ($byte < 128) {
                $escapedData .= isset($xmlSpecials[$byte]) ? $xmlSpecials[$byte] : $char;
                continue;
            }

            // the lead byte tells how many continuation bytes follow, and carries the top bits of the code point
            if ($byte >> 5 == 6) {
                // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $byte & 31;
            } elseif ($byte >> 4 == 14) {
                // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $byte & 15;
            } elseif ($byte >> 3 == 30) {
                // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $byte & 7;
            } else {
                // not a valid lead byte: skip it
                continue;
            }

            if ($pos + $extraBytes >= $length) {
                // truncated sequence: skip the lead byte instead of reading beyond the end of the string
                continue;
            }

            // each continuation byte contributes its lowest 6 bits
            for ($offset = 1; $offset <= $extraBytes; $offset++) {
                $codePoint = ($codePoint * 64) + (ord($data[$pos + $offset]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $pos += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8: every byte is taken as the Unicode code point of the same value.
     * Used in place of utf8_encode(), which is deprecated as of PHP 8.2.
     *
     * @param string $data
     *
     * @return string
     */
    protected function latin1ToUtf8($data)
    {
        $data = (string)$data;
        $converted = '';
        $length = strlen($data);

        for ($pos = 0; $pos < $length; $pos++) {
            $byte = ord($data[$pos]);
            if ($byte < 128) {
                $converted .= $data[$pos];
            } else {
                // code points 128-255 take 2 bytes in UTF-8: 110000bb 10bbbbbb
                $converted .= chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
            }
        }

        return $converted;
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     * Charset names are compared case-insensitively, ignoring whitespace around them.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets). Any other type
     *                                makes the check fail
     *
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        $encoding = strtoupper(trim((string)$encoding));

        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            if (is_scalar($allowed)) {
                $allowedCharsets[] = strtoupper(trim((string)$allowed));
            }
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        // not listed explicitly: accept it if any of the listed charsets is a known superset of it
        if (isset($this->charset_supersets[$encoding])) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility.
     * Returns the conversion table for the given charset, building it first if needed.
     * @deprecated
     *
     * @param string $charset
     *
     * @return array
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is built lazily: make sure callers never get it empty
                $this->buildConversionTable('xml_iso88591_Entities');

                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
}
325