Passed
Push — master ( da2c1e...110169 )
by Gaetano
04:55 queued 01:13
created

Charset::encodeEntities()   D

Complexity

Conditions 25
Paths 32

Size

Total Lines 136
Code Lines 88

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 72
CRAP Score 28.2025

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 88
c 1
b 0
f 0
nc 32
nop 3
dl 0
loc 136
ccs 72
cts 87
cp 0.8276
crap 28.2025
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
class Charset
{
    // tables used for transcoding different charsets into us-ascii xml
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    /// @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
    ///       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
    ///       (though no luck when receiving them...)
    ///       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
    ///       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
    ///       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
    ///       Check what mbstring/iconv do by default with those?
    //
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    // Maps a charset name to the list of charsets that are supersets of it. Used by isValidCharset().
    // NOTE(review): 'EUC-' looks like a truncated charset name (possibly 'EUC-TW') - confirm before changing it.
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    protected static $instance = null;

    /**
     * This class is singleton for performance reasons.
     * @todo can't we just make $xml_iso88591_Entities a static variable instead ?
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Force usage as singleton
     */
    protected function __construct()
    {
    }

    /**
     * Lazily fills in the named conversion table. Calling it again for an already built table is a no-op.
     *
     * @param string $tableName name of the table to build; only 'xml_iso88591_Entities' is supported
     * @throws \Exception for unsupported $tableName
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in'])) {
                    // already built
                    return;
                }
                // bytes 0-31 first, then bytes 160-255: the order matters, since encodeEntities() relies on the
                // first 32 entries of the table being the ones for bytes 0-31
                foreach (array_merge(range(0, 31), range(160, 255)) as $code) {
                    $this->xml_iso88591_Entities["in"][] = chr($code);
                    $this->xml_iso88591_Entities["out"][] = "&#{$code};";
                }
                break;
            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/
            default:
                throw new \Exception('Unsupported table: ' . $tableName);
        }
    }

    /**
     * Replaces the five characters which are special in XML (& " ' < >) with the corresponding predefined entities.
     *
     * @param string $data
     *
     * @return string
     */
    protected function escapeXmlSpecialChars($data)
    {
        return str_replace(
            array('&', '"', "'", '<', '>'),
            array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'),
            $data
        );
    }

    /**
     * Converts a UTF-8 string into us-ascii xml: the five xml special characters are replaced by their predefined
     * entities, all other ascii characters are copied as-is and every multi-byte sequence is replaced by the
     * decimal numeric entity of its code point.
     *
     * Continuation bytes are not validated. Bytes which can not start a sequence (10xxxxxx, 11111xxx) are dropped.
     * A multi-byte sequence which is cut short by the end of the string is dropped too, instead of reading past
     * the end of the string.
     *
     * @param string $data
     *
     * @return string
     */
    protected function utf8ToEntities($data)
    {
        $specialChars = array('"' => '&quot;', '&' => '&amp;', "'" => '&apos;', '<' => '&lt;', '>' => '&gt;');

        $escapedData = '';
        $length = strlen($data);
        for ($pos = 0; $pos < $length; $pos++) {
            $char = $data[$pos];
            $byte = ord($char);

            // 7 bits: 0bbbbbbb (127)
            if ($byte < 128) {
                $escapedData .= isset($specialChars[$char]) ? $specialChars[$char] : $char;
                continue;
            }

            if ($byte >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = ($byte & 31);
            } elseif ($byte >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = ($byte & 15);
            } elseif ($byte >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = ($byte & 7);
            } else {
                // not a valid lead byte: skip it
                continue;
            }

            if ($pos + $extraBytes >= $length) {
                // the sequence is truncated by the end of the string: nothing meaningful is left to decode
                break;
            }

            // each continuation byte contributes its 6 low bits
            for ($offset = 1; $offset <= $extraBytes; $offset++) {
                $codePoint = ($codePoint * 64) + (ord($data[$pos + $offset]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $pos += $extraBytes;
        }

        return $escapedData;
    }

    /**
     * Convert a string to the correct XML representation in a target charset.
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are
     * bound by RFC 3023 to emit strict us-ascii.
     *
     * Unsupported source/destination combinations are logged via Logger and result in an empty string.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or recode_string() or mb_string() where available
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, us-ascii output is produced
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                // utf8_encode() is deprecated as of PHP 8.2: prefer mbstring whenever it is available
                if (function_exists('mb_convert_encoding')) {
                    $escapedData = mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');
                } else {
                    $escapedData = utf8_encode($escapedData);
                }
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
            //case 'CP1252_CP1252':
                // no transcoding needed: only the xml special characters have to be escaped
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToEntities((string)$data);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 entries of the table are for bytes 0-31: skip them
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            /*
            case 'CP1252_':
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                $escapedData = '';
                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     *
     * Charset names are compared case-insensitively, and whitespace around the names in the list is ignored.
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     *
     * @return bool false also when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        // normalize both sides, so that the direct match and the superset lookup behave the same way
        $encoding = strtoupper(trim($encoding));
        $validList = array_map('strtoupper', array_map('trim', $validList));

        if (in_array($encoding, $validList)) {
            return true;
        }

        if (array_key_exists($encoding, $this->charset_supersets)) {
            foreach ($validList as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding])) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     *
     * @return array the conversion table, with keys 'in' (raw characters) and 'out' (matching numeric entities)
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // the table is built lazily: make sure it is filled in before handing it out
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }

}
310