Passed
Push — master ( c27693...5d63c2 )
by Gaetano
10:40
created

Charset::encodeEntities()   D

Complexity

Conditions 28
Paths 32

Size

Total Lines 134
Code Lines 84

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 71
CRAP Score 29.8906

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 28
eloc 84
c 1
b 0
f 0
nc 32
nop 3
dl 0
loc 134
rs 4.1666
ccs 71
cts 82
cp 0.8659
crap 29.8906

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
/**
8
 * @todo implement an interface
9
 */
10
class Charset
11
{
12
    // tables used for transcoding different charsets into us-ascii xml
13
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
14
15
    /// @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
16
    ///       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
17
    ///       (though no luck when receiving them...)
18
    ///       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
19
    ///       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
20
    ///       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
21
    ///       Check what mbstring/iconv do by default with those?
22
    //
23
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
24
25
    protected $charset_supersets = array(
26
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
27
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
28
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
29
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
30
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
31
    );
32
33
    /** @var Charset $instance */
34
    protected static $instance = null;
35
36
    /**
37
     * This class is singleton for performance reasons.
38
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
39
     *
40
     * @return Charset
41
     */
42 605
    public static function instance()
    {
        // Fast path: the shared object has already been created by a previous call.
        if (self::$instance !== null) {
            return self::$instance;
        }

        // First call: build the object once and cache it in the static slot.
        // `new static()` (late static binding) instantiates the class the method was called on.
        self::$instance = new static();

        return self::$instance;
    }
50
51
    /**
52
     * Force usage as singleton
53
     */
54 383
    protected function __construct()
    {
        // Intentionally empty: the constructor only exists to be non-public, so that objects are obtained
        // through instance() rather than with `new` from outside the class hierarchy.
    }
57
58
    /**
59
     * @param string $tableName
60
     * @throws \Exception for unsupported $tableName
61
     */
62 494
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                // The table is filled lazily and only once: a non-empty 'in' list means the work is already done.
                if (count($this->xml_iso88591_Entities['in']) > 0) {
                    return;
                }

                // Bytes 0-31 and 160-255 are paired with their numeric character reference.
                // Bytes 32-159 get no entry in the table.
                $byteValues = array_merge(range(0, 31), range(160, 255));
                foreach ($byteValues as $byteValue) {
                    $this->xml_iso88591_Entities['in'][] = chr($byteValue);
                    $this->xml_iso88591_Entities['out'][] = '&#' . $byteValue . ';';
                }
                break;

            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/

            default:
                // Only the tables listed above exist; anything else is a programming error in the caller.
                throw new \Exception('Unsupported table: ' . $tableName);
        }
    }
103
104
    /**
105
     * Convert a string to the correct XML representation in a target charset.
106
     * This involves:
107
     * - character transformation for all characters which have a different representation in source and dest charsets
108
     * - using 'charset entity' representation for all characters which are outside of the target charset
109
     *
110
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
111
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
112
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
113
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
114
     *
115
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
116
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
117
     *
118
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
119
     * @todo make usage of iconv() or mb_string() where available
120
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list)
121
     *
122
     * @param string $data
123
     * @param string $srcEncoding
124
     * @param string $destEncoding
125
     *
126
     * @return string
127
     */
128 586
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        // Loose comparison on purpose: callers passing null (or any empty value) get the library default too.
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        // Charset names are matched case-insensitively; an empty destination is handled like us-ascii below.
        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                // latin-1 to ascii: escape the xml special chars, then replace every byte listed in the
                // latin-1 table with its numeric character reference
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                // utf8_encode() is deprecated as of PHP 8.2: use mbstring when it is loaded, and only fall
                // back to the legacy function otherwise. Both perform the same latin-1 => utf-8 transcoding.
                if (function_exists('mb_convert_encoding')) {
                    $escapedData = mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');
                } else {
                    $escapedData = utf8_encode($escapedData);
                }
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
            //case 'CP1252_CP1252':
                // source is the same as, or a subset of, the destination: only the xml special chars need escaping
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                $toLatin1 = ($conversion == 'UTF-8_ISO-8859-1');

                // be kind to users creating string xmlrpc values out of different php types
                // Control characters (0-31) are only turned into entities when the target is us-ascii.
                $escapedData = $this->utf8ToEntities((string)$data, !$toLatin1);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($toLatin1) {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 table entries are the control chars: skip them, map '&#160;'...'&#255;' back to bytes
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            /*
            case 'CP1252_':
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                // Unknown charset pair: nothing is converted, the problem is logged and an empty string is returned.
                $escapedData = '';
                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replaces the five characters which are special in XML (& " ' < >) with their predefined entities.
     *
     * @param string $data
     *
     * @return string
     */
    protected function escapeXmlSpecialChars($data)
    {
        return str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
    }

    /**
     * Walks a UTF-8 string byte by byte and returns an ascii-only (bar control chars, see below) XML-safe version:
     * xml special chars become predefined entities, every multi-byte sequence becomes a decimal character reference.
     *
     * Invalid input is handled leniently: stray continuation bytes and invalid lead bytes are skipped, and a
     * multi-byte sequence cut short by the end of the string is dropped instead of being read past the end of the
     * data. Continuation bytes are not checked for the 10bbbbbb pattern.
     *
     * @param string $data UTF-8 encoded text
     * @param bool $escapeControlChars when true, bytes 0-31 are emitted as '&#N;', otherwise they are copied as-is
     *
     * @return string
     */
    protected function utf8ToEntities($data, $escapeControlChars)
    {
        $specialChars = array('"' => '&quot;', '&' => '&amp;', "'" => '&apos;', '<' => '&lt;', '>' => '&gt;');

        $escapedData = '';
        $ns = strlen($data);
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);

            // 7 bits: 0bbbbbbb (127)
            if ($ii < 128) {
                if ($ii < 32) {
                    $escapedData .= $escapeControlChars ? sprintf('&#%d;', $ii) : $ch;
                } else {
                    /// @todo shall we replace this with a (supposedly) faster str_replace?
                    $escapedData .= isset($specialChars[$ch]) ? $specialChars[$ch] : $ch;
                }
                continue;
            }

            // The lead byte tells how many continuation bytes follow and carries the topmost payload bits
            if ($ii >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $extra = 1;
                $codePoint = $ii & 31;
            } elseif ($ii >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $extra = 2;
                $codePoint = $ii & 15;
            } elseif ($ii >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extra = 3;
                $codePoint = $ii & 7;
            } else {
                // stray continuation byte (10bbbbbb) or invalid lead byte (11111bbb): skipped
                continue;
            }

            // Sequence truncated by the end of the data: stop here rather than reading undefined string offsets
            if ($nn + $extra >= $ns) {
                break;
            }

            // Each continuation byte contributes its 6 low bits
            for ($k = 1; $k <= $extra; $k++) {
                $codePoint = ($codePoint * 64) + (ord($data[$nn + $k]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $nn += $extra;
        }

        return $escapedData;
    }
263
264
    /**
265
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
266
     * in the list.
267
     *
268
     * @param string $encoding charset to be tested
269
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
270
     *
271
     * @return bool
272
     */
273
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        // Anything which is neither a string nor an array can not contain a valid charset
        if (!is_array($validList)) {
            return false;
        }

        // Charset names are compared case-insensitively. Blanks around the items of a comma separated list
        // (f.e. 'ISO-8859-1, UTF-8') are not significant.
        $encoding = strtoupper(trim((string)$encoding));
        $validList = array_map('strtoupper', array_map('trim', $validList));

        // Direct hit: the charset itself is in the list
        if (in_array($encoding, $validList, true)) {
            return true;
        }

        // Otherwise the charset is still acceptable if any allowed charset is a known superset of it
        if (array_key_exists($encoding, $this->charset_supersets)) {
            foreach ($validList as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }
292
293
    /**
294
     * Used only for backwards compatibility
295
     * @deprecated
296
     *
297
     * @param string $charset
298
     *
299
     * @return array
300
     *
301
     * @throws \Exception for unknown/unsupported charsets
302
     */
303
    public function getEntities($charset)
    {
        //trigger_error('Method ' . __METHOD__ . ' is deprecated', E_USER_DEPRECATED);

        switch ($charset)
        {
            case 'iso88591':
                // The table is built lazily: populate it first, otherwise callers which never triggered a
                // latin-1 conversion before would receive two empty arrays.
                $this->buildConversionTable('xml_iso88591_Entities');

                // Array with keys 'in' (raw bytes) and 'out' (the matching '&#N;' references)
                return $this->xml_iso88591_Entities;
            default:
                throw new \Exception('Unsupported charset: ' . $charset);
        }
    }
315
}
316