Completed
Push — master ( 5fc0c8...b5d242 )
by Gaetano
06:30
created

Charset::__construct()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 17
Code Lines 7

Duplication

Lines 8
Ratio 47.06 %

Importance

Changes 0
Metric Value
cc 3
eloc 7
nc 4
nop 0
dl 8
loc 17
rs 9.4285
c 0
b 0
f 0
1
<?php
2
3
namespace PhpXmlRpc\Helper;
4
5
use PhpXmlRpc\PhpXmlRpc;
6
7
class Charset
8
{
9
    // tables used for transcoding different charsets into us-ascii xml
10
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
11
    protected $xml_iso88591_utf8 = array("in" => array(), "out" => array());
12
13
    /// @todo add to iso table the characters from cp_1252 range, i.e. 128 to 159?
14
    /// These will NOT be present in true ISO-8859-1, but will save the unwary
15
    /// windows user from sending junk (though no luck when receiving them...)
16
    /*
0 ignored issues
show
Unused Code Comprehensibility introduced by
65% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
17
    protected $xml_cp1252_Entities = array('in' => array(), 'out' => array(
18
        '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
19
        '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
20
        '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
21
        '&#x0152;', '?',        '&#x017D;', '?',
22
        '?',        '&#x2018;', '&#x2019;', '&#x201C;',
23
        '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
24
        '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
25
        '&#x0153;', '?',        '&#x017E;', '&#x0178;'
26
    ));
27
    */
28
29
    protected $charset_supersets = array(
30
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
31
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
32
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
33
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
34
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
35
    );
36
37
    protected static $instance = null;
38
39
    /**
     * Return the shared Charset object, creating it on first use.
     *
     * The class is a singleton for performance reasons: the constructor fills the
     * transcoding tables, and that work is done only once this way.
     *
     * @return Charset
     */
    public static function instance()
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new self();

        return self::$instance;
    }
52
53
    /**
     * Fill the ISO-8859-1 transcoding table.
     *
     * Every byte in the ranges 0-31 and 160-255 is appended to the "in" list and its decimal
     * numeric character reference ("&#N;") to the "out" list, at the same position.
     *
     * The order matters: the 32 entries for bytes 0-31 come first, and encodeEntities() skips
     * exactly those with array_slice(..., 32) to get at the entries for bytes 160-255.
     *
     * Bytes 128-159 are deliberately left out (see the commented-out $xml_cp1252_Entities table).
     */
    private function __construct()
    {
        // each pair is (first byte, one past the last byte); both ranges share the same mapping logic
        $ranges = array(array(0, 32), array(160, 256));

        foreach ($ranges as $range) {
            for ($i = $range[0]; $i < $range[1]; $i++) {
                $this->xml_iso88591_Entities["in"][] = chr($i);
                $this->xml_iso88591_Entities["out"][] = "&#{$i};";
            }
        }

        /// @todo if the cp1252 table is ever enabled, its "in" list has to be filled here with chr(128) to chr(159)
    }
70
71
    /**
     * Convert a string to the correct XML representation in a target charset.
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are
     * bound by RFC 3023 to emit strict us-ascii.
     *
     * Supported source encodings are ISO-8859-1, US-ASCII and UTF-8; supported destination encodings are '',
     * US-ASCII, ISO-8859-1 and UTF-8 (names are matched case-insensitively). An empty destination is handled
     * exactly like US-ASCII. For any other combination an error is logged and an empty string is returned.
     *
     * @todo do a bit of basic benchmarking (strtr vs. str_replace)
     * @todo make usage of iconv() or recode_string() or mb_string() where available
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding
     *
     * @return string
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);
        switch ($conversion) {
            case 'ISO-8859-1_':
            case 'ISO-8859-1_US-ASCII':
                $escapedData = $this->escapeXmlSpecialChars($data);
                // control chars and chars 160-255 become numeric entities
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = utf8_encode($this->escapeXmlSpecialChars($data));
                break;

            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_':
            case 'US-ASCII_ISO-8859-1':
            case 'UTF-8_UTF-8':
            //case 'CP1252_CP1252':
                // no transcoding needed: only the characters that are special in XML get escaped
                $escapedData = $this->escapeXmlSpecialChars($data);
                break;

            case 'UTF-8_':
            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // be kind to users creating string xmlrpc values out of different php types
                $escapedData = $this->utf8ToEntities((string)$data);

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    // the first 32 table entries are the control chars 0-31: those stay encoded as entities
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            /*
            case 'CP1252_':
            case 'CP1252_US-ASCII':
                $escapedData = $this->escapeXmlSpecialChars($data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $escapedData = $this->escapeXmlSpecialChars($data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf8_encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $escapedData = $this->escapeXmlSpecialChars($data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                $escapedData = '';
                error_log('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
        }

        return $escapedData;
    }

    /**
     * Replace the five characters that are special in XML (& " ' < >) with their predefined entities.
     *
     * @param string $data
     *
     * @return string
     */
    private function escapeXmlSpecialChars($data)
    {
        return str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
    }

    /**
     * Turn a UTF-8 string into us-ascii: XML special characters become predefined entities, every
     * multi-byte sequence becomes a decimal numeric character reference.
     *
     * The input is not validated: bytes following a lead byte are taken as continuation bytes without
     * checking them, and bytes that can not start a sequence (stray continuation bytes, 0xF8 and above)
     * are silently skipped. A multi-byte sequence cut short by the end of the string is dropped, so that
     * no byte past the end of the data is ever read.
     *
     * @param string $data
     *
     * @return string
     */
    private function utf8ToEntities($data)
    {
        // ascii codes of the XML special characters, mapped to their predefined entities
        $specialChars = array(34 => '&quot;', 38 => '&amp;', 39 => '&apos;', 60 => '&lt;', 62 => '&gt;');

        $escapedData = '';
        $ns = strlen($data);
        for ($nn = 0; $nn < $ns; $nn++) {
            $ch = $data[$nn];
            $ii = ord($ch);

            // 7 bits: 0bbbbbbb (127)
            if ($ii < 128) {
                $escapedData .= isset($specialChars[$ii]) ? $specialChars[$ii] : $ch;
                continue;
            }

            if ($ii >> 5 == 6) {
                // 11 bits: 110bbbbb 10bbbbbb (2047)
                $extraBytes = 1;
                $codePoint = $ii & 31;
            } elseif ($ii >> 4 == 14) {
                // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
                $extraBytes = 2;
                $codePoint = $ii & 15;
            } elseif ($ii >> 3 == 30) {
                // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                $extraBytes = 3;
                $codePoint = $ii & 7;
            } else {
                // not a valid lead byte: skip it
                continue;
            }

            if ($nn + $extraBytes >= $ns) {
                // truncated sequence at the end of the data: stop instead of reading beyond the string
                break;
            }

            // each continuation byte contributes its 6 low bits
            for ($k = 1; $k <= $extraBytes; $k++) {
                $codePoint = ($codePoint * 64) + (ord($data[$nn + $k]) & 63);
            }
            $escapedData .= sprintf('&#%d;', $codePoint);
            $nn += $extraBytes;
        }

        return $escapedData;
    }
222
223
    /**
     * Checks if a given charset encoding is present in a list of encodings or
     * if it is a valid subset of any encoding in the list.
     *
     * Charset names are compared case-insensitively, and whitespace around them is ignored
     * (so that a list such as "UTF-8, ISO-8859-1" works as expected).
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     *
     * @return bool false also when $validList is neither a string nor an array
     */
    public function isValidCharset($encoding, $validList)
    {
        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }
        if (!is_array($validList)) {
            return false;
        }

        // normalize both sides the same way: the keys and values of charset_supersets are all upper case
        $encoding = strtoupper(trim((string)$encoding));
        $allowedCharsets = array();
        foreach ($validList as $allowed) {
            if (is_string($allowed)) {
                $allowedCharsets[] = strtoupper(trim($allowed));
            }
        }

        if (in_array($encoding, $allowedCharsets, true)) {
            return true;
        }

        // not listed directly: accept it if any allowed charset is a known superset of it
        if (isset($this->charset_supersets[$encoding])) {
            foreach ($allowedCharsets as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }
251
252
    /**
     * Give access to the internal transcoding table of a charset.
     *
     * Used only for backwards compatibility
     * @deprecated
     *
     * @param string $charset the only recognized value is 'iso88591'
     *
     * @return array the table, made of the two parallel lists 'in' (characters) and 'out' (their numeric entities)
     *
     * @throws \Exception for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        // loose comparison on purpose: it is what a switch statement would do
        if ($charset == 'iso88591') {
            return $this->xml_iso88591_Entities;
        }

        throw new \Exception('Unsupported charset: ' . $charset);
    }
272
273
}
274