Issues (3627)

app/bundles/CoreBundle/Helper/UTF8Helper.php (8 issues)

1
<?php
2
3
/*
4
 * @copyright   2015 Mautic Contributors. All rights reserved
5
 * @author      Mautic
6
 *
7
 * @link        http://mautic.org
8
 *
9
 * @license     GNU/GPLv3 http://www.gnu.org/licenses/gpl-3.0.html
10
 */
11
12
/*
13
Copyright (c) 2008 Sebastián Grignoli
14
All rights reserved.
15
16
Redistribution and use in source and binary forms, with or without
17
modification, are permitted provided that the following conditions
18
are met:
19
1. Redistributions of source code must retain the above copyright
20
   notice, this list of conditions and the following disclaimer.
21
2. Redistributions in binary form must reproduce the above copyright
22
   notice, this list of conditions and the following disclaimer in the
23
   documentation and/or other materials provided with the distribution.
24
3. Neither the name of copyright holders nor the names of its
25
   contributors may be used to endorse or promote products derived
26
   from this software without specific prior written permission.
27
28
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
30
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
31
PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
32
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
*/
40
41
/*
42
 * @author   "Sebastián Grignoli" <[email protected]>
43
 *
44
 * @version  2.0
45
 *
46
 * @link     https://github.com/neitanod/forceutf8
47
 *
48
 * @example  https://github.com/neitanod/forceutf8
49
 *
50
 * @license  Revised BSD
51
 */
52
53
namespace Mautic\CoreBundle\Helper;
54
55
/**
 * Static helpers for repairing and converting text between UTF-8 and
 * Windows-1252 / ISO-8859-1 (port of the ForceUTF8 "Encoding" class).
 */
class UTF8Helper
{
    const ICONV_TRANSLIT = 'TRANSLIT';
    const ICONV_IGNORE   = 'IGNORE';
    const WITHOUT_ICONV  = '';

    /**
     * Windows-1252 byte value (0x80-0x9F range) => UTF-8 byte sequence.
     *
     * Byte values missing from this table fall back to the generic
     * single-byte conversion in latin1ByteToUtf8().
     *
     * @var array
     */
    protected static $win1252ToUtf8 = [
        128 => "\xe2\x82\xac",
        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",
        142 => "\xc5\xbd",
        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",
        158 => "\xc5\xbe",
        159 => "\xc5\xb8",
    ];

    /**
     * UTF-8 encoded C1 control character (what a Windows-1252 byte becomes
     * when it is converted as if it were ISO-8859-1) => intended UTF-8 sequence.
     *
     * @var array
     */
    protected static $brokenUtf8ToUtf8 = [
        "\xc2\x80" => "\xe2\x82\xac",
        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",
        "\xc2\x8e" => "\xc5\xbd",
        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",
        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8",
    ];

    /**
     * UTF-8 byte sequence => single Windows-1252 byte (inverse of $win1252ToUtf8).
     *
     * @var array
     */
    protected static $utf8ToWin1252 = [
        "\xe2\x82\xac" => "\x80",
        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92"     => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86"     => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0"     => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92"     => "\x8c",
        "\xc5\xbd"     => "\x8e",
        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c"     => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1"     => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93"     => "\x9c",
        "\xc5\xbe"     => "\x9e",
        "\xc5\xb8"     => "\x9f",
    ];

    /**
     * Function \ForceUTF8\Encoding::toUTF8.
     *
     * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
     *
     * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
     *
     * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
     *
     * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these:  ("group B")
     *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
     * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
     * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
     * is also a valid unicode character, and will be left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
     * 3) when any of these: ðñòó  are followed by THREE chars from group B.
     *
     * Arrays are converted element by element (recursively); any other
     * non-string value is returned untouched.
     *
     * @param mixed $text Any string, or an array of values to convert
     *
     * @return mixed The same string (or array), UTF8 encoded
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }

            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $length = self::strlen($text);
        $buf    = '';

        for ($i = 0; $i < $length; ++$i) {
            $c1   = $text[$i];
            $code = ord($c1);

            if ($code < 0x80) {
                // Plain ASCII: it doesn't need conversion.
                $buf .= $c1;
                continue;
            }

            if ($code < 0xc0) {
                // 0x80-0xBF outside of a multi-byte sequence: treat it as a
                // Windows-1252 special character when known, otherwise as Latin-1.
                $buf .= isset(self::$win1252ToUtf8[$code])
                    ? self::$win1252ToUtf8[$code]
                    : self::latin1ByteToUtf8($c1);
                continue;
            }

            // From here on the byte looks like a UTF-8 lead byte. Work out how
            // many continuation bytes (10xxxxxx) such a sequence would need.
            if ($code <= 0xdf) {
                $expected = 1; // looks like 2 bytes UTF8
            } elseif ($code <= 0xef) {
                $expected = 2; // looks like 3 bytes UTF8
            } elseif ($code <= 0xf7) {
                $expected = 3; // looks like 4 bytes UTF8
            } else {
                $expected = 0; // 0xF8-0xFF never starts a sequence
            }

            // Collect the sequence byte by byte (string offsets are not affected
            // by mbstring.func_overload); bail out on the first non-continuation
            // byte or when the input ends early.
            $sequence = $expected > 0 ? $c1 : null;
            for ($k = 1; null !== $sequence && $k <= $expected; ++$k) {
                $next = $i + $k < $length ? $text[$i + $k] : "\x00";
                if (0x80 === (ord($next) & 0xc0)) {
                    $sequence .= $next;
                } else {
                    $sequence = null;
                }
            }

            if (null !== $sequence) {
                // Almost certainly UTF-8 already: copy it over and skip its tail.
                $buf .= $sequence;
                $i += $expected;
            } else {
                // Not valid UTF-8: convert the lead byte on its own as Latin-1.
                $buf .= self::latin1ByteToUtf8($c1);
            }
        }

        return $buf;
    }

    /**
     * Converts UTF-8 text (or an array of texts, recursively) to Windows-1252.
     *
     * Non-string, non-array values are returned untouched.
     *
     * @param mixed  $text   String or array of values to convert
     * @param string $option One of the ICONV_* constants, or WITHOUT_ICONV
     *
     * @return mixed
     */
    public static function toWin1252($text, $option = self::WITHOUT_ICONV)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v, $option);
            }

            return $text;
        }

        if (is_string($text)) {
            return static::utf8_decode($text, $option);
        }

        return $text;
    }

    /**
     * Alias of toWin1252() using the default (non-iconv) conversion.
     *
     * @param mixed $text
     *
     * @return mixed
     */
    public static function toISO8859($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252() using the default (non-iconv) conversion.
     *
     * @param mixed $text
     *
     * @return mixed
     */
    public static function toLatin1($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Repairs text that was UTF-8 encoded more than once by repeatedly
     * decoding and re-encoding it until the result stops changing.
     *
     * Arrays are processed element by element (recursively); any other
     * non-string value is returned untouched, consistent with toUTF8()
     * and toWin1252().
     *
     * @param mixed  $text   String or array of values to repair
     * @param string $option One of the ICONV_* constants, or WITHOUT_ICONV
     *
     * @return mixed
     */
    public static function fixUTF8($text, $option = self::WITHOUT_ICONV)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v, $option);
            }

            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $last = '';
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(static::utf8_decode($text, $option));
        }

        return self::toUTF8(static::utf8_decode($text, $option));
    }

    /**
     * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
     * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
     *
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param string|array $text
     *
     * @return string|array
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF) if present.
     *
     * @param string $str
     *
     * @return string
     */
    public static function removeBOM($str = '')
    {
        if (substr($str, 0, 3) === "\xef\xbb\xbf") {
            $str = substr($str, 3);
        }

        return $str;
    }

    /**
     * Byte length of $text, even when mbstring.func_overload replaces strlen().
     *
     * @param string $text
     *
     * @return int
     */
    protected static function strlen($text)
    {
        $overloaded = function_exists('mb_strlen') && (((int) ini_get('mbstring.func_overload')) & 2);

        return $overloaded ? mb_strlen($text, '8bit') : strlen($text);
    }

    /**
     * Maps a loosely written encoding label to either 'ISO-8859-1' or 'UTF-8'.
     *
     * The label is upper-cased and stripped of everything except letters,
     * digits and whitespace before the lookup; unknown labels yield 'UTF-8'.
     *
     * @param string $encodingLabel
     *
     * @return string 'ISO-8859-1' or 'UTF-8'
     */
    public static function normalizeEncoding($encodingLabel)
    {
        $encoding     = strtoupper($encodingLabel);
        $encoding     = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
        $equivalences = [
            'ISO88591'    => 'ISO-8859-1',
            'ISO8859'     => 'ISO-8859-1',
            'ISO'         => 'ISO-8859-1',
            'LATIN1'      => 'ISO-8859-1',
            'LATIN'       => 'ISO-8859-1',
            'UTF8'        => 'UTF-8',
            'UTF'         => 'UTF-8',
            'WIN1252'     => 'ISO-8859-1',
            'WINDOWS1252' => 'ISO-8859-1',
        ];

        return isset($equivalences[$encoding]) ? $equivalences[$encoding] : 'UTF-8';
    }

    /**
     * Converts $text to the encoding named by $encodingLabel.
     *
     * Labels that normalize to ISO-8859-1 go through toLatin1(); everything
     * else goes through toUTF8().
     *
     * @param string $encodingLabel
     * @param mixed  $text
     *
     * @return mixed
     */
    public static function encode($encodingLabel, $text)
    {
        if ('ISO-8859-1' === self::normalizeEncoding($encodingLabel)) {
            return self::toLatin1($text);
        }

        return self::toUTF8($text);
    }

    /**
     * Decodes UTF-8 text to Windows-1252, via iconv when requested and available.
     *
     * Without iconv the text is first normalized with toUTF8(), the Windows-1252
     * special characters are swapped for their single-byte form and the rest is
     * handed to PHP's utf8_decode().
     *
     * NOTE(review): PHP's utf8_decode() is deprecated as of PHP 8.2; a replacement
     * needs to be chosen before it is removed.
     *
     * @param string $text
     * @param string $option One of the ICONV_* constants, or WITHOUT_ICONV
     *
     * @return string|false Result of utf8_decode() or iconv()
     */
    protected static function utf8_decode($text, $option)
    {
        // Loose comparisons on $option are kept on purpose so that callers
        // passing null/false still select the non-iconv path.
        if (self::WITHOUT_ICONV == $option || !function_exists('iconv')) {
            return utf8_decode(
                str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text))
            );
        }

        if (self::ICONV_TRANSLIT == $option) {
            $suffix = '//TRANSLIT';
        } elseif (self::ICONV_IGNORE == $option) {
            $suffix = '//IGNORE';
        } else {
            $suffix = '';
        }

        return iconv('UTF-8', 'Windows-1252'.$suffix, $text);
    }

    /**
     * Encodes a single byte (0x80-0xFF), read as ISO-8859-1, as a 2-byte UTF-8 sequence.
     *
     * Uses integer shifts/masks so that chr() never receives a float.
     *
     * @param string $byte Exactly one byte
     *
     * @return string Two bytes: 110000xx 10xxxxxx
     */
    protected static function latin1ByteToUtf8($byte)
    {
        $code = ord($byte);

        return chr(0xc0 | ($code >> 6)).chr(0x80 | ($code & 0x3f));
    }
}
367