Passed
Branch master (6c65a4)
by Christian
16:31
created

CharsetConverter   F

Complexity

Total Complexity 217

Size/Duplication

Total Lines 1224
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
dl 0
loc 1224
rs 0.6314
c 0
b 0
f 0
wmc 217

21 Methods

Rating   Name   Duplication   Size   Complexity  
A convArray() 0 7 4
B conv() 0 20 7
A specCharsToASCII() 0 11 3
D euc_char_mapping() 0 31 9
C utf8_char_mapping() 0 31 7
A sb_char_mapping() 0 17 4
B entities_to_utf8() 0 30 6
A crop() 0 11 4
A parse_charset() 0 7 2
D utf8_char2byte_pos() 0 37 10
C initCharset() 0 55 16
A convCaseFirst() 0 8 2
B utf8CharToUnumber() 0 25 5
C UnumberToChar() 0 35 7
C initCaseFolding() 0 52 16
B utf8_to_entities() 0 36 6
F initUnicodeData() 0 227 71
B utf8_to_numberarray() 0 40 6
D initToASCII() 0 31 9
C utf8_encode() 0 51 12
C utf8_decode() 0 59 11

How to fix   Complexity   

Complex Class

Complex classes like CharsetConverter often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CharsetConverter, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace TYPO3\CMS\Core\Charset;
3
4
/*
5
 * This file is part of the TYPO3 CMS project.
6
 *
7
 * It is free software; you can redistribute it and/or modify it under
8
 * the terms of the GNU General Public License, either version 2
9
 * of the License, or any later version.
10
 *
11
 * For the full copyright and license information, please read the
12
 * LICENSE.txt file that was distributed with this source code.
13
 *
14
 * The TYPO3 project - inspiring people to share!
15
 */
16
17
use TYPO3\CMS\Core\SingletonInterface;
18
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21
/**
22
 * Notes on UTF-8
23
 *
24
 * Functions working on UTF-8 strings:
25
 *
26
 * - strchr/strstr
27
 * - strrchr
28
 * - substr_count
29
 * - implode/explode/join
30
 *
31
 * Functions nearly working on UTF-8 strings:
32
 *
33
 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
34
 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
35
 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
36
 *
37
 * Functions NOT working on UTF-8 strings:
38
 *
39
 * - str*cmp
40
 * - stristr
41
 * - stripos
42
 * - substr
43
 * - strrev
44
 * - split/spliti
45
 * - ...
46
 */
47
48
/**
49
 * Class for conversion between charsets
50
 */
51
class CharsetConverter implements SingletonInterface
52
{
53
    /**
54
     * ASCII Value for chars with no equivalent.
55
     *
56
     * @var int
57
     */
58
    public $noCharByteVal = 63;
59
60
    /**
61
     * This is the array where parsed conversion tables are stored (cached)
62
     *
63
     * @var array
64
     */
65
    public $parsedCharsets = [];
66
67
    /**
68
     * An array where case folding data will be stored (cached)
69
     *
70
     * @var array
71
     */
72
    public $caseFolding = [];
73
74
    /**
75
     * An array where charset-to-ASCII mappings are stored (cached)
76
     *
77
     * @var array
78
     */
79
    public $toASCII = [];
80
81
    /**
82
     * This tells the converter which charsets have two bytes per char:
83
     *
84
     * @var array
85
     */
86
    public $twoByteSets = [
87
        'ucs-2' => 1
88
    ];
89
90
    /**
91
     * This tells the converter which charsets use a scheme like the Extended Unix Code:
92
     *
93
     * @var array
94
     */
95
    public $eucBasedSets = [
96
        'gb2312' => 1, // Chinese, simplified.
97
        'big5' => 1, // Chinese, traditional.
98
        'euc-kr' => 1, // Korean
99
        'shift_jis' => 1
100
    ];
101
102
    /**
103
     * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
104
     * @link http://czyborra.com/charsets/iso8859.html
105
     *
106
     * @var array
107
     */
108
    public $synonyms = [
109
        'us' => 'ascii',
110
        'us-ascii' => 'ascii',
111
        'cp819' => 'iso-8859-1',
112
        'ibm819' => 'iso-8859-1',
113
        'iso-ir-100' => 'iso-8859-1',
114
        'iso-ir-101' => 'iso-8859-2',
115
        'iso-ir-109' => 'iso-8859-3',
116
        'iso-ir-110' => 'iso-8859-4',
117
        'iso-ir-144' => 'iso-8859-5',
118
        'iso-ir-127' => 'iso-8859-6',
119
        'iso-ir-126' => 'iso-8859-7',
120
        'iso-ir-138' => 'iso-8859-8',
121
        'iso-ir-148' => 'iso-8859-9',
122
        'iso-ir-157' => 'iso-8859-10',
123
        'iso-ir-179' => 'iso-8859-13',
124
        'iso-ir-199' => 'iso-8859-14',
125
        'iso-ir-203' => 'iso-8859-15',
126
        'csisolatin1' => 'iso-8859-1',
127
        'csisolatin2' => 'iso-8859-2',
128
        'csisolatin3' => 'iso-8859-3',
129
        'csisolatin5' => 'iso-8859-9',
130
        'csisolatin8' => 'iso-8859-14',
131
        'csisolatin9' => 'iso-8859-15',
132
        'csisolatingreek' => 'iso-8859-7',
133
        'iso-celtic' => 'iso-8859-14',
134
        'latin1' => 'iso-8859-1',
135
        'latin2' => 'iso-8859-2',
136
        'latin3' => 'iso-8859-3',
137
        'latin5' => 'iso-8859-9',
138
        'latin6' => 'iso-8859-10',
139
        'latin8' => 'iso-8859-14',
140
        'latin9' => 'iso-8859-15',
141
        'l1' => 'iso-8859-1',
142
        'l2' => 'iso-8859-2',
143
        'l3' => 'iso-8859-3',
144
        'l5' => 'iso-8859-9',
145
        'l6' => 'iso-8859-10',
146
        'l8' => 'iso-8859-14',
147
        'l9' => 'iso-8859-15',
148
        'cyrillic' => 'iso-8859-5',
149
        'arabic' => 'iso-8859-6',
150
        'tis-620' => 'iso-8859-11',
151
        'win874' => 'windows-874',
152
        'win1250' => 'windows-1250',
153
        'win1251' => 'windows-1251',
154
        'win1252' => 'windows-1252',
155
        'win1253' => 'windows-1253',
156
        'win1254' => 'windows-1254',
157
        'win1255' => 'windows-1255',
158
        'win1256' => 'windows-1256',
159
        'win1257' => 'windows-1257',
160
        'win1258' => 'windows-1258',
161
        'cp1250' => 'windows-1250',
162
        'cp1251' => 'windows-1251',
163
        'cp1252' => 'windows-1252',
164
        'ms-ee' => 'windows-1250',
165
        'ms-ansi' => 'windows-1252',
166
        'ms-greek' => 'windows-1253',
167
        'ms-turk' => 'windows-1254',
168
        'winbaltrim' => 'windows-1257',
169
        'koi-8ru' => 'koi-8r',
170
        'koi8r' => 'koi-8r',
171
        'cp878' => 'koi-8r',
172
        'mac' => 'macroman',
173
        'macintosh' => 'macroman',
174
        'euc-cn' => 'gb2312',
175
        'x-euc-cn' => 'gb2312',
176
        'euccn' => 'gb2312',
177
        'cp936' => 'gb2312',
178
        'big-5' => 'big5',
179
        'cp950' => 'big5',
180
        'eucjp' => 'euc-jp',
181
        'sjis' => 'shift_jis',
182
        'shift-jis' => 'shift_jis',
183
        'cp932' => 'shift_jis',
184
        'cp949' => 'euc-kr',
185
        'utf7' => 'utf-7',
186
        'utf8' => 'utf-8',
187
        'utf16' => 'utf-16',
188
        'utf32' => 'utf-32',
189
        'ucs2' => 'ucs-2',
190
        'ucs4' => 'ucs-4'
191
    ];
192
193
    /**
194
     * Normalize - trims the input character set name, changes it to lowercase letters and resolves known synonyms.
195
     *
196
     * @param string $charset Input charset
197
     * @return string Normalized charset
198
     */
199
    public function parse_charset($charset)
    {
        // Charset names are compared case-insensitively and without surrounding whitespace.
        // The cast keeps null from being handed to strtolower() untyped.
        $charset = trim(strtolower((string)$charset));
        // Replace a known alias (e.g. "latin1", "utf8") by the name used throughout this class.
        if (isset($this->synonyms[$charset])) {
            return $this->synonyms[$charset];
        }
        return $charset;
    }
207
208
    /********************************************
209
     *
210
     * Charset Conversion functions
211
     *
212
     ********************************************/
213
    /**
214
     * Convert from one charset to another charset.
215
     *
216
     * @param string $inputString Input string
217
     * @param string $fromCharset From charset (the current charset of the string)
218
     * @param string $toCharset To charset (the output charset wanted)
219
     * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
220
     * @return string Converted string
221
     * @see convArray()
222
     */
223
    public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
    {
        // Nothing to convert if source and target charset are the same.
        if ($fromCharset === $toCharset) {
            return $inputString;
        }
        // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
        if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
            try {
                // Before PHP 8.0 this returns FALSE for unsupported charsets ...
                $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
            } catch (\ValueError $e) {
                // ... since PHP 8.0 it throws instead. Handle both the same way, so that the
                // table based conversion below still gets its chance.
                $convertedString = false;
            }
            if ($convertedString !== false) {
                return $convertedString;
            }
        }
        // Table based conversion, always going through UTF-8 as the intermediate charset.
        if ($fromCharset !== 'utf-8') {
            $inputString = $this->utf8_encode($inputString, $fromCharset);
        }
        if ($toCharset !== 'utf-8') {
            $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
        }
        return $inputString;
    }
244
245
    /**
246
     * Convert all elements in ARRAY with type string from one charset to another charset.
247
     * NOTICE: Array is passed by reference!
248
     *
249
     * @param array $array Input array, possibly multidimensional
250
     * @param string $fromCharset From charset (the current charset of the string)
251
     * @param string $toCharset To charset (the output charset wanted)
252
     * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
253
     * @see conv()
254
     */
255
    public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
    {
        // Nothing to traverse if the caller did not hand in an array.
        if (!is_array($array)) {
            return;
        }
        foreach ($array as $key => $value) {
            if (is_array($value)) {
                // Recurse on the element itself (not on the loop copy) so that the
                // nested changes are written back into the referenced array.
                $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
            } elseif (is_string($value)) {
                $array[$key] = $this->conv($value, $fromCharset, $toCharset, $useEntityForNoChar);
            }
            // Any other type (int, float, bool, null, object) is left untouched.
        }
    }
265
266
    /**
267
     * Converts $str from $charset to UTF-8
268
     *
269
     * @param string $str String in local charset to convert to UTF-8
270
     * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
271
     * @return string Output string, converted to UTF-8
272
     */
273
    public function utf8_encode($str, $charset)
    {
        if ($charset === 'utf-8') {
            return $str;
        }
        // Parse conv. table if not already
        if (!$this->initCharset($charset)) {
            return '';
        }
        $isTwoByteSet = isset($this->twoByteSets[$charset]);
        $isEucBasedSet = isset($this->eucBasedSets[$charset]);
        // Replacement for local chars that have no entry in the conversion table
        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outStr = '';
        // Traverse each char in string
        for ($a = 0; $a < $strLen; $a++) {
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if ($isTwoByteSet) {
                // The charset has two bytes per char: fetch the second byte and advance the pointer.
                // substr() is used instead of a string offset so that a dangling last byte
                // (odd input length) does not read past the end of the string; the missing
                // byte then counts as 0.
                $a++;
                // Assume big endian
                $ord = $ord << 8 | ord((string)substr($str, $a, 1));
            } elseif ($ord > 127) {
                // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                // Shift-JIS: chars between 160 and 223 are single byte
                if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                    $a++;
                    $ord = $ord * 256 + ord((string)substr($str, $a, 1));
                }
            } else {
                // Values up to 127 are copied unchanged
                $outStr .= $chr;
                continue;
            }
            // If the local char-number was found in parsed conv. table then we use that,
            // otherwise the "no char" replacement
            if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
            } else {
                $outStr .= $noChar;
            }
        }
        return $outStr;
    }
325
326
    /**
327
     * Converts $str from UTF-8 to $charset
328
     *
329
     * @param string $str String in UTF-8 to convert to local charset
330
     * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
331
     * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
332
     * @return string Output string, converted to local charset
333
     */
334
    public function utf8_decode($str, $charset, $useEntityForNoChar = false)
    {
        if ($charset === 'utf-8') {
            return $str;
        }
        // Parse conv. table if not already
        if (!$this->initCharset($charset)) {
            return '';
        }
        // Replacement for everything that cannot be expressed in the target charset
        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outStr = '';
        // Traverse each char in UTF-8 string
        for ($a = 0; $a < $strLen; $a++) {
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if ($ord <= 127) {
                // Single byte char, copied unchanged
                $outStr .= $chr;
                continue;
            }
            // This means multibyte! Since the first byte must have the 7th bit set we check that.
            // Otherwise we might be in the middle of a byte sequence.
            if (!($ord & 64)) {
                $outStr .= $noChar;
                continue;
            }
            // Add first byte
            $buf = $chr;
            // Each further high bit of the first byte announces one more byte of the sequence
            for ($b = 0; $b < 8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) {
                    break;
                }
                $a++;
                $buf .= substr($str, $a, 1);
            }
            if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
                // The local number
                $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
                // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
                if ($mByte > 255) {
                    $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
                } else {
                    $outStr .= chr($mByte);
                }
            } elseif ($useEntityForNoChar) {
                // Create num entity:
                $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
            } else {
                $outStr .= $noChar;
            }
        }
        return $outStr;
    }
394
395
    /**
396
     * Converts all chars > 127 to numeric entities.
397
     *
398
     * @param string $str Input string
399
     * @return string Output string
400
     */
401
    public function utf8_to_entities($str)
    {
        // Replacement for bytes that cannot start a UTF-8 sequence
        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outStr = '';
        // Traverse each char in UTF-8 string.
        for ($a = 0; $a < $strLen; $a++) {
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if ($ord <= 127) {
                // Single byte char, copied unchanged
                $outStr .= $chr;
                continue;
            }
            // This means multibyte! Since the first byte must have the 7th bit set we check that.
            // Otherwise we might be in the middle of a byte sequence.
            if (!($ord & 64)) {
                $outStr .= $noChar;
                continue;
            }
            // Add first byte
            $buf = $chr;
            // Each further high bit of the first byte announces one more byte of the sequence
            for ($b = 0; $b < 8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) {
                    break;
                }
                $a++;
                $buf .= substr($str, $a, 1);
            }
            // utf8CharToUnumber() in hex mode yields "x" plus hex digits, resulting in "&#x...;"
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        }
        return $outStr;
    }
438
439
    /**
440
     * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
441
     * All string-HTML entities (like &amp; or &pound;) will be converted as well
442
     * @param string $str Input string, UTF-8
443
     * @return string Output string
444
     */
445
    public function entities_to_utf8($str)
    {
        // Lookup table "entity => UTF-8 char" for the named HTML entities
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
        $converted = preg_replace_callback(
            '/&([#[:alnum:]]*);/',
            function (array $match) use ($trans_tbl) {
                // Dec or hex entities. Only well-formed numbers are converted; something
                // like "&#abc;" or "&#;" is not a number and must not reach UnumberToChar().
                $number = [];
                if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $match[1], $number)) {
                    $codePoint = $number[1] !== '' ? hexdec($number[1]) : (int)$number[2];
                    return $this->UnumberToChar($codePoint);
                }
                // Other entities:
                if (isset($trans_tbl[$match[0]])) {
                    return $trans_tbl[$match[0]];
                }
                // No conversion:
                return $match[0];
            },
            $str
        );
        // preg_replace_callback() returns NULL on a PCRE error; hand back the input unchanged then.
        return $converted === null ? $str : $converted;
    }
476
477
    /**
478
     * Converts all chars in the input UTF-8 string into integer numbers returned in an array.
479
     * All HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
480
     * Also, instead of integer numbers the real UTF-8 char is returned.
481
     *
482
     * @param string $str Input string, UTF-8
483
     * @return array Output array with the char numbers
484
     */
485
    public function utf8_to_numberarray($str)
    {
        // Entities must be registered as well
        $str = $this->entities_to_utf8($str);

        // Replacement for bytes that cannot start a UTF-8 sequence
        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outArr = [];
        // Traverse each char in UTF-8 string.
        for ($a = 0; $a < $strLen; $a++) {
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if ($ord <= 127) {
                // Single byte char, taken over as is
                $outArr[] = $chr;
                continue;
            }
            // This means multibyte! Since the first byte must have the 7th bit set we check that.
            // Otherwise we might be in the middle of a byte sequence.
            if (!($ord & 64)) {
                $outArr[] = $noChar;
                continue;
            }
            // Add first byte
            $buf = $chr;
            // Each further high bit of the first byte announces one more byte of the sequence
            for ($b = 0; $b < 8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) {
                    break;
                }
                $a++;
                $buf .= substr($str, $a, 1);
            }
            // The complete multibyte char becomes one array element
            $outArr[] = $buf;
        }
        return $outArr;
    }
526
527
    /**
528
     * Converts a UNICODE number to a UTF-8 multibyte character
529
     * Algorithm based on script found at From: http://czyborra.com/utf/
530
     * Unit-tested by Kasper
531
     *
532
     * The binary representation of the character's integer value is thus simply spread across the bytes
533
     * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
534
     *
535
     * bytes | bits | representation
536
     *     1 |    7 | 0vvvvvvv
537
     *     2 |   11 | 110vvvvv 10vvvvvv
538
     *     3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
539
     *     4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
540
     *     5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
541
     *     6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
542
     *
543
     * @param int $unicodeInteger UNICODE integer
544
     * @return string UTF-8 multibyte character string
545
     * @see utf8CharToUnumber()
546
     */
547
    public function UnumberToChar($unicodeInteger)
    {
        // Callers also pass numeric strings (decimal entities) and floats (results of hexdec()).
        // Everything that is not a number, is negative or needs more than 31 bits cannot be
        // expressed as a UTF-8 sequence and yields the "no char" replacement.
        if (!is_numeric($unicodeInteger) || $unicodeInteger < 0 || $unicodeInteger >= 2147483648) {
            return chr($this->noCharByteVal);
        }
        $code = (int)$unicodeInteger;
        // 1 byte, 7 bits
        if ($code < 128) {
            return chr($code);
        }
        // 2 bytes, 11 bits
        if ($code < 2048) {
            return chr(192 | ($code >> 6))
                . chr(128 | ($code & 63));
        }
        // 3 bytes, 16 bits
        if ($code < 65536) {
            return chr(224 | ($code >> 12))
                . chr(128 | (($code >> 6) & 63))
                . chr(128 | ($code & 63));
        }
        // 4 bytes, 21 bits
        if ($code < 2097152) {
            return chr(240 | ($code >> 18))
                . chr(128 | (($code >> 12) & 63))
                . chr(128 | (($code >> 6) & 63))
                . chr(128 | ($code & 63));
        }
        // 5 bytes, 26 bits
        if ($code < 67108864) {
            return chr(248 | ($code >> 24))
                . chr(128 | (($code >> 18) & 63))
                . chr(128 | (($code >> 12) & 63))
                . chr(128 | (($code >> 6) & 63))
                . chr(128 | ($code & 63));
        }
        // 6 bytes, 31 bits
        return chr(252 | ($code >> 30))
            . chr(128 | (($code >> 24) & 63))
            . chr(128 | (($code >> 18) & 63))
            . chr(128 | (($code >> 12) & 63))
            . chr(128 | (($code >> 6) & 63))
            . chr(128 | ($code & 63));
    }
583
584
    /**
585
     * Converts a UTF-8 Multibyte character to a UNICODE number
586
     * Unit-tested by Kasper
587
     *
588
     * @param string $str UTF-8 multibyte character string
589
     * @param bool $hex If set, then a hex. number is returned.
590
     * @return int|string UNICODE integer, or the string "x" followed by the hex digits of that integer if $hex is set
591
     * @see UnumberToChar()
592
     */
593
    public function utf8CharToUnumber($str, $hex = false)
    {
        $str = (string)$str;
        // First char. An empty string has none; it counts as 0 instead of reading offset 0.
        $lead = $str === '' ? 0 : ord($str[0]);
        // This verifies that it IS a multi byte string
        if (($lead & 192) === 192) {
            // Every high bit following the first one announces one more byte in the sequence
            $trailingBytes = 0;
            for ($probe = $lead << 1; $probe & 128; $probe = $probe << 1) {
                $trailingBytes++;
            }
            // The lead byte carries (6 - trailing bytes) value bits. The invalid lead bytes
            // 0xFE / 0xFF carry none, which max() guards against.
            $leadBits = max(0, 6 - $trailingBytes);
            $int = $lead & ((1 << $leadBits) - 1);
            // Each following byte contributes its lower 6 bits; a missing byte counts as 0.
            for ($position = 1; $position <= $trailingBytes; $position++) {
                $int = ($int << 6) | (ord((string)substr($str, $position, 1)) & 63);
            }
        } else {
            $int = $lead;
        }
        // In hex mode a string like "x20ac" is returned, ready to be used in "&#x20ac;"
        return $hex ? 'x' . dechex($int) : $int;
    }
619
620
    /********************************************
621
     *
622
     * Init functions
623
     *
624
     ********************************************/
625
    /**
626
     * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
627
     * This function is automatically called by the conversion functions
628
     *
629
     * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
630
     *
631
     * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
632
     * @return int Returns '1' if already loaded, '2' if the charset conversion table was found and parsed.
633
     * @throws UnknownCharsetException if no charset table was found
634
     * @access private
635
     */
636
    public function initCharset($charset)
    {
        // Only process if the charset is not yet loaded. isset() comes first so that a charset
        // which was never requested before is not read as an undefined array key.
        if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
            return 1;
        }
        // Conversion table filename:
        $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
        // Without a conversion table the charset is unknown:
        if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
            throw new UnknownCharsetException(sprintf('Unknown charset "%s"', $charset), 1508916031);
        }
        // Cache file for charsets:
        // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
        $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
        if ($cacheFile && @is_file($cacheFile)) {
            // The cache only ever holds plain arrays, so no classes need to be instantiated.
            $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
            // Use the cache only if it has the expected structure; a truncated or otherwise
            // broken cache file falls through to parsing the conversion table again.
            if (is_array($cachedTable) && isset($cachedTable['local'], $cachedTable['utf8'])) {
                $this->parsedCharsets[$charset] = $cachedTable;
                return 2;
            }
        }
        // Parse conversion table into lines:
        $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
        // Holds the conv. table while it is built: local number => UTF-8 char, and the reverse
        $table = ['local' => [], 'utf8' => []];
        $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
        $detectedType = '';
        // traverse the lines:
        foreach ($lines as $value) {
            // Comment line or blanks are ignored.
            if (!trim($value) || $value[0] === '#') {
                continue;
            }
            // Detect type if not done yet: (Done on first real line)
            // The "whitespaced" type is on the syntax 	"0x0A	0x000A	#LINE FEED" 	while 	"ms-token" is like 		"B9 = U+00B9 : SUPERSCRIPT ONE"
            if (!$detectedType) {
                $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
            }
            if ($detectedType === 'ms-token') {
                $tokens = preg_split('/[=:]/', $value, 3);
                // A line without separator carries no mapping
                if (!is_array($tokens) || count($tokens) < 2) {
                    continue;
                }
                $hexbyte = $tokens[0];
                $utf8 = $tokens[1];
            } else {
                $regA = [];
                // A line not following the detected syntax carries no mapping
                if (!preg_match($whitespacedPattern, $value, $regA)) {
                    continue;
                }
                $hexbyte = $regA[1];
                $utf8 = 'U+' . $regA[2];
            }
            $decval = hexdec(trim($hexbyte));
            // Only values above 127 are stored in the table
            if ($decval > 127) {
                // Strip the leading "U+" and convert the code point to its UTF-8 char
                $utf8decval = hexdec(substr(trim($utf8), 2));
                $utf8Char = $this->UnumberToChar($utf8decval);
                $table['local'][$decval] = $utf8Char;
                $table['utf8'][$utf8Char] = $decval;
            }
        }
        $this->parsedCharsets[$charset] = $table;
        if ($cacheFile) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
        }
        return 2;
    }
692
693
    /**
     * Initializes the UTF-8 character data tables: the case folding table
     * ($this->caseFolding['utf-8']) and the to-ASCII transliteration table
     * ($this->toASCII['utf-8']).
     *
     * When the tables have to be parsed, both of them are always built together
     * from UnicodeData.txt, SpecialCasing.txt and Translit.txt and both are written
     * to their cache files. $mode only selects which table is checked for
     * "already loaded" / "cached version available" before parsing starts.
     *
     * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
     *
     * @param string|null $mode Mode ("case", "ascii", ...)
     * @return int|false FALSE if UnicodeData.txt is missing or cannot be opened, otherwise: 1 table already loaded, 2 cached version used, 3 tables parsed (and cached).
     * @access private
     */
    public function initUnicodeData($mode = null)
    {
        // Cache files
        $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
        $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
        // Only process if the tables are not yet loaded.
        // NOTE(review): the cache files are read with unserialize(); this presumes they are only
        // ever written by this class - confirm that typo3temp is not writable by untrusted parties.
        switch ($mode) {
            case 'case':
                if (is_array($this->caseFolding['utf-8'] ?? null)) {
                    return 1;
                }
                // Use cached version if possible
                if ($cacheFileCase && @is_file($cacheFileCase)) {
                    $this->caseFolding['utf-8'] = unserialize(file_get_contents($cacheFileCase));
                    return 2;
                }
                break;
            case 'ascii':
                if (is_array($this->toASCII['utf-8'] ?? null)) {
                    return 1;
                }
                // Use cached version if possible
                if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                    $this->toASCII['utf-8'] = unserialize(file_get_contents($cacheFileASCII));
                    return 2;
                }
                break;
        }
        // Process main Unicode data file
        $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
        if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
            return false;
        }
        $fh = fopen($unicodeDataFile, 'rb');
        if (!$fh) {
            return false;
        }
        // Converts a space separated list of hex code points into the matching UTF-8 string
        $sequenceToUtf8 = function ($hexCodes) {
            $chars = [];
            foreach (explode(' ', $hexCodes) as $hexCode) {
                $chars[] = $this->UnumberToChar((int)hexdec($hexCode));
            }
            return implode('', $chars);
        };
        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
        // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
        $this->caseFolding['utf-8'] = [];
        // a shorthand
        $utf8CaseFolding = &$this->caseFolding['utf-8'];
        $utf8CaseFolding['toUpper'] = [];
        $utf8CaseFolding['toLower'] = [];
        $utf8CaseFolding['toTitle'] = [];
        // Array of temp. decompositions
        $decomposition = [];
        // Array of chars that are marks (eg. composing accents)
        $mark = [];
        // Array of chars that are numbers (eg. digits)
        $number = [];
        // Array of chars to be omitted (eg. Russian hard sign)
        $omit = [];
        // fgets() returns FALSE at the end of the file; testing its result (instead of feof())
        // makes sure that no bogus "line" is processed after the last real one.
        while (($line = fgets($fh, 4096)) !== false) {
            $line = rtrim($line);
            if ($line === '') {
                continue;
            }
            // A line has a lot of info; pad it so that short lines cannot cause undefined offsets.
            // Used fields: 0 = code point, 1 = name, 2 = general category, 5 = decomposition,
            // 8 = numeric value, 12 = uppercase, 13 = lowercase, 14 = titlecase mapping
            $fields = array_pad(explode(';', $line), 15, '');
            $char = $fields[0];
            $name = $fields[1];
            $cat = $fields[2];
            $decomp = $fields[5];
            $num = $fields[8];
            $upper = $fields[12];
            $lower = $fields[13];
            $title = $fields[14];
            $ord = hexdec($char);
            if ($ord > 65535) {
                // Only process the BMP
                break;
            }
            $utf8_char = $this->UnumberToChar((int)$ord);
            if ($upper) {
                $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar((int)hexdec($upper));
            }
            if ($lower) {
                $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar((int)hexdec($lower));
            }
            // Store "title" only when different from "upper" (only a few)
            if ($title && $title !== $upper) {
                $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar((int)hexdec($title));
            }
            switch ($cat[0] ?? '') {
                case 'M':
                    // mark (accent, umlaut, ...)
                    $mark['U+' . $char] = 1;
                    break;
                case 'N':
                    // numeric value
                    if ($ord > 128 && $num !== '') {
                        $number['U+' . $char] = $num;
                    }
                    break;
            }
            // Accented Latin letters without "official" decomposition
            $match = [];
            if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
                $c = ord($match[2]);
                if ($match[1] === 'SMALL') {
                    // 32 is the distance between an ASCII capital letter and its small counterpart
                    $c += 32;
                }
                $decomposition['U+' . $char] = [dechex($c)];
                continue;
            }
            $match = [];
            if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
                switch ($match[1]) {
                    case '<circle>':
                        // add parenthesis as circle replacement, eg (1)
                        $match[2] = '0028 ' . $match[2] . ' 0029';
                        break;
                    case '<square>':
                        // add square brackets as square replacement, eg [1]
                        $match[2] = '005B ' . $match[2] . ' 005D';
                        break;
                    case '<compat>':
                        // ignore multi char decompositions that start with a space
                        if (preg_match('/^0020 /', $match[2])) {
                            continue 2;
                        }
                        break;
                    case '<initial>':
                    case '<medial>':
                    case '<final>':
                    case '<isolated>':
                    case '<vertical>':
                        // these decomposition types are not stored at all
                        continue 2;
                }
                $decomposition['U+' . $char] = explode(' ', $match[2]);
            }
        }
        fclose($fh);
        // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
        $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
        if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
            $fh = fopen($specialCasingFile, 'rb');
            if ($fh) {
                while (($line = fgets($fh, 4096)) !== false) {
                    // Comment lines and blank lines are ignored
                    if (trim($line) === '' || $line[0] === '#') {
                        continue;
                    }
                    list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, '');
                    // Only unconditional mappings are used: the fifth field is either empty or already the trailing comment
                    if ($cond === '' || $cond[0] === '#') {
                        $utf8_char = $this->UnumberToChar((int)hexdec($char));
                        if ($char !== $lower) {
                            $utf8CaseFolding['toLower'][$utf8_char] = $sequenceToUtf8($lower);
                        }
                        if ($char !== $title && $title !== $upper) {
                            $utf8CaseFolding['toTitle'][$utf8_char] = $sequenceToUtf8($title);
                        }
                        if ($char !== $upper) {
                            $utf8CaseFolding['toUpper'][$utf8_char] = $sequenceToUtf8($upper);
                        }
                    }
                }
                fclose($fh);
            }
        }
        // Process custom decompositions
        $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
        if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
            $fh = fopen($customTranslitFile, 'rb');
            if ($fh) {
                while (($line = fgets($fh, 4096)) !== false) {
                    // Comment lines and blank lines are ignored
                    if (trim($line) === '' || $line[0] === '#') {
                        continue;
                    }
                    list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        // No replacement given: the character is to be omitted
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
                fclose($fh);
            }
        }
        // Decompose and remove marks; inspired by unac (Loic Dachary <[email protected]>)
        foreach ($decomposition as $from => $to) {
            $code_decomp = [];
            // Note: the loop ends at the first empty value, which is what terminates it for an empty decomposition ([''])
            while ($code_value = array_shift($to)) {
                // Do recursive decomposition
                if (isset($decomposition['U+' . $code_value])) {
                    foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                        array_unshift($to, $cv);
                    }
                } elseif (!isset($mark['U+' . $code_value])) {
                    // Keep everything that is not a mark (marks are removed)
                    $code_decomp[] = $code_value;
                }
            }
            if (!empty($code_decomp) || isset($omit[$from])) {
                $decomposition[$from] = $code_decomp;
            } else {
                unset($decomposition[$from]);
            }
        }
        // Create ascii only mapping
        $this->toASCII['utf-8'] = [];
        $ascii = &$this->toASCII['utf-8'];
        foreach ($decomposition as $from => $to) {
            $code_decomp = [];
            while ($code_value = array_shift($to)) {
                $ord = hexdec($code_value);
                if ($ord > 127) {
                    // Skip decompositions containing non-ASCII chars
                    continue 2;
                }
                $code_decomp[] = chr($ord);
            }
            // Keys have the form "U+XXXX"; strip the prefix so that hexdec() only sees hex digits
            $ascii[$this->UnumberToChar((int)hexdec(substr($from, 2)))] = implode('', $code_decomp);
        }
        // Add numeric decompositions
        foreach ($number as $from => $to) {
            $utf8_char = $this->UnumberToChar((int)hexdec(substr($from, 2)));
            if (!isset($ascii[$utf8_char])) {
                $ascii[$utf8_char] = $to;
            }
        }
        if ($cacheFileCase) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
        }
        if ($cacheFileASCII) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
        }
        return 3;
    }
930
931
    /**
     * This function initializes the folding table for a charset other than UTF-8.
     * This function is automatically called by the case folding functions.
     *
     * The table is derived from the UTF-8 case folding table: every character of the
     * parsed conversion table of $charset is looked up there and the result is
     * converted back to $charset. The ASCII letters a-z / A-Z are added afterwards.
     *
     * @param string $charset Charset for which to initialize case folding.
     * @return int|false FALSE on error (charset or Unicode data could not be initialized), otherwise: 1 table already loaded, 2 cached version used, 3 table parsed (and cached).
     * @access private
     */
    public function initCaseFolding($charset)
    {
        // Only process if the case table is not yet loaded:
        if (is_array($this->caseFolding[$charset] ?? null)) {
            return 1;
        }
        // Use cached version if possible
        $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
        if ($cacheFile && @is_file($cacheFile)) {
            $this->caseFolding[$charset] = unserialize(file_get_contents($cacheFile));
            return 2;
        }
        // init UTF-8 conversion for this charset
        if (!$this->initCharset($charset)) {
            return false;
        }
        // UTF-8 case folding is used as the base conversion table
        if (!$this->initUnicodeData('case')) {
            return false;
        }
        $nochar = chr($this->noCharByteVal);
        foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
                // Most characters have no mapping for a given direction; skip them instead of reading an undefined index
                if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                    continue;
                }
                $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
                // Only keep mappings that result in something other than the "no character" placeholder
                if ($cc !== '' && $cc !== $nochar) {
                    $this->caseFolding[$charset][$direction][$c] = $cc;
                }
            }
        }
        // Add the ASCII case table (capital and small letters are 32 positions apart)
        $start = ord('a');
        $end = ord('z');
        for ($i = $start; $i <= $end; $i++) {
            $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
        }
        $start = ord('A');
        $end = ord('Z');
        for ($i = $start; $i <= $end; $i++) {
            $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
        }
        if ($cacheFile) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
        }
        return 3;
    }
992
993
    /**
     * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
     * This function is automatically called by the ASCII transliteration functions.
     *
     * The table is derived from the UTF-8 to-ASCII table: every character of the parsed
     * conversion table of $charset that has a UTF-8 transliteration is stored under its
     * representation in $charset.
     *
     * @param string $charset Charset for which to initialize conversion.
     * @return int|false FALSE on error (charset or Unicode data could not be initialized), otherwise: 1 table already loaded, 2 cached version used, 3 table parsed (and cached).
     * @access private
     */
    public function initToASCII($charset)
    {
        // Only process if the to-ASCII table is not yet loaded:
        if (is_array($this->toASCII[$charset] ?? null)) {
            return 1;
        }
        // Use cached version if possible
        $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
        if ($cacheFile && @is_file($cacheFile)) {
            $this->toASCII[$charset] = unserialize(file_get_contents($cacheFile));
            return 2;
        }
        // Init UTF-8 conversion for this charset
        if (!$this->initCharset($charset)) {
            return false;
        }
        // UTF-8/ASCII transliteration is used as the base conversion table
        if (!$this->initUnicodeData('ascii')) {
            return false;
        }
        // Make sure the table exists even if no character of the charset has a transliteration;
        // otherwise an undefined entry would be serialized into the cache file below.
        if (!is_array($this->toASCII[$charset] ?? null)) {
            $this->toASCII[$charset] = [];
        }
        foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
            if (isset($this->toASCII['utf-8'][$utf8])) {
                // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
                $c = $this->utf8_decode($utf8, $charset);
                $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
            }
        }
        if ($cacheFile) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
        }
        return 3;
    }
1033
1034
    /********************************************
1035
     *
1036
     * String operation functions
1037
     *
1038
     ********************************************/
1039
1040
    /**
     * Shortens a string to a number of characters and adds a crop signifier.
     *
     * A positive $len keeps the first $len characters and appends $crop, a negative
     * $len keeps the last abs($len) characters and prepends $crop. The string is
     * returned unchanged if $len is 0 or the string is not longer than abs($len).
     *
     * @param string $charset The character set
     * @param string $string Character string
     * @param int $len Length (in characters)
     * @param string $crop Crop signifier
     * @return string The shortened string
     * @see substr(), mb_strimwidth()
     */
    public function crop($charset, $string, $len, $crop = '')
    {
        // A length of zero means "do not crop at all"
        if ((int)$len === 0) {
            return $string;
        }
        $characterCount = mb_strlen($string, $charset);
        // Nothing to cut off
        if ($characterCount <= abs($len)) {
            return $string;
        }
        return $len > 0
            ? mb_substr($string, 0, $len, $charset) . $crop
            : $crop . mb_substr($string, $len, $characterCount, $charset);
    }
1063
1064
    /**
     * Character set aware counterpart of lcfirst() / ucfirst().
     *
     * Only the first character of the string is converted; the rest is returned unchanged.
     *
     * @param string $charset Character set of the string
     * @param string $string The string to convert
     * @param string $case 'toLower' lowercases the first character, any other value uppercases it
     * @return string The string with its first character converted
     */
    public function convCaseFirst($charset, $string, $case)
    {
        $head = mb_substr($string, 0, 1, $charset);
        $tail = mb_substr($string, 1, null, $charset);
        if ($case === 'toLower') {
            $head = mb_strtolower($head, $charset);
        } else {
            $head = mb_strtoupper($head, $charset);
        }
        return $head . $tail;
    }
1081
1082
    /**
     * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
     *
     * Dispatches to the mapping function matching the character set: UTF-8, one of the
     * sets registered in $this->eucBasedSets, or (for everything else) a single-byte encoding.
     *
     * @param string $charset Character set of string
     * @param string $string Input string to convert
     * @return string The converted string
     */
    public function specCharsToASCII($charset, $string)
    {
        if ($charset === 'utf-8') {
            return $this->utf8_char_mapping($string);
        }
        if (isset($this->eucBasedSets[$charset])) {
            return $this->euc_char_mapping($string, $charset);
        }
        // Treat everything else as single-byte encoding
        return $this->sb_char_mapping($string, $charset);
    }
1101
1102
    /********************************************
1103
     *
1104
     * Internal string operation functions
1105
     *
1106
     ********************************************/
1107
    /**
     * Maps all characters of a string in a single byte charset to their ASCII replacement.
     *
     * Every byte that has an entry in the to-ASCII table of the charset is replaced by that
     * entry; all other bytes are copied unchanged.
     *
     * @param string $str The string
     * @param string $charset The charset
     * @return string The converted string (the unchanged input if the to-ASCII table could not be initialized)
     */
    public function sb_char_mapping($str, $charset)
    {
        if (!$this->initToASCII($charset)) {
            // Do nothing
            return $str;
        }
        $map = &$this->toASCII[$charset];
        $converted = '';
        $position = 0;
        while (isset($str[$position])) {
            $byte = $str[$position];
            $converted .= $map[$byte] ?? $byte;
            $position++;
        }
        return $converted;
    }
1132
1133
    /********************************************
1134
     *
1135
     * Internal UTF-8 string operation functions
1136
     *
1137
     ********************************************/
1138
1139
    /**
     * Translates a character position into an 'absolute' byte position.
     * Unit tested by Kasper.
     *
     * For a positive position the string is scanned from the start, for a negative
     * position from the end. Only single-byte characters and multi-byte start bytes
     * are counted as characters; multi-byte continuation bytes are skipped.
     *
     * @param string $str UTF-8 string
     * @param int $pos Character position (negative values start from the end)
     * @return int|false Byte position, FALSE if the position lies outside of the string
     */
    public function utf8_char2byte_pos($str, $pos)
    {
        $byteLength = strlen($str);
        // Number of characters found
        $n = 0;
        // Number of characters wanted
        $p = abs($pos);
        if ($pos >= 0) {
            $i = 0;
            $d = 1;
        } else {
            $i = $byteLength - 1;
            $d = -1;
        }
        // The bounds are checked explicitly: isset($str[$i]) is also TRUE for negative
        // offsets (counted from the end of the string), which must not be scanned here.
        for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
            $c = ord($str[$i]);
            // Count single-byte chars (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
            if (!($c & 128) || ($c & 192) === 192) {
                $n++;
            }
        }
        if ($pos >= 0) {
            if ($i >= $byteLength) {
                // Offset beyond string length
                return false;
            }
            // Skip trailing multi-byte data bytes (10xxxxxx) without reading past the end of the string
            while ($i < $byteLength && (ord($str[$i]) & 192) === 128) {
                $i++;
            }
            return $i;
        }
        if ($n < $p) {
            // Offset before the start of the string
            return false;
        }
        // Correct offset: the loop stopped one byte before the start byte of the wanted character
        return $i + 1;
    }
1185
1186
    /**
     * Maps all characters of an UTF-8 string to their ASCII replacement.
     *
     * The string is split into single-byte characters and multi-byte sequences; every one
     * that has an entry in the UTF-8 to-ASCII table is replaced by that entry, all others
     * are copied unchanged. Continuation bytes that do not follow a start byte are dropped.
     *
     * @param string $str UTF-8 string
     * @return string The converted string (the unchanged input if the Unicode data could not be initialized)
     */
    public function utf8_char_mapping($str)
    {
        if (!$this->initUnicodeData('ascii')) {
            // Do nothing
            return $str;
        }
        $map = &$this->toASCII['utf-8'];
        $converted = '';
        $offset = 0;
        while (isset($str[$offset])) {
            $byte = ord($str[$offset]);
            $sequence = '';
            if (($byte & 128) === 0) {
                // single-byte (0xxxxxx)
                $sequence = $str[$offset];
            } elseif (($byte & 192) === 192) {
                // multi-byte starting byte (11xxxxxx): the number of leading 1-bits is the number of bytes
                $sequenceLength = 0;
                while ($byte & 128) {
                    $sequenceLength++;
                    $byte = $byte << 1;
                }
                $sequence = substr($str, $offset, $sequenceLength);
                // Jump to the last byte of the sequence; the increment below moves on to the next character
                $offset += $sequenceLength - 1;
            }
            $converted .= $map[$sequence] ?? $sequence;
            $offset++;
        }
        return $converted;
    }
1224
1225
    /********************************************
1226
     *
1227
     * Internal EUC string operation functions
1228
     *
1229
     * Extended Unix Code:
1230
     *  ASCII compatible 7bit single bytes chars
1231
     *  8bit two byte chars
1232
     *
1233
     * Shift-JIS is treated as a special case.
1234
     *
1235
     ********************************************/
1236
1237
    /**
     * Maps all characters of a string in the EUC charset family to their ASCII replacement.
     *
     * The string is split into single-byte and double-byte characters; every character that
     * has an entry in the to-ASCII table of the charset is replaced by that entry, all others
     * are copied unchanged.
     *
     * @param string $str EUC multibyte character string
     * @param string $charset The charset
     * @return string The converted string (the unchanged input if the to-ASCII table could not be initialized)
     */
    public function euc_char_mapping($str, $charset)
    {
        if (!$this->initToASCII($charset)) {
            // do nothing
            return $str;
        }
        $map = &$this->toASCII[$charset];
        $isShiftJis = $charset === 'shift_jis';
        $converted = '';
        $offset = 0;
        while (isset($str[$offset])) {
            $character = $str[$offset];
            $byte = ord($character);
            // Shift-JIS uses only part of the 8bit range as first byte of a double-byte char,
            // in all other charsets every 8bit byte starts a double-byte char
            $isDoubleByte = $isShiftJis
                ? ($byte >= 128 && $byte < 160) || $byte >= 224
                : $byte >= 128;
            if ($isDoubleByte) {
                $character = substr($str, $offset, 2);
                $offset++;
            }
            $converted .= $map[$character] ?? $character;
            $offset++;
        }
        return $converted;
    }
1276
}
1277