Completed
Push — master ( 7a68f8...d5d534 )
by Lars
03:42
created

UTF8::is_binary()   B

Complexity

Conditions 5
Paths 2

Size

Total Lines 17
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 5.0488

Importance

Changes 2
Bugs 0 Features 0
Metric Value
c 2
b 0
f 0
dl 0
loc 17
ccs 7
cts 8
cp 0.875
rs 8.8571
cc 5
eloc 11
nc 2
nop 1
crap 5.0488
1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  protected static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  protected static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
82
   * Bom => Byte-Length
83
   *
84
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
85
   *
86
   * @var array
87
   */
88
  protected static $bom = array(
89
      "\xef\xbb\xbf"     => 3, // UTF-8 BOM
90
      ''              => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...)
91
      "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM
92
      "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM
93
      "\xfe\xff"         => 2, // UTF-16 (BE) BOM
94
      'þÿ'               => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
95
      "\xff\xfe"         => 2, // UTF-16 (LE) BOM
96
      'ÿþ'               => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
97
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  protected static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    //HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  protected static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  protected static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  protected static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  protected static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  protected static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  protected static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  protected static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
   * Constructor.
   *
   * Creating an instance does nothing except invoke the class-level
   * self::checkForSupport() routine.
   */
  public function __construct()
  {
    self::checkForSupport();
  }
811
812
  /**
   * Fetch the single character located at a given offset, comparable to
   * $str[1] but delegated to self::substr() with a length of one.
   *
   * @param    string $str The string to read from (documented as UTF-8).
   * @param    int    $pos Offset of the wanted character.
   *
   * @return   string The result of self::substr() for one character at $pos.
   */
  public static function access($str, $pos)
  {
    $length = 1;

    return self::substr($str, $pos, $length);
  }
824
825
  /**
   * Make sure a string starts with the UTF-8 byte order mark.
   *
   * INFO: When self::string_has_bom() does not report false for the input,
   *       the input is handed back untouched; otherwise self::bom() is
   *       put in front of it.
   *
   * @param    string $str The input string
   *
   * @return   string The input string, prefixed with the BOM if it was missing
   */
  public static function add_bom_to_string($str)
  {
    // Guard: anything other than a strict false means "already has a BOM".
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    return self::bom() . $str;
  }
842
843
  /**
   * Convert a string of binary digits ("0" / "1") into the bytes it encodes.
   *
   * The bit string is read as one big-endian number: it is left-padded with
   * zeros to a whole number of octets and every octet becomes one byte of the
   * result. Decoding octet by octet (instead of running the whole input
   * through base_convert()) keeps long inputs exact and keeps bytes below
   * 0x10 as well as leading zero bytes intact.
   *
   * Characters other than "0" and "1" are skipped, which mirrors how
   * base_convert() treats invalid digits.
   *
   * @param mixed $bin 1|0 - the binary digits, e.g. "1100001" for "a"
   *
   * @return string The decoded bytes; an empty string if $bin holds no bits.
   */
  public static function binary_to_str($bin)
  {
    // Keep only real bits; preg_replace() yields null on a PCRE failure.
    $bits = preg_replace('/[^01]/', '', (string)$bin);
    if ($bits === null || $bits === '') {
      return '';
    }

    // Left-pad so the length is a multiple of 8 (value-preserving).
    $missing = (8 - (strlen($bits) % 8)) % 8;
    if ($missing > 0) {
      $bits = str_repeat('0', $missing) . $bits;
    }

    $str = '';
    foreach (str_split($bits, 8) as $octet) {
      $str .= chr(bindec($octet));
    }

    return $str;
  }
854
855
  /**
   * Provide the UTF-8 Byte Order Mark.
   *
   * @return string The three bytes 0xEF 0xBB 0xBF
   */
  public static function bom()
  {
    $utf8Bom = "\xEF\xBB\xBF";

    return $utf8Bom;
  }
864
865
  /**
   * Forward a callback and a string to UTF8::chr_map() and hand back its result.
   *
   * @alias of UTF8::chr_map()
   * @see   UTF8::chr_map()
   *
   * @param string|array $callback The callback passed through to chr_map()
   * @param string       $str      The string passed through to chr_map()
   *
   * @return array Whatever UTF8::chr_map() returns for these arguments
   */
  public static function callback($callback, $str)
  {
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
880
   * Returns an array of all lower and upper case UTF-8 encoded characters.
881
   *
882
   * @return   string An array with lower case chars as keys and upper chars as values.
883
   */
884
  protected static function case_table()
885
  {
886
    static $case = array(
887
888
      // lower => upper
889
      "\xf0\x90\x91\x8f" => "\xf0\x90\x90\xa7",
890
      "\xf0\x90\x91\x8e" => "\xf0\x90\x90\xa6",
891
      "\xf0\x90\x91\x8d" => "\xf0\x90\x90\xa5",
892
      "\xf0\x90\x91\x8c" => "\xf0\x90\x90\xa4",
893
      "\xf0\x90\x91\x8b" => "\xf0\x90\x90\xa3",
894
      "\xf0\x90\x91\x8a" => "\xf0\x90\x90\xa2",
895
      "\xf0\x90\x91\x89" => "\xf0\x90\x90\xa1",
896
      "\xf0\x90\x91\x88" => "\xf0\x90\x90\xa0",
897
      "\xf0\x90\x91\x87" => "\xf0\x90\x90\x9f",
898
      "\xf0\x90\x91\x86" => "\xf0\x90\x90\x9e",
899
      "\xf0\x90\x91\x85" => "\xf0\x90\x90\x9d",
900
      "\xf0\x90\x91\x84" => "\xf0\x90\x90\x9c",
901
      "\xf0\x90\x91\x83" => "\xf0\x90\x90\x9b",
902
      "\xf0\x90\x91\x82" => "\xf0\x90\x90\x9a",
903
      "\xf0\x90\x91\x81" => "\xf0\x90\x90\x99",
904
      "\xf0\x90\x91\x80" => "\xf0\x90\x90\x98",
905
      "\xf0\x90\x90\xbf" => "\xf0\x90\x90\x97",
906
      "\xf0\x90\x90\xbe" => "\xf0\x90\x90\x96",
907
      "\xf0\x90\x90\xbd" => "\xf0\x90\x90\x95",
908
      "\xf0\x90\x90\xbc" => "\xf0\x90\x90\x94",
909
      "\xf0\x90\x90\xbb" => "\xf0\x90\x90\x93",
910
      "\xf0\x90\x90\xba" => "\xf0\x90\x90\x92",
911
      "\xf0\x90\x90\xb9" => "\xf0\x90\x90\x91",
912
      "\xf0\x90\x90\xb8" => "\xf0\x90\x90\x90",
913
      "\xf0\x90\x90\xb7" => "\xf0\x90\x90\x8f",
914
      "\xf0\x90\x90\xb6" => "\xf0\x90\x90\x8e",
915
      "\xf0\x90\x90\xb5" => "\xf0\x90\x90\x8d",
916
      "\xf0\x90\x90\xb4" => "\xf0\x90\x90\x8c",
917
      "\xf0\x90\x90\xb3" => "\xf0\x90\x90\x8b",
918
      "\xf0\x90\x90\xb2" => "\xf0\x90\x90\x8a",
919
      "\xf0\x90\x90\xb1" => "\xf0\x90\x90\x89",
920
      "\xf0\x90\x90\xb0" => "\xf0\x90\x90\x88",
921
      "\xf0\x90\x90\xaf" => "\xf0\x90\x90\x87",
922
      "\xf0\x90\x90\xae" => "\xf0\x90\x90\x86",
923
      "\xf0\x90\x90\xad" => "\xf0\x90\x90\x85",
924
      "\xf0\x90\x90\xac" => "\xf0\x90\x90\x84",
925
      "\xf0\x90\x90\xab" => "\xf0\x90\x90\x83",
926
      "\xf0\x90\x90\xaa" => "\xf0\x90\x90\x82",
927
      "\xf0\x90\x90\xa9" => "\xf0\x90\x90\x81",
928
      "\xf0\x90\x90\xa8" => "\xf0\x90\x90\x80",
929
      "\xef\xbd\x9a"     => "\xef\xbc\xba",
930
      "\xef\xbd\x99"     => "\xef\xbc\xb9",
931
      "\xef\xbd\x98"     => "\xef\xbc\xb8",
932
      "\xef\xbd\x97"     => "\xef\xbc\xb7",
933
      "\xef\xbd\x96"     => "\xef\xbc\xb6",
934
      "\xef\xbd\x95"     => "\xef\xbc\xb5",
935
      "\xef\xbd\x94"     => "\xef\xbc\xb4",
936
      "\xef\xbd\x93"     => "\xef\xbc\xb3",
937
      "\xef\xbd\x92"     => "\xef\xbc\xb2",
938
      "\xef\xbd\x91"     => "\xef\xbc\xb1",
939
      "\xef\xbd\x90"     => "\xef\xbc\xb0",
940
      "\xef\xbd\x8f"     => "\xef\xbc\xaf",
941
      "\xef\xbd\x8e"     => "\xef\xbc\xae",
942
      "\xef\xbd\x8d"     => "\xef\xbc\xad",
943
      "\xef\xbd\x8c"     => "\xef\xbc\xac",
944
      "\xef\xbd\x8b"     => "\xef\xbc\xab",
945
      "\xef\xbd\x8a"     => "\xef\xbc\xaa",
946
      "\xef\xbd\x89"     => "\xef\xbc\xa9",
947
      "\xef\xbd\x88"     => "\xef\xbc\xa8",
948
      "\xef\xbd\x87"     => "\xef\xbc\xa7",
949
      "\xef\xbd\x86"     => "\xef\xbc\xa6",
950
      "\xef\xbd\x85"     => "\xef\xbc\xa5",
951
      "\xef\xbd\x84"     => "\xef\xbc\xa4",
952
      "\xef\xbd\x83"     => "\xef\xbc\xa3",
953
      "\xef\xbd\x82"     => "\xef\xbc\xa2",
954
      "\xef\xbd\x81"     => "\xef\xbc\xa1",
955
      "\xea\x9e\x8c"     => "\xea\x9e\x8b",
956
      "\xea\x9e\x87"     => "\xea\x9e\x86",
957
      "\xea\x9e\x85"     => "\xea\x9e\x84",
958
      "\xea\x9e\x83"     => "\xea\x9e\x82",
959
      "\xea\x9e\x81"     => "\xea\x9e\x80",
960
      "\xea\x9d\xbf"     => "\xea\x9d\xbe",
961
      "\xea\x9d\xbc"     => "\xea\x9d\xbb",
962
      "\xea\x9d\xba"     => "\xea\x9d\xb9",
963
      "\xea\x9d\xaf"     => "\xea\x9d\xae",
964
      "\xea\x9d\xad"     => "\xea\x9d\xac",
965
      "\xea\x9d\xab"     => "\xea\x9d\xaa",
966
      "\xea\x9d\xa9"     => "\xea\x9d\xa8",
967
      "\xea\x9d\xa7"     => "\xea\x9d\xa6",
968
      "\xea\x9d\xa5"     => "\xea\x9d\xa4",
969
      "\xea\x9d\xa3"     => "\xea\x9d\xa2",
970
      "\xea\x9d\xa1"     => "\xea\x9d\xa0",
971
      "\xea\x9d\x9f"     => "\xea\x9d\x9e",
972
      "\xea\x9d\x9d"     => "\xea\x9d\x9c",
973
      "\xea\x9d\x9b"     => "\xea\x9d\x9a",
974
      "\xea\x9d\x99"     => "\xea\x9d\x98",
975
      "\xea\x9d\x97"     => "\xea\x9d\x96",
976
      "\xea\x9d\x95"     => "\xea\x9d\x94",
977
      "\xea\x9d\x93"     => "\xea\x9d\x92",
978
      "\xea\x9d\x91"     => "\xea\x9d\x90",
979
      "\xea\x9d\x8f"     => "\xea\x9d\x8e",
980
      "\xea\x9d\x8d"     => "\xea\x9d\x8c",
981
      "\xea\x9d\x8b"     => "\xea\x9d\x8a",
982
      "\xea\x9d\x89"     => "\xea\x9d\x88",
983
      "\xea\x9d\x87"     => "\xea\x9d\x86",
984
      "\xea\x9d\x85"     => "\xea\x9d\x84",
985
      "\xea\x9d\x83"     => "\xea\x9d\x82",
986
      "\xea\x9d\x81"     => "\xea\x9d\x80",
987
      "\xea\x9c\xbf"     => "\xea\x9c\xbe",
988
      "\xea\x9c\xbd"     => "\xea\x9c\xbc",
989
      "\xea\x9c\xbb"     => "\xea\x9c\xba",
990
      "\xea\x9c\xb9"     => "\xea\x9c\xb8",
991
      "\xea\x9c\xb7"     => "\xea\x9c\xb6",
992
      "\xea\x9c\xb5"     => "\xea\x9c\xb4",
993
      "\xea\x9c\xb3"     => "\xea\x9c\xb2",
994
      "\xea\x9c\xaf"     => "\xea\x9c\xae",
995
      "\xea\x9c\xad"     => "\xea\x9c\xac",
996
      "\xea\x9c\xab"     => "\xea\x9c\xaa",
997
      "\xea\x9c\xa9"     => "\xea\x9c\xa8",
998
      "\xea\x9c\xa7"     => "\xea\x9c\xa6",
999
      "\xea\x9c\xa5"     => "\xea\x9c\xa4",
1000
      "\xea\x9c\xa3"     => "\xea\x9c\xa2",
1001
      "\xea\x9a\x97"     => "\xea\x9a\x96",
1002
      "\xea\x9a\x95"     => "\xea\x9a\x94",
1003
      "\xea\x9a\x93"     => "\xea\x9a\x92",
1004
      "\xea\x9a\x91"     => "\xea\x9a\x90",
1005
      "\xea\x9a\x8f"     => "\xea\x9a\x8e",
1006
      "\xea\x9a\x8d"     => "\xea\x9a\x8c",
1007
      "\xea\x9a\x8b"     => "\xea\x9a\x8a",
1008
      "\xea\x9a\x89"     => "\xea\x9a\x88",
1009
      "\xea\x9a\x87"     => "\xea\x9a\x86",
1010
      "\xea\x9a\x85"     => "\xea\x9a\x84",
1011
      "\xea\x9a\x83"     => "\xea\x9a\x82",
1012
      "\xea\x9a\x81"     => "\xea\x9a\x80",
1013
      "\xea\x99\xad"     => "\xea\x99\xac",
1014
      "\xea\x99\xab"     => "\xea\x99\xaa",
1015
      "\xea\x99\xa9"     => "\xea\x99\xa8",
1016
      "\xea\x99\xa7"     => "\xea\x99\xa6",
1017
      "\xea\x99\xa5"     => "\xea\x99\xa4",
1018
      "\xea\x99\xa3"     => "\xea\x99\xa2",
1019
      "\xea\x99\x9f"     => "\xea\x99\x9e",
1020
      "\xea\x99\x9d"     => "\xea\x99\x9c",
1021
      "\xea\x99\x9b"     => "\xea\x99\x9a",
1022
      "\xea\x99\x99"     => "\xea\x99\x98",
1023
      "\xea\x99\x97"     => "\xea\x99\x96",
1024
      "\xea\x99\x95"     => "\xea\x99\x94",
1025
      "\xea\x99\x93"     => "\xea\x99\x92",
1026
      "\xea\x99\x91"     => "\xea\x99\x90",
1027
      "\xea\x99\x8f"     => "\xea\x99\x8e",
1028
      "\xea\x99\x8d"     => "\xea\x99\x8c",
1029
      "\xea\x99\x8b"     => "\xea\x99\x8a",
1030
      "\xea\x99\x89"     => "\xea\x99\x88",
1031
      "\xea\x99\x87"     => "\xea\x99\x86",
1032
      "\xea\x99\x85"     => "\xea\x99\x84",
1033
      "\xea\x99\x83"     => "\xea\x99\x82",
1034
      "\xea\x99\x81"     => "\xea\x99\x80",
1035
      "\xe2\xb4\xa5"     => "\xe1\x83\x85",
1036
      "\xe2\xb4\xa4"     => "\xe1\x83\x84",
1037
      "\xe2\xb4\xa3"     => "\xe1\x83\x83",
1038
      "\xe2\xb4\xa2"     => "\xe1\x83\x82",
1039
      "\xe2\xb4\xa1"     => "\xe1\x83\x81",
1040
      "\xe2\xb4\xa0"     => "\xe1\x83\x80",
1041
      "\xe2\xb4\x9f"     => "\xe1\x82\xbf",
1042
      "\xe2\xb4\x9e"     => "\xe1\x82\xbe",
1043
      "\xe2\xb4\x9d"     => "\xe1\x82\xbd",
1044
      "\xe2\xb4\x9c"     => "\xe1\x82\xbc",
1045
      "\xe2\xb4\x9b"     => "\xe1\x82\xbb",
1046
      "\xe2\xb4\x9a"     => "\xe1\x82\xba",
1047
      "\xe2\xb4\x99"     => "\xe1\x82\xb9",
1048
      "\xe2\xb4\x98"     => "\xe1\x82\xb8",
1049
      "\xe2\xb4\x97"     => "\xe1\x82\xb7",
1050
      "\xe2\xb4\x96"     => "\xe1\x82\xb6",
1051
      "\xe2\xb4\x95"     => "\xe1\x82\xb5",
1052
      "\xe2\xb4\x94"     => "\xe1\x82\xb4",
1053
      "\xe2\xb4\x93"     => "\xe1\x82\xb3",
1054
      "\xe2\xb4\x92"     => "\xe1\x82\xb2",
1055
      "\xe2\xb4\x91"     => "\xe1\x82\xb1",
1056
      "\xe2\xb4\x90"     => "\xe1\x82\xb0",
1057
      "\xe2\xb4\x8f"     => "\xe1\x82\xaf",
1058
      "\xe2\xb4\x8e"     => "\xe1\x82\xae",
1059
      "\xe2\xb4\x8d"     => "\xe1\x82\xad",
1060
      "\xe2\xb4\x8c"     => "\xe1\x82\xac",
1061
      "\xe2\xb4\x8b"     => "\xe1\x82\xab",
1062
      "\xe2\xb4\x8a"     => "\xe1\x82\xaa",
1063
      "\xe2\xb4\x89"     => "\xe1\x82\xa9",
1064
      "\xe2\xb4\x88"     => "\xe1\x82\xa8",
1065
      "\xe2\xb4\x87"     => "\xe1\x82\xa7",
1066
      "\xe2\xb4\x86"     => "\xe1\x82\xa6",
1067
      "\xe2\xb4\x85"     => "\xe1\x82\xa5",
1068
      "\xe2\xb4\x84"     => "\xe1\x82\xa4",
1069
      "\xe2\xb4\x83"     => "\xe1\x82\xa3",
1070
      "\xe2\xb4\x82"     => "\xe1\x82\xa2",
1071
      "\xe2\xb4\x81"     => "\xe1\x82\xa1",
1072
      "\xe2\xb4\x80"     => "\xe1\x82\xa0",
1073
      "\xe2\xb3\xae"     => "\xe2\xb3\xad",
1074
      "\xe2\xb3\xac"     => "\xe2\xb3\xab",
1075
      "\xe2\xb3\xa3"     => "\xe2\xb3\xa2",
1076
      "\xe2\xb3\xa1"     => "\xe2\xb3\xa0",
1077
      "\xe2\xb3\x9f"     => "\xe2\xb3\x9e",
1078
      "\xe2\xb3\x9d"     => "\xe2\xb3\x9c",
1079
      "\xe2\xb3\x9b"     => "\xe2\xb3\x9a",
1080
      "\xe2\xb3\x99"     => "\xe2\xb3\x98",
1081
      "\xe2\xb3\x97"     => "\xe2\xb3\x96",
1082
      "\xe2\xb3\x95"     => "\xe2\xb3\x94",
1083
      "\xe2\xb3\x93"     => "\xe2\xb3\x92",
1084
      "\xe2\xb3\x91"     => "\xe2\xb3\x90",
1085
      "\xe2\xb3\x8f"     => "\xe2\xb3\x8e",
1086
      "\xe2\xb3\x8d"     => "\xe2\xb3\x8c",
1087
      "\xe2\xb3\x8b"     => "\xe2\xb3\x8a",
1088
      "\xe2\xb3\x89"     => "\xe2\xb3\x88",
1089
      "\xe2\xb3\x87"     => "\xe2\xb3\x86",
1090
      "\xe2\xb3\x85"     => "\xe2\xb3\x84",
1091
      "\xe2\xb3\x83"     => "\xe2\xb3\x82",
1092
      "\xe2\xb3\x81"     => "\xe2\xb3\x80",
1093
      "\xe2\xb2\xbf"     => "\xe2\xb2\xbe",
1094
      "\xe2\xb2\xbd"     => "\xe2\xb2\xbc",
1095
      "\xe2\xb2\xbb"     => "\xe2\xb2\xba",
1096
      "\xe2\xb2\xb9"     => "\xe2\xb2\xb8",
1097
      "\xe2\xb2\xb7"     => "\xe2\xb2\xb6",
1098
      "\xe2\xb2\xb5"     => "\xe2\xb2\xb4",
1099
      "\xe2\xb2\xb3"     => "\xe2\xb2\xb2",
1100
      "\xe2\xb2\xb1"     => "\xe2\xb2\xb0",
1101
      "\xe2\xb2\xaf"     => "\xe2\xb2\xae",
1102
      "\xe2\xb2\xad"     => "\xe2\xb2\xac",
1103
      "\xe2\xb2\xab"     => "\xe2\xb2\xaa",
1104
      "\xe2\xb2\xa9"     => "\xe2\xb2\xa8",
1105
      "\xe2\xb2\xa7"     => "\xe2\xb2\xa6",
1106
      "\xe2\xb2\xa5"     => "\xe2\xb2\xa4",
1107
      "\xe2\xb2\xa3"     => "\xe2\xb2\xa2",
1108
      "\xe2\xb2\xa1"     => "\xe2\xb2\xa0",
1109
      "\xe2\xb2\x9f"     => "\xe2\xb2\x9e",
1110
      "\xe2\xb2\x9d"     => "\xe2\xb2\x9c",
1111
      "\xe2\xb2\x9b"     => "\xe2\xb2\x9a",
1112
      "\xe2\xb2\x99"     => "\xe2\xb2\x98",
1113
      "\xe2\xb2\x97"     => "\xe2\xb2\x96",
1114
      "\xe2\xb2\x95"     => "\xe2\xb2\x94",
1115
      "\xe2\xb2\x93"     => "\xe2\xb2\x92",
1116
      "\xe2\xb2\x91"     => "\xe2\xb2\x90",
1117
      "\xe2\xb2\x8f"     => "\xe2\xb2\x8e",
1118
      "\xe2\xb2\x8d"     => "\xe2\xb2\x8c",
1119
      "\xe2\xb2\x8b"     => "\xe2\xb2\x8a",
1120
      "\xe2\xb2\x89"     => "\xe2\xb2\x88",
1121
      "\xe2\xb2\x87"     => "\xe2\xb2\x86",
1122
      "\xe2\xb2\x85"     => "\xe2\xb2\x84",
1123
      "\xe2\xb2\x83"     => "\xe2\xb2\x82",
1124
      "\xe2\xb2\x81"     => "\xe2\xb2\x80",
1125
      "\xe2\xb1\xb6"     => "\xe2\xb1\xb5",
1126
      "\xe2\xb1\xb3"     => "\xe2\xb1\xb2",
1127
      "\xe2\xb1\xac"     => "\xe2\xb1\xab",
1128
      "\xe2\xb1\xaa"     => "\xe2\xb1\xa9",
1129
      "\xe2\xb1\xa8"     => "\xe2\xb1\xa7",
1130
      "\xe2\xb1\xa6"     => "\xc8\xbe",
1131
      "\xe2\xb1\xa5"     => "\xc8\xba",
1132
      "\xe2\xb1\xa1"     => "\xe2\xb1\xa0",
1133
      "\xe2\xb1\x9e"     => "\xe2\xb0\xae",
1134
      "\xe2\xb1\x9d"     => "\xe2\xb0\xad",
1135
      "\xe2\xb1\x9c"     => "\xe2\xb0\xac",
1136
      "\xe2\xb1\x9b"     => "\xe2\xb0\xab",
1137
      "\xe2\xb1\x9a"     => "\xe2\xb0\xaa",
1138
      "\xe2\xb1\x99"     => "\xe2\xb0\xa9",
1139
      "\xe2\xb1\x98"     => "\xe2\xb0\xa8",
1140
      "\xe2\xb1\x97"     => "\xe2\xb0\xa7",
1141
      "\xe2\xb1\x96"     => "\xe2\xb0\xa6",
1142
      "\xe2\xb1\x95"     => "\xe2\xb0\xa5",
1143
      "\xe2\xb1\x94"     => "\xe2\xb0\xa4",
1144
      "\xe2\xb1\x93"     => "\xe2\xb0\xa3",
1145
      "\xe2\xb1\x92"     => "\xe2\xb0\xa2",
1146
      "\xe2\xb1\x91"     => "\xe2\xb0\xa1",
1147
      "\xe2\xb1\x90"     => "\xe2\xb0\xa0",
1148
      "\xe2\xb1\x8f"     => "\xe2\xb0\x9f",
1149
      "\xe2\xb1\x8e"     => "\xe2\xb0\x9e",
1150
      "\xe2\xb1\x8d"     => "\xe2\xb0\x9d",
1151
      "\xe2\xb1\x8c"     => "\xe2\xb0\x9c",
1152
      "\xe2\xb1\x8b"     => "\xe2\xb0\x9b",
1153
      "\xe2\xb1\x8a"     => "\xe2\xb0\x9a",
1154
      "\xe2\xb1\x89"     => "\xe2\xb0\x99",
1155
      "\xe2\xb1\x88"     => "\xe2\xb0\x98",
1156
      "\xe2\xb1\x87"     => "\xe2\xb0\x97",
1157
      "\xe2\xb1\x86"     => "\xe2\xb0\x96",
1158
      "\xe2\xb1\x85"     => "\xe2\xb0\x95",
1159
      "\xe2\xb1\x84"     => "\xe2\xb0\x94",
1160
      "\xe2\xb1\x83"     => "\xe2\xb0\x93",
1161
      "\xe2\xb1\x82"     => "\xe2\xb0\x92",
1162
      "\xe2\xb1\x81"     => "\xe2\xb0\x91",
1163
      "\xe2\xb1\x80"     => "\xe2\xb0\x90",
1164
      "\xe2\xb0\xbf"     => "\xe2\xb0\x8f",
1165
      "\xe2\xb0\xbe"     => "\xe2\xb0\x8e",
1166
      "\xe2\xb0\xbd"     => "\xe2\xb0\x8d",
1167
      "\xe2\xb0\xbc"     => "\xe2\xb0\x8c",
1168
      "\xe2\xb0\xbb"     => "\xe2\xb0\x8b",
1169
      "\xe2\xb0\xba"     => "\xe2\xb0\x8a",
1170
      "\xe2\xb0\xb9"     => "\xe2\xb0\x89",
1171
      "\xe2\xb0\xb8"     => "\xe2\xb0\x88",
1172
      "\xe2\xb0\xb7"     => "\xe2\xb0\x87",
1173
      "\xe2\xb0\xb6"     => "\xe2\xb0\x86",
1174
      "\xe2\xb0\xb5"     => "\xe2\xb0\x85",
1175
      "\xe2\xb0\xb4"     => "\xe2\xb0\x84",
1176
      "\xe2\xb0\xb3"     => "\xe2\xb0\x83",
1177
      "\xe2\xb0\xb2"     => "\xe2\xb0\x82",
1178
      "\xe2\xb0\xb1"     => "\xe2\xb0\x81",
1179
      "\xe2\xb0\xb0"     => "\xe2\xb0\x80",
1180
      "\xe2\x86\x84"     => "\xe2\x86\x83",
1181
      "\xe2\x85\x8e"     => "\xe2\x84\xb2",
1182
      "\xe1\xbf\xb3"     => "\xe1\xbf\xbc",
1183
      "\xe1\xbf\xa5"     => "\xe1\xbf\xac",
1184
      "\xe1\xbf\xa1"     => "\xe1\xbf\xa9",
1185
      "\xe1\xbf\xa0"     => "\xe1\xbf\xa8",
1186
      "\xe1\xbf\x91"     => "\xe1\xbf\x99",
1187
      "\xe1\xbf\x90"     => "\xe1\xbf\x98",
1188
      "\xe1\xbf\x83"     => "\xe1\xbf\x8c",
1189
      "\xe1\xbe\xbe"     => "\xce\x99",
1190
      "\xe1\xbe\xb3"     => "\xe1\xbe\xbc",
1191
      "\xe1\xbe\xb1"     => "\xe1\xbe\xb9",
1192
      "\xe1\xbe\xb0"     => "\xe1\xbe\xb8",
1193
      "\xe1\xbe\xa7"     => "\xe1\xbe\xaf",
1194
      "\xe1\xbe\xa6"     => "\xe1\xbe\xae",
1195
      "\xe1\xbe\xa5"     => "\xe1\xbe\xad",
1196
      "\xe1\xbe\xa4"     => "\xe1\xbe\xac",
1197
      "\xe1\xbe\xa3"     => "\xe1\xbe\xab",
1198
      "\xe1\xbe\xa2"     => "\xe1\xbe\xaa",
1199
      "\xe1\xbe\xa1"     => "\xe1\xbe\xa9",
1200
      "\xe1\xbe\xa0"     => "\xe1\xbe\xa8",
1201
      "\xe1\xbe\x97"     => "\xe1\xbe\x9f",
1202
      "\xe1\xbe\x96"     => "\xe1\xbe\x9e",
1203
      "\xe1\xbe\x95"     => "\xe1\xbe\x9d",
1204
      "\xe1\xbe\x94"     => "\xe1\xbe\x9c",
1205
      "\xe1\xbe\x93"     => "\xe1\xbe\x9b",
1206
      "\xe1\xbe\x92"     => "\xe1\xbe\x9a",
1207
      "\xe1\xbe\x91"     => "\xe1\xbe\x99",
1208
      "\xe1\xbe\x90"     => "\xe1\xbe\x98",
1209
      "\xe1\xbe\x87"     => "\xe1\xbe\x8f",
1210
      "\xe1\xbe\x86"     => "\xe1\xbe\x8e",
1211
      "\xe1\xbe\x85"     => "\xe1\xbe\x8d",
1212
      "\xe1\xbe\x84"     => "\xe1\xbe\x8c",
1213
      "\xe1\xbe\x83"     => "\xe1\xbe\x8b",
1214
      "\xe1\xbe\x82"     => "\xe1\xbe\x8a",
1215
      "\xe1\xbe\x81"     => "\xe1\xbe\x89",
1216
      "\xe1\xbe\x80"     => "\xe1\xbe\x88",
1217
      "\xe1\xbd\xbd"     => "\xe1\xbf\xbb",
1218
      "\xe1\xbd\xbc"     => "\xe1\xbf\xba",
1219
      "\xe1\xbd\xbb"     => "\xe1\xbf\xab",
1220
      "\xe1\xbd\xba"     => "\xe1\xbf\xaa",
1221
      "\xe1\xbd\xb9"     => "\xe1\xbf\xb9",
1222
      "\xe1\xbd\xb8"     => "\xe1\xbf\xb8",
1223
      "\xe1\xbd\xb7"     => "\xe1\xbf\x9b",
1224
      "\xe1\xbd\xb6"     => "\xe1\xbf\x9a",
1225
      "\xe1\xbd\xb5"     => "\xe1\xbf\x8b",
1226
      "\xe1\xbd\xb4"     => "\xe1\xbf\x8a",
1227
      "\xe1\xbd\xb3"     => "\xe1\xbf\x89",
1228
      "\xe1\xbd\xb2"     => "\xe1\xbf\x88",
1229
      "\xe1\xbd\xb1"     => "\xe1\xbe\xbb",
1230
      "\xe1\xbd\xb0"     => "\xe1\xbe\xba",
1231
      "\xe1\xbd\xa7"     => "\xe1\xbd\xaf",
1232
      "\xe1\xbd\xa6"     => "\xe1\xbd\xae",
1233
      "\xe1\xbd\xa5"     => "\xe1\xbd\xad",
1234
      "\xe1\xbd\xa4"     => "\xe1\xbd\xac",
1235
      "\xe1\xbd\xa3"     => "\xe1\xbd\xab",
1236
      "\xe1\xbd\xa2"     => "\xe1\xbd\xaa",
1237
      "\xe1\xbd\xa1"     => "\xe1\xbd\xa9",
1238
      "\xe1\xbd\xa0"     => "\xe1\xbd\xa8",
1239
      "\xe1\xbd\x97"     => "\xe1\xbd\x9f",
1240
      "\xe1\xbd\x95"     => "\xe1\xbd\x9d",
1241
      "\xe1\xbd\x93"     => "\xe1\xbd\x9b",
1242
      "\xe1\xbd\x91"     => "\xe1\xbd\x99",
1243
      "\xe1\xbd\x85"     => "\xe1\xbd\x8d",
1244
      "\xe1\xbd\x84"     => "\xe1\xbd\x8c",
1245
      "\xe1\xbd\x83"     => "\xe1\xbd\x8b",
1246
      "\xe1\xbd\x82"     => "\xe1\xbd\x8a",
1247
      "\xe1\xbd\x81"     => "\xe1\xbd\x89",
1248
      "\xe1\xbd\x80"     => "\xe1\xbd\x88",
1249
      "\xe1\xbc\xb7"     => "\xe1\xbc\xbf",
1250
      "\xe1\xbc\xb6"     => "\xe1\xbc\xbe",
1251
      "\xe1\xbc\xb5"     => "\xe1\xbc\xbd",
1252
      "\xe1\xbc\xb4"     => "\xe1\xbc\xbc",
1253
      "\xe1\xbc\xb3"     => "\xe1\xbc\xbb",
1254
      "\xe1\xbc\xb2"     => "\xe1\xbc\xba",
1255
      "\xe1\xbc\xb1"     => "\xe1\xbc\xb9",
1256
      "\xe1\xbc\xb0"     => "\xe1\xbc\xb8",
1257
      "\xe1\xbc\xa7"     => "\xe1\xbc\xaf",
1258
      "\xe1\xbc\xa6"     => "\xe1\xbc\xae",
1259
      "\xe1\xbc\xa5"     => "\xe1\xbc\xad",
1260
      "\xe1\xbc\xa4"     => "\xe1\xbc\xac",
1261
      "\xe1\xbc\xa3"     => "\xe1\xbc\xab",
1262
      "\xe1\xbc\xa2"     => "\xe1\xbc\xaa",
1263
      "\xe1\xbc\xa1"     => "\xe1\xbc\xa9",
1264
      "\xe1\xbc\xa0"     => "\xe1\xbc\xa8",
1265
      "\xe1\xbc\x95"     => "\xe1\xbc\x9d",
1266
      "\xe1\xbc\x94"     => "\xe1\xbc\x9c",
1267
      "\xe1\xbc\x93"     => "\xe1\xbc\x9b",
1268
      "\xe1\xbc\x92"     => "\xe1\xbc\x9a",
1269
      "\xe1\xbc\x91"     => "\xe1\xbc\x99",
1270
      "\xe1\xbc\x90"     => "\xe1\xbc\x98",
1271
      "\xe1\xbc\x87"     => "\xe1\xbc\x8f",
1272
      "\xe1\xbc\x86"     => "\xe1\xbc\x8e",
1273
      "\xe1\xbc\x85"     => "\xe1\xbc\x8d",
1274
      "\xe1\xbc\x84"     => "\xe1\xbc\x8c",
1275
      "\xe1\xbc\x83"     => "\xe1\xbc\x8b",
1276
      "\xe1\xbc\x82"     => "\xe1\xbc\x8a",
1277
      "\xe1\xbc\x81"     => "\xe1\xbc\x89",
1278
      "\xe1\xbc\x80"     => "\xe1\xbc\x88",
1279
      "\xe1\xbb\xbf"     => "\xe1\xbb\xbe",
1280
      "\xe1\xbb\xbd"     => "\xe1\xbb\xbc",
1281
      "\xe1\xbb\xbb"     => "\xe1\xbb\xba",
1282
      "\xe1\xbb\xb9"     => "\xe1\xbb\xb8",
1283
      "\xe1\xbb\xb7"     => "\xe1\xbb\xb6",
1284
      "\xe1\xbb\xb5"     => "\xe1\xbb\xb4",
1285
      "\xe1\xbb\xb3"     => "\xe1\xbb\xb2",
1286
      "\xe1\xbb\xb1"     => "\xe1\xbb\xb0",
1287
      "\xe1\xbb\xaf"     => "\xe1\xbb\xae",
1288
      "\xe1\xbb\xad"     => "\xe1\xbb\xac",
1289
      "\xe1\xbb\xab"     => "\xe1\xbb\xaa",
1290
      "\xe1\xbb\xa9"     => "\xe1\xbb\xa8",
1291
      "\xe1\xbb\xa7"     => "\xe1\xbb\xa6",
1292
      "\xe1\xbb\xa5"     => "\xe1\xbb\xa4",
1293
      "\xe1\xbb\xa3"     => "\xe1\xbb\xa2",
1294
      "\xe1\xbb\xa1"     => "\xe1\xbb\xa0",
1295
      "\xe1\xbb\x9f"     => "\xe1\xbb\x9e",
1296
      "\xe1\xbb\x9d"     => "\xe1\xbb\x9c",
1297
      "\xe1\xbb\x9b"     => "\xe1\xbb\x9a",
1298
      "\xe1\xbb\x99"     => "\xe1\xbb\x98",
1299
      "\xe1\xbb\x97"     => "\xe1\xbb\x96",
1300
      "\xe1\xbb\x95"     => "\xe1\xbb\x94",
1301
      "\xe1\xbb\x93"     => "\xe1\xbb\x92",
1302
      "\xe1\xbb\x91"     => "\xe1\xbb\x90",
1303
      "\xe1\xbb\x8f"     => "\xe1\xbb\x8e",
1304
      "\xe1\xbb\x8d"     => "\xe1\xbb\x8c",
1305
      "\xe1\xbb\x8b"     => "\xe1\xbb\x8a",
1306
      "\xe1\xbb\x89"     => "\xe1\xbb\x88",
1307
      "\xe1\xbb\x87"     => "\xe1\xbb\x86",
1308
      "\xe1\xbb\x85"     => "\xe1\xbb\x84",
1309
      "\xe1\xbb\x83"     => "\xe1\xbb\x82",
1310
      "\xe1\xbb\x81"     => "\xe1\xbb\x80",
1311
      "\xe1\xba\xbf"     => "\xe1\xba\xbe",
1312
      "\xe1\xba\xbd"     => "\xe1\xba\xbc",
1313
      "\xe1\xba\xbb"     => "\xe1\xba\xba",
1314
      "\xe1\xba\xb9"     => "\xe1\xba\xb8",
1315
      "\xe1\xba\xb7"     => "\xe1\xba\xb6",
1316
      "\xe1\xba\xb5"     => "\xe1\xba\xb4",
1317
      "\xe1\xba\xb3"     => "\xe1\xba\xb2",
1318
      "\xe1\xba\xb1"     => "\xe1\xba\xb0",
1319
      "\xe1\xba\xaf"     => "\xe1\xba\xae",
1320
      "\xe1\xba\xad"     => "\xe1\xba\xac",
1321
      "\xe1\xba\xab"     => "\xe1\xba\xaa",
1322
      "\xe1\xba\xa9"     => "\xe1\xba\xa8",
1323
      "\xe1\xba\xa7"     => "\xe1\xba\xa6",
1324
      "\xe1\xba\xa5"     => "\xe1\xba\xa4",
1325
      "\xe1\xba\xa3"     => "\xe1\xba\xa2",
1326
      "\xe1\xba\xa1"     => "\xe1\xba\xa0",
1327
      "\xe1\xba\x9b"     => "\xe1\xb9\xa0",
1328
      "\xe1\xba\x95"     => "\xe1\xba\x94",
1329
      "\xe1\xba\x93"     => "\xe1\xba\x92",
1330
      "\xe1\xba\x91"     => "\xe1\xba\x90",
1331
      "\xe1\xba\x8f"     => "\xe1\xba\x8e",
1332
      "\xe1\xba\x8d"     => "\xe1\xba\x8c",
1333
      "\xe1\xba\x8b"     => "\xe1\xba\x8a",
1334
      "\xe1\xba\x89"     => "\xe1\xba\x88",
1335
      "\xe1\xba\x87"     => "\xe1\xba\x86",
1336
      "\xe1\xba\x85"     => "\xe1\xba\x84",
1337
      "\xe1\xba\x83"     => "\xe1\xba\x82",
1338
      "\xe1\xba\x81"     => "\xe1\xba\x80",
1339
      "\xe1\xb9\xbf"     => "\xe1\xb9\xbe",
1340
      "\xe1\xb9\xbd"     => "\xe1\xb9\xbc",
1341
      "\xe1\xb9\xbb"     => "\xe1\xb9\xba",
1342
      "\xe1\xb9\xb9"     => "\xe1\xb9\xb8",
1343
      "\xe1\xb9\xb7"     => "\xe1\xb9\xb6",
1344
      "\xe1\xb9\xb5"     => "\xe1\xb9\xb4",
1345
      "\xe1\xb9\xb3"     => "\xe1\xb9\xb2",
1346
      "\xe1\xb9\xb1"     => "\xe1\xb9\xb0",
1347
      "\xe1\xb9\xaf"     => "\xe1\xb9\xae",
1348
      "\xe1\xb9\xad"     => "\xe1\xb9\xac",
1349
      "\xe1\xb9\xab"     => "\xe1\xb9\xaa",
1350
      "\xe1\xb9\xa9"     => "\xe1\xb9\xa8",
1351
      "\xe1\xb9\xa7"     => "\xe1\xb9\xa6",
1352
      "\xe1\xb9\xa5"     => "\xe1\xb9\xa4",
1353
      "\xe1\xb9\xa3"     => "\xe1\xb9\xa2",
1354
      "\xe1\xb9\xa1"     => "\xe1\xb9\xa0",
1355
      "\xe1\xb9\x9f"     => "\xe1\xb9\x9e",
1356
      "\xe1\xb9\x9d"     => "\xe1\xb9\x9c",
1357
      "\xe1\xb9\x9b"     => "\xe1\xb9\x9a",
1358
      "\xe1\xb9\x99"     => "\xe1\xb9\x98",
1359
      "\xe1\xb9\x97"     => "\xe1\xb9\x96",
1360
      "\xe1\xb9\x95"     => "\xe1\xb9\x94",
1361
      "\xe1\xb9\x93"     => "\xe1\xb9\x92",
1362
      "\xe1\xb9\x91"     => "\xe1\xb9\x90",
1363
      "\xe1\xb9\x8f"     => "\xe1\xb9\x8e",
1364
      "\xe1\xb9\x8d"     => "\xe1\xb9\x8c",
1365
      "\xe1\xb9\x8b"     => "\xe1\xb9\x8a",
1366
      "\xe1\xb9\x89"     => "\xe1\xb9\x88",
1367
      "\xe1\xb9\x87"     => "\xe1\xb9\x86",
1368
      "\xe1\xb9\x85"     => "\xe1\xb9\x84",
1369
      "\xe1\xb9\x83"     => "\xe1\xb9\x82",
1370
      "\xe1\xb9\x81"     => "\xe1\xb9\x80",
1371
      "\xe1\xb8\xbf"     => "\xe1\xb8\xbe",
1372
      "\xe1\xb8\xbd"     => "\xe1\xb8\xbc",
1373
      "\xe1\xb8\xbb"     => "\xe1\xb8\xba",
1374
      "\xe1\xb8\xb9"     => "\xe1\xb8\xb8",
1375
      "\xe1\xb8\xb7"     => "\xe1\xb8\xb6",
1376
      "\xe1\xb8\xb5"     => "\xe1\xb8\xb4",
1377
      "\xe1\xb8\xb3"     => "\xe1\xb8\xb2",
1378
      "\xe1\xb8\xb1"     => "\xe1\xb8\xb0",
1379
      "\xe1\xb8\xaf"     => "\xe1\xb8\xae",
1380
      "\xe1\xb8\xad"     => "\xe1\xb8\xac",
1381
      "\xe1\xb8\xab"     => "\xe1\xb8\xaa",
1382
      "\xe1\xb8\xa9"     => "\xe1\xb8\xa8",
1383
      "\xe1\xb8\xa7"     => "\xe1\xb8\xa6",
1384
      "\xe1\xb8\xa5"     => "\xe1\xb8\xa4",
1385
      "\xe1\xb8\xa3"     => "\xe1\xb8\xa2",
1386
      "\xe1\xb8\xa1"     => "\xe1\xb8\xa0",
1387
      "\xe1\xb8\x9f"     => "\xe1\xb8\x9e",
1388
      "\xe1\xb8\x9d"     => "\xe1\xb8\x9c",
1389
      "\xe1\xb8\x9b"     => "\xe1\xb8\x9a",
1390
      "\xe1\xb8\x99"     => "\xe1\xb8\x98",
1391
      "\xe1\xb8\x97"     => "\xe1\xb8\x96",
1392
      "\xe1\xb8\x95"     => "\xe1\xb8\x94",
1393
      "\xe1\xb8\x93"     => "\xe1\xb8\x92",
1394
      "\xe1\xb8\x91"     => "\xe1\xb8\x90",
1395
      "\xe1\xb8\x8f"     => "\xe1\xb8\x8e",
1396
      "\xe1\xb8\x8d"     => "\xe1\xb8\x8c",
1397
      "\xe1\xb8\x8b"     => "\xe1\xb8\x8a",
1398
      "\xe1\xb8\x89"     => "\xe1\xb8\x88",
1399
      "\xe1\xb8\x87"     => "\xe1\xb8\x86",
1400
      "\xe1\xb8\x85"     => "\xe1\xb8\x84",
1401
      "\xe1\xb8\x83"     => "\xe1\xb8\x82",
1402
      "\xe1\xb8\x81"     => "\xe1\xb8\x80",
1403
      "\xe1\xb5\xbd"     => "\xe2\xb1\xa3",
1404
      "\xe1\xb5\xb9"     => "\xea\x9d\xbd",
1405
      "\xd6\x86"         => "\xd5\x96",
1406
      "\xd6\x85"         => "\xd5\x95",
1407
      "\xd6\x84"         => "\xd5\x94",
1408
      "\xd6\x83"         => "\xd5\x93",
1409
      "\xd6\x82"         => "\xd5\x92",
1410
      "\xd6\x81"         => "\xd5\x91",
1411
      "\xd6\x80"         => "\xd5\x90",
1412
      "\xd5\xbf"         => "\xd5\x8f",
1413
      "\xd5\xbe"         => "\xd5\x8e",
1414
      "\xd5\xbd"         => "\xd5\x8d",
1415
      "\xd5\xbc"         => "\xd5\x8c",
1416
      "\xd5\xbb"         => "\xd5\x8b",
1417
      "\xd5\xba"         => "\xd5\x8a",
1418
      "\xd5\xb9"         => "\xd5\x89",
1419
      "\xd5\xb8"         => "\xd5\x88",
1420
      "\xd5\xb7"         => "\xd5\x87",
1421
      "\xd5\xb6"         => "\xd5\x86",
1422
      "\xd5\xb5"         => "\xd5\x85",
1423
      "\xd5\xb4"         => "\xd5\x84",
1424
      "\xd5\xb3"         => "\xd5\x83",
1425
      "\xd5\xb2"         => "\xd5\x82",
1426
      "\xd5\xb1"         => "\xd5\x81",
1427
      "\xd5\xb0"         => "\xd5\x80",
1428
      "\xd5\xaf"         => "\xd4\xbf",
1429
      "\xd5\xae"         => "\xd4\xbe",
1430
      "\xd5\xad"         => "\xd4\xbd",
1431
      "\xd5\xac"         => "\xd4\xbc",
1432
      "\xd5\xab"         => "\xd4\xbb",
1433
      "\xd5\xaa"         => "\xd4\xba",
1434
      "\xd5\xa9"         => "\xd4\xb9",
1435
      "\xd5\xa8"         => "\xd4\xb8",
1436
      "\xd5\xa7"         => "\xd4\xb7",
1437
      "\xd5\xa6"         => "\xd4\xb6",
1438
      "\xd5\xa5"         => "\xd4\xb5",
1439
      "\xd5\xa4"         => "\xd4\xb4",
1440
      "\xd5\xa3"         => "\xd4\xb3",
1441
      "\xd5\xa2"         => "\xd4\xb2",
1442
      "\xd5\xa1"         => "\xd4\xb1",
1443
      "\xd4\xa5"         => "\xd4\xa4",
1444
      "\xd4\xa3"         => "\xd4\xa2",
1445
      "\xd4\xa1"         => "\xd4\xa0",
1446
      "\xd4\x9f"         => "\xd4\x9e",
1447
      "\xd4\x9d"         => "\xd4\x9c",
1448
      "\xd4\x9b"         => "\xd4\x9a",
1449
      "\xd4\x99"         => "\xd4\x98",
1450
      "\xd4\x97"         => "\xd4\x96",
1451
      "\xd4\x95"         => "\xd4\x94",
1452
      "\xd4\x93"         => "\xd4\x92",
1453
      "\xd4\x91"         => "\xd4\x90",
1454
      "\xd4\x8f"         => "\xd4\x8e",
1455
      "\xd4\x8d"         => "\xd4\x8c",
1456
      "\xd4\x8b"         => "\xd4\x8a",
1457
      "\xd4\x89"         => "\xd4\x88",
1458
      "\xd4\x87"         => "\xd4\x86",
1459
      "\xd4\x85"         => "\xd4\x84",
1460
      "\xd4\x83"         => "\xd4\x82",
1461
      "\xd4\x81"         => "\xd4\x80",
1462
      "\xd3\xbf"         => "\xd3\xbe",
1463
      "\xd3\xbd"         => "\xd3\xbc",
1464
      "\xd3\xbb"         => "\xd3\xba",
1465
      "\xd3\xb9"         => "\xd3\xb8",
1466
      "\xd3\xb7"         => "\xd3\xb6",
1467
      "\xd3\xb5"         => "\xd3\xb4",
1468
      "\xd3\xb3"         => "\xd3\xb2",
1469
      "\xd3\xb1"         => "\xd3\xb0",
1470
      "\xd3\xaf"         => "\xd3\xae",
1471
      "\xd3\xad"         => "\xd3\xac",
1472
      "\xd3\xab"         => "\xd3\xaa",
1473
      "\xd3\xa9"         => "\xd3\xa8",
1474
      "\xd3\xa7"         => "\xd3\xa6",
1475
      "\xd3\xa5"         => "\xd3\xa4",
1476
      "\xd3\xa3"         => "\xd3\xa2",
1477
      "\xd3\xa1"         => "\xd3\xa0",
1478
      "\xd3\x9f"         => "\xd3\x9e",
1479
      "\xd3\x9d"         => "\xd3\x9c",
1480
      "\xd3\x9b"         => "\xd3\x9a",
1481
      "\xd3\x99"         => "\xd3\x98",
1482
      "\xd3\x97"         => "\xd3\x96",
1483
      "\xd3\x95"         => "\xd3\x94",
1484
      "\xd3\x93"         => "\xd3\x92",
1485
      "\xd3\x91"         => "\xd3\x90",
1486
      "\xd3\x8f"         => "\xd3\x80",
1487
      "\xd3\x8e"         => "\xd3\x8d",
1488
      "\xd3\x8c"         => "\xd3\x8b",
1489
      "\xd3\x8a"         => "\xd3\x89",
1490
      "\xd3\x88"         => "\xd3\x87",
1491
      "\xd3\x86"         => "\xd3\x85",
1492
      "\xd3\x84"         => "\xd3\x83",
1493
      "\xd3\x82"         => "\xd3\x81",
1494
      "\xd2\xbf"         => "\xd2\xbe",
1495
      "\xd2\xbd"         => "\xd2\xbc",
1496
      "\xd2\xbb"         => "\xd2\xba",
1497
      "\xd2\xb9"         => "\xd2\xb8",
1498
      "\xd2\xb7"         => "\xd2\xb6",
1499
      "\xd2\xb5"         => "\xd2\xb4",
1500
      "\xd2\xb3"         => "\xd2\xb2",
1501
      "\xd2\xb1"         => "\xd2\xb0",
1502
      "\xd2\xaf"         => "\xd2\xae",
1503
      "\xd2\xad"         => "\xd2\xac",
1504
      "\xd2\xab"         => "\xd2\xaa",
1505
      "\xd2\xa9"         => "\xd2\xa8",
1506
      "\xd2\xa7"         => "\xd2\xa6",
1507
      "\xd2\xa5"         => "\xd2\xa4",
1508
      "\xd2\xa3"         => "\xd2\xa2",
1509
      "\xd2\xa1"         => "\xd2\xa0",
1510
      "\xd2\x9f"         => "\xd2\x9e",
1511
      "\xd2\x9d"         => "\xd2\x9c",
1512
      "\xd2\x9b"         => "\xd2\x9a",
1513
      "\xd2\x99"         => "\xd2\x98",
1514
      "\xd2\x97"         => "\xd2\x96",
1515
      "\xd2\x95"         => "\xd2\x94",
1516
      "\xd2\x93"         => "\xd2\x92",
1517
      "\xd2\x91"         => "\xd2\x90",
1518
      "\xd2\x8f"         => "\xd2\x8e",
1519
      "\xd2\x8d"         => "\xd2\x8c",
1520
      "\xd2\x8b"         => "\xd2\x8a",
1521
      "\xd2\x81"         => "\xd2\x80",
1522
      "\xd1\xbf"         => "\xd1\xbe",
1523
      "\xd1\xbd"         => "\xd1\xbc",
1524
      "\xd1\xbb"         => "\xd1\xba",
1525
      "\xd1\xb9"         => "\xd1\xb8",
1526
      "\xd1\xb7"         => "\xd1\xb6",
1527
      "\xd1\xb5"         => "\xd1\xb4",
1528
      "\xd1\xb3"         => "\xd1\xb2",
1529
      "\xd1\xb1"         => "\xd1\xb0",
1530
      "\xd1\xaf"         => "\xd1\xae",
1531
      "\xd1\xad"         => "\xd1\xac",
1532
      "\xd1\xab"         => "\xd1\xaa",
1533
      "\xd1\xa9"         => "\xd1\xa8",
1534
      "\xd1\xa7"         => "\xd1\xa6",
1535
      "\xd1\xa5"         => "\xd1\xa4",
1536
      "\xd1\xa3"         => "\xd1\xa2",
1537
      "\xd1\xa1"         => "\xd1\xa0",
1538
      "\xd1\x9f"         => "\xd0\x8f",
1539
      "\xd1\x9e"         => "\xd0\x8e",
1540
      "\xd1\x9d"         => "\xd0\x8d",
1541
      "\xd1\x9c"         => "\xd0\x8c",
1542
      "\xd1\x9b"         => "\xd0\x8b",
1543
      "\xd1\x9a"         => "\xd0\x8a",
1544
      "\xd1\x99"         => "\xd0\x89",
1545
      "\xd1\x98"         => "\xd0\x88",
1546
      "\xd1\x97"         => "\xd0\x87",
1547
      "\xd1\x96"         => "\xd0\x86",
1548
      "\xd1\x95"         => "\xd0\x85",
1549
      "\xd1\x94"         => "\xd0\x84",
1550
      "\xd1\x93"         => "\xd0\x83",
1551
      "\xd1\x92"         => "\xd0\x82",
1552
      "\xd1\x91"         => "\xd0\x81",
1553
      "\xd1\x90"         => "\xd0\x80",
1554
      "\xd1\x8f"         => "\xd0\xaf",
1555
      "\xd1\x8e"         => "\xd0\xae",
1556
      "\xd1\x8d"         => "\xd0\xad",
1557
      "\xd1\x8c"         => "\xd0\xac",
1558
      "\xd1\x8b"         => "\xd0\xab",
1559
      "\xd1\x8a"         => "\xd0\xaa",
1560
      "\xd1\x89"         => "\xd0\xa9",
1561
      "\xd1\x88"         => "\xd0\xa8",
1562
      "\xd1\x87"         => "\xd0\xa7",
1563
      "\xd1\x86"         => "\xd0\xa6",
1564
      "\xd1\x85"         => "\xd0\xa5",
1565
      "\xd1\x84"         => "\xd0\xa4",
1566
      "\xd1\x83"         => "\xd0\xa3",
1567
      "\xd1\x82"         => "\xd0\xa2",
1568
      "\xd1\x81"         => "\xd0\xa1",
1569
      "\xd1\x80"         => "\xd0\xa0",
1570
      "\xd0\xbf"         => "\xd0\x9f",
1571
      "\xd0\xbe"         => "\xd0\x9e",
1572
      "\xd0\xbd"         => "\xd0\x9d",
1573
      "\xd0\xbc"         => "\xd0\x9c",
1574
      "\xd0\xbb"         => "\xd0\x9b",
1575
      "\xd0\xba"         => "\xd0\x9a",
1576
      "\xd0\xb9"         => "\xd0\x99",
1577
      "\xd0\xb8"         => "\xd0\x98",
1578
      "\xd0\xb7"         => "\xd0\x97",
1579
      "\xd0\xb6"         => "\xd0\x96",
1580
      "\xd0\xb5"         => "\xd0\x95",
1581
      "\xd0\xb4"         => "\xd0\x94",
1582
      "\xd0\xb3"         => "\xd0\x93",
1583
      "\xd0\xb2"         => "\xd0\x92",
1584
      "\xd0\xb1"         => "\xd0\x91",
1585
      "\xd0\xb0"         => "\xd0\x90",
1586
      "\xcf\xbb"         => "\xcf\xba",
1587
      "\xcf\xb8"         => "\xcf\xb7",
1588
      "\xcf\xb5"         => "\xce\x95",
1589
      "\xcf\xb2"         => "\xcf\xb9",
1590
      "\xcf\xb1"         => "\xce\xa1",
1591
      "\xcf\xb0"         => "\xce\x9a",
1592
      "\xcf\xaf"         => "\xcf\xae",
1593
      "\xcf\xad"         => "\xcf\xac",
1594
      "\xcf\xab"         => "\xcf\xaa",
1595
      "\xcf\xa9"         => "\xcf\xa8",
1596
      "\xcf\xa7"         => "\xcf\xa6",
1597
      "\xcf\xa5"         => "\xcf\xa4",
1598
      "\xcf\xa3"         => "\xcf\xa2",
1599
      "\xcf\xa1"         => "\xcf\xa0",
1600
      "\xcf\x9f"         => "\xcf\x9e",
1601
      "\xcf\x9d"         => "\xcf\x9c",
1602
      "\xcf\x9b"         => "\xcf\x9a",
1603
      "\xcf\x99"         => "\xcf\x98",
1604
      "\xcf\x97"         => "\xcf\x8f",
1605
      "\xcf\x96"         => "\xce\xa0",
1606
      "\xcf\x95"         => "\xce\xa6",
1607
      "\xcf\x91"         => "\xce\x98",
1608
      "\xcf\x90"         => "\xce\x92",
1609
      "\xcf\x8e"         => "\xce\x8f",
1610
      "\xcf\x8d"         => "\xce\x8e",
1611
      "\xcf\x8c"         => "\xce\x8c",
1612
      "\xcf\x8b"         => "\xce\xab",
1613
      "\xcf\x8a"         => "\xce\xaa",
1614
      "\xcf\x89"         => "\xce\xa9",
1615
      "\xcf\x88"         => "\xce\xa8",
1616
      "\xcf\x87"         => "\xce\xa7",
1617
      "\xcf\x86"         => "\xce\xa6",
1618
      "\xcf\x85"         => "\xce\xa5",
1619
      "\xcf\x84"         => "\xce\xa4",
1620
      "\xcf\x83"         => "\xce\xa3",
1621
      "\xcf\x82"         => "\xce\xa3",
1622
      "\xcf\x81"         => "\xce\xa1",
1623
      "\xcf\x80"         => "\xce\xa0",
1624
      "\xce\xbf"         => "\xce\x9f",
1625
      "\xce\xbe"         => "\xce\x9e",
1626
      "\xce\xbd"         => "\xce\x9d",
1627
      "\xce\xbc"         => "\xce\x9c",
1628
      "\xce\xbb"         => "\xce\x9b",
1629
      "\xce\xba"         => "\xce\x9a",
1630
      "\xce\xb9"         => "\xce\x99",
1631
      "\xce\xb8"         => "\xce\x98",
1632
      "\xce\xb7"         => "\xce\x97",
1633
      "\xce\xb6"         => "\xce\x96",
1634
      "\xce\xb5"         => "\xce\x95",
1635
      "\xce\xb4"         => "\xce\x94",
1636
      "\xce\xb3"         => "\xce\x93",
1637
      "\xce\xb2"         => "\xce\x92",
1638
      "\xce\xb1"         => "\xce\x91",
1639
      "\xce\xaf"         => "\xce\x8a",
1640
      "\xce\xae"         => "\xce\x89",
1641
      "\xce\xad"         => "\xce\x88",
1642
      "\xce\xac"         => "\xce\x86",
1643
      "\xcd\xbd"         => "\xcf\xbf",
1644
      "\xcd\xbc"         => "\xcf\xbe",
1645
      "\xcd\xbb"         => "\xcf\xbd",
1646
      "\xcd\xb7"         => "\xcd\xb6",
1647
      "\xcd\xb3"         => "\xcd\xb2",
1648
      "\xcd\xb1"         => "\xcd\xb0",
1649
      "\xca\x92"         => "\xc6\xb7",
1650
      "\xca\x8c"         => "\xc9\x85",
1651
      "\xca\x8b"         => "\xc6\xb2",
1652
      "\xca\x8a"         => "\xc6\xb1",
1653
      "\xca\x89"         => "\xc9\x84",
1654
      "\xca\x88"         => "\xc6\xae",
1655
      "\xca\x83"         => "\xc6\xa9",
1656
      "\xca\x80"         => "\xc6\xa6",
1657
      "\xc9\xbd"         => "\xe2\xb1\xa4",
1658
      "\xc9\xb5"         => "\xc6\x9f",
1659
      "\xc9\xb2"         => "\xc6\x9d",
1660
      "\xc9\xb1"         => "\xe2\xb1\xae",
1661
      "\xc9\xaf"         => "\xc6\x9c",
1662
      "\xc9\xab"         => "\xe2\xb1\xa2",
1663
      "\xc9\xa9"         => "\xc6\x96",
1664
      "\xc9\xa8"         => "\xc6\x97",
1665
      "\xc9\xa5"         => "\xea\x9e\x8d",
1666
      "\xc9\xa3"         => "\xc6\x94",
1667
      "\xc9\xa0"         => "\xc6\x93",
1668
      "\xc9\x9b"         => "\xc6\x90",
1669
      "\xc9\x99"         => "\xc6\x8f",
1670
      "\xc9\x97"         => "\xc6\x8a",
1671
      "\xc9\x96"         => "\xc6\x89",
1672
      "\xc9\x94"         => "\xc6\x86",
1673
      "\xc9\x93"         => "\xc6\x81",
1674
      "\xc9\x92"         => "\xe2\xb1\xb0",
1675
      "\xc9\x91"         => "\xe2\xb1\xad",
1676
      "\xc9\x90"         => "\xe2\xb1\xaf",
1677
      "\xc9\x8f"         => "\xc9\x8e",
1678
      "\xc9\x8d"         => "\xc9\x8c",
1679
      "\xc9\x8b"         => "\xc9\x8a",
1680
      "\xc9\x89"         => "\xc9\x88",
1681
      "\xc9\x87"         => "\xc9\x86",
1682
      "\xc9\x82"         => "\xc9\x81",
1683
      "\xc9\x80"         => "\xe2\xb1\xbf",
1684
      "\xc8\xbf"         => "\xe2\xb1\xbe",
1685
      "\xc8\xbc"         => "\xc8\xbb",
1686
      "\xc8\xb3"         => "\xc8\xb2",
1687
      "\xc8\xb1"         => "\xc8\xb0",
1688
      "\xc8\xaf"         => "\xc8\xae",
1689
      "\xc8\xad"         => "\xc8\xac",
1690
      "\xc8\xab"         => "\xc8\xaa",
1691
      "\xc8\xa9"         => "\xc8\xa8",
1692
      "\xc8\xa7"         => "\xc8\xa6",
1693
      "\xc8\xa5"         => "\xc8\xa4",
1694
      "\xc8\xa3"         => "\xc8\xa2",
1695
      "\xc8\x9f"         => "\xc8\x9e",
1696
      "\xc8\x9d"         => "\xc8\x9c",
1697
      "\xc8\x9b"         => "\xc8\x9a",
1698
      "\xc8\x99"         => "\xc8\x98",
1699
      "\xc8\x97"         => "\xc8\x96",
1700
      "\xc8\x95"         => "\xc8\x94",
1701
      "\xc8\x93"         => "\xc8\x92",
1702
      "\xc8\x91"         => "\xc8\x90",
1703
      "\xc8\x8f"         => "\xc8\x8e",
1704
      "\xc8\x8d"         => "\xc8\x8c",
1705
      "\xc8\x8b"         => "\xc8\x8a",
1706
      "\xc8\x89"         => "\xc8\x88",
1707
      "\xc8\x87"         => "\xc8\x86",
1708
      "\xc8\x85"         => "\xc8\x84",
1709
      "\xc8\x83"         => "\xc8\x82",
1710
      "\xc8\x81"         => "\xc8\x80",
1711
      "\xc7\xbf"         => "\xc7\xbe",
1712
      "\xc7\xbd"         => "\xc7\xbc",
1713
      "\xc7\xbb"         => "\xc7\xba",
1714
      "\xc7\xb9"         => "\xc7\xb8",
1715
      "\xc7\xb5"         => "\xc7\xb4",
1716
      "\xc7\xb3"         => "\xc7\xb2",
1717
      "\xc7\xaf"         => "\xc7\xae",
1718
      "\xc7\xad"         => "\xc7\xac",
1719
      "\xc7\xab"         => "\xc7\xaa",
1720
      "\xc7\xa9"         => "\xc7\xa8",
1721
      "\xc7\xa7"         => "\xc7\xa6",
1722
      "\xc7\xa5"         => "\xc7\xa4",
1723
      "\xc7\xa3"         => "\xc7\xa2",
1724
      "\xc7\xa1"         => "\xc7\xa0",
1725
      "\xc7\x9f"         => "\xc7\x9e",
1726
      "\xc7\x9d"         => "\xc6\x8e",
1727
      "\xc7\x9c"         => "\xc7\x9b",
1728
      "\xc7\x9a"         => "\xc7\x99",
1729
      "\xc7\x98"         => "\xc7\x97",
1730
      "\xc7\x96"         => "\xc7\x95",
1731
      "\xc7\x94"         => "\xc7\x93",
1732
      "\xc7\x92"         => "\xc7\x91",
1733
      "\xc7\x90"         => "\xc7\x8f",
1734
      "\xc7\x8e"         => "\xc7\x8d",
1735
      "\xc7\x8c"         => "\xc7\x8b",
1736
      "\xc7\x89"         => "\xc7\x88",
1737
      "\xc7\x86"         => "\xc7\x85",
1738
      "\xc6\xbf"         => "\xc7\xb7",
1739
      "\xc6\xbd"         => "\xc6\xbc",
1740
      "\xc6\xb9"         => "\xc6\xb8",
1741
      "\xc6\xb6"         => "\xc6\xb5",
1742
      "\xc6\xb4"         => "\xc6\xb3",
1743
      "\xc6\xb0"         => "\xc6\xaf",
1744
      "\xc6\xad"         => "\xc6\xac",
1745
      "\xc6\xa8"         => "\xc6\xa7",
1746
      "\xc6\xa5"         => "\xc6\xa4",
1747
      "\xc6\xa3"         => "\xc6\xa2",
1748
      "\xc6\xa1"         => "\xc6\xa0",
1749
      "\xc6\x9e"         => "\xc8\xa0",
1750
      "\xc6\x9a"         => "\xc8\xbd",
1751
      "\xc6\x99"         => "\xc6\x98",
1752
      "\xc6\x95"         => "\xc7\xb6",
1753
      "\xc6\x92"         => "\xc6\x91",
1754
      "\xc6\x8c"         => "\xc6\x8b",
1755
      "\xc6\x88"         => "\xc6\x87",
1756
      "\xc6\x85"         => "\xc6\x84",
1757
      "\xc6\x83"         => "\xc6\x82",
1758
      "\xc6\x80"         => "\xc9\x83",
1759
      "\xc5\xbf"         => "\x53",
1760
      "\xc5\xbe"         => "\xc5\xbd",
1761
      "\xc5\xbc"         => "\xc5\xbb",
1762
      "\xc5\xba"         => "\xc5\xb9",
1763
      "\xc5\xb7"         => "\xc5\xb6",
1764
      "\xc5\xb5"         => "\xc5\xb4",
1765
      "\xc5\xb3"         => "\xc5\xb2",
1766
      "\xc5\xb1"         => "\xc5\xb0",
1767
      "\xc5\xaf"         => "\xc5\xae",
1768
      "\xc5\xad"         => "\xc5\xac",
1769
      "\xc5\xab"         => "\xc5\xaa",
1770
      "\xc5\xa9"         => "\xc5\xa8",
1771
      "\xc5\xa7"         => "\xc5\xa6",
1772
      "\xc5\xa5"         => "\xc5\xa4",
1773
      "\xc5\xa3"         => "\xc5\xa2",
1774
      "\xc5\xa1"         => "\xc5\xa0",
1775
      "\xc5\x9f"         => "\xc5\x9e",
1776
      "\xc5\x9d"         => "\xc5\x9c",
1777
      "\xc5\x9b"         => "\xc5\x9a",
1778
      "\xc5\x99"         => "\xc5\x98",
1779
      "\xc5\x97"         => "\xc5\x96",
1780
      "\xc5\x95"         => "\xc5\x94",
1781
      "\xc5\x93"         => "\xc5\x92",
1782
      "\xc5\x91"         => "\xc5\x90",
1783
      "\xc5\x8f"         => "\xc5\x8e",
1784
      "\xc5\x8d"         => "\xc5\x8c",
1785
      "\xc5\x8b"         => "\xc5\x8a",
1786
      "\xc5\x88"         => "\xc5\x87",
1787
      "\xc5\x86"         => "\xc5\x85",
1788
      "\xc5\x84"         => "\xc5\x83",
1789
      "\xc5\x82"         => "\xc5\x81",
1790
      "\xc5\x80"         => "\xc4\xbf",
1791
      "\xc4\xbe"         => "\xc4\xbd",
1792
      "\xc4\xbc"         => "\xc4\xbb",
1793
      "\xc4\xba"         => "\xc4\xb9",
1794
      "\xc4\xb7"         => "\xc4\xb6",
1795
      "\xc4\xb5"         => "\xc4\xb4",
1796
      "\xc4\xb3"         => "\xc4\xb2",
1797
      "\xc4\xb1"         => "\x49",
1798
      "\xc4\xaf"         => "\xc4\xae",
1799
      "\xc4\xad"         => "\xc4\xac",
1800
      "\xc4\xab"         => "\xc4\xaa",
1801
      "\xc4\xa9"         => "\xc4\xa8",
1802
      "\xc4\xa7"         => "\xc4\xa6",
1803
      "\xc4\xa5"         => "\xc4\xa4",
1804
      "\xc4\xa3"         => "\xc4\xa2",
1805
      "\xc4\xa1"         => "\xc4\xa0",
1806
      "\xc4\x9f"         => "\xc4\x9e",
1807
      "\xc4\x9d"         => "\xc4\x9c",
1808
      "\xc4\x9b"         => "\xc4\x9a",
1809
      "\xc4\x99"         => "\xc4\x98",
1810
      "\xc4\x97"         => "\xc4\x96",
1811
      "\xc4\x95"         => "\xc4\x94",
1812
      "\xc4\x93"         => "\xc4\x92",
1813
      "\xc4\x91"         => "\xc4\x90",
1814
      "\xc4\x8f"         => "\xc4\x8e",
1815
      "\xc4\x8d"         => "\xc4\x8c",
1816
      "\xc4\x8b"         => "\xc4\x8a",
1817
      "\xc4\x89"         => "\xc4\x88",
1818
      "\xc4\x87"         => "\xc4\x86",
1819
      "\xc4\x85"         => "\xc4\x84",
1820
      "\xc4\x83"         => "\xc4\x82",
1821
      "\xc4\x81"         => "\xc4\x80",
1822
      "\xc3\xbf"         => "\xc5\xb8",
1823
      "\xc3\xbe"         => "\xc3\x9e",
1824
      "\xc3\xbd"         => "\xc3\x9d",
1825
      "\xc3\xbc"         => "\xc3\x9c",
1826
      "\xc3\xbb"         => "\xc3\x9b",
1827
      "\xc3\xba"         => "\xc3\x9a",
1828
      "\xc3\xb9"         => "\xc3\x99",
1829
      "\xc3\xb8"         => "\xc3\x98",
1830
      "\xc3\xb6"         => "\xc3\x96",
1831
      "\xc3\xb5"         => "\xc3\x95",
1832
      "\xc3\xb4"         => "\xc3\x94",
1833
      "\xc3\xb3"         => "\xc3\x93",
1834
      "\xc3\xb2"         => "\xc3\x92",
1835
      "\xc3\xb1"         => "\xc3\x91",
1836
      "\xc3\xb0"         => "\xc3\x90",
1837
      "\xc3\xaf"         => "\xc3\x8f",
1838
      "\xc3\xae"         => "\xc3\x8e",
1839
      "\xc3\xad"         => "\xc3\x8d",
1840
      "\xc3\xac"         => "\xc3\x8c",
1841
      "\xc3\xab"         => "\xc3\x8b",
1842
      "\xc3\xaa"         => "\xc3\x8a",
1843
      "\xc3\xa9"         => "\xc3\x89",
1844
      "\xc3\xa8"         => "\xc3\x88",
1845
      "\xc3\xa7"         => "\xc3\x87",
1846
      "\xc3\xa6"         => "\xc3\x86",
1847
      "\xc3\xa5"         => "\xc3\x85",
1848
      "\xc3\xa4"         => "\xc3\x84",
1849
      "\xc3\xa3"         => "\xc3\x83",
1850
      "\xc3\xa2"         => "\xc3\x82",
1851
      "\xc3\xa1"         => "\xc3\x81",
1852
      "\xc3\xa0"         => "\xc3\x80",
1853
      "\xc2\xb5"         => "\xce\x9c",
1854
      "\x7a"             => "\x5a",
1855
      "\x79"             => "\x59",
1856
      "\x78"             => "\x58",
1857
      "\x77"             => "\x57",
1858
      "\x76"             => "\x56",
1859
      "\x75"             => "\x55",
1860
      "\x74"             => "\x54",
1861
      "\x73"             => "\x53",
1862
      "\x72"             => "\x52",
1863
      "\x71"             => "\x51",
1864
      "\x70"             => "\x50",
1865
      "\x6f"             => "\x4f",
1866
      "\x6e"             => "\x4e",
1867
      "\x6d"             => "\x4d",
1868
      "\x6c"             => "\x4c",
1869
      "\x6b"             => "\x4b",
1870
      "\x6a"             => "\x4a",
1871
      "\x69"             => "\x49",
1872
      "\x68"             => "\x48",
1873
      "\x67"             => "\x47",
1874
      "\x66"             => "\x46",
1875
      "\x65"             => "\x45",
1876
      "\x64"             => "\x44",
1877
      "\x63"             => "\x43",
1878
      "\x62"             => "\x42",
1879
      "\x61"             => "\x41",
1880
1881
    );
1882
1883
    return $case;
1884
  }
1885
1886
  /**
1887
   * This method will auto-detect your server environment for UTF-8 support.
1888
   *
1889
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
1890
   */
1891 194
  public static function checkForSupport()
  {
    // the "mbstring" entry doubles as the "detection already ran" marker
    if (isset(self::$support['mbstring'])) {
      return;
    }

    // first call: probe every optional feature once and remember the result
    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
1902
1903
  /**
1904
   * Generates a UTF-8 encoded character from the given code point.
1905
   *
1906
   * INFO: opposite to UTF8::ord()
1907
   *
1908
   * @param    int $code_point The code point for which to generate a character.
1909
   *
1910
   * @return   string|null Multi-Byte character, returns null on failure to encode.
1911
   */
1912 9
  public static function chr($code_point)
  {
    self::checkForSupport();

    $codePointAsInt = (int)$code_point;

    // prefer the intl implementation; it receives the original (uncast) argument
    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // everything that is not strictly an integer is handed to hex_to_int()
    $number = ($codePointAsInt === $code_point) ? $codePointAsInt : self::hex_to_int($code_point);

    // loose check on purpose: both "false" and "0" end up as null here
    if (!$number) {
      return null;
    }

    // let the entity decoder build the multi-byte character
    return self::html_entity_decode('&#' . $number . ';', ENT_QUOTES);
  }
1932
1933
  /**
1934
   * Applies callback to all characters of a string.
1935
   *
1936
   * @param  string|array $callback The callback function.
1937
   * @param  string       $str      UTF-8 string to run callback on.
1938
   *
1939
   * @return array The outcome of callback.
1940
   */
1941 1
  public static function chr_map($callback, $str)
  {
    // split into single characters and feed each one to the callback
    return array_map($callback, self::split($str));
  }
1947
1948
  /**
1949
   * Generates an array of byte length of each character of a Unicode string.
1950
   *
1951
   * 1 byte => U+0000  - U+007F
1952
   * 2 byte => U+0080  - U+07FF
1953
   * 3 byte => U+0800  - U+FFFF
1954
   * 4 byte => U+10000 - U+10FFFF
1955
   *
1956
   * @param    string $str The original Unicode string.
1957
   *
1958
   * @return   array An array of byte lengths of each character.
1959
   */
1960 4
  public static function chr_size_list($str)
  {
    // falsy input (e.g. '', '0', null) yields an empty list without splitting
    $characters = $str ? self::split($str) : array();

    // strlen() counts bytes, so every entry is the byte length of one character
    return array_map('strlen', $characters);
  }
1968
1969
  /**
1970
   * Get a decimal code representation of a specific character.
1971
   *
1972
   * @param   string $char The input character
1973
   *
1974
   * @return  int
1975
   */
1976 2
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;

    // an empty string has no lead byte; return 0 instead of reading
    // the non-existing offset 0 (which triggers an "Uninitialized string offset")
    if ($char === '') {
      return 0;
    }

    $code = self::ord($char[0]);

    if (!($code & 0x80)) {
      // 0xxxxxxx => plain single byte, the byte value is the code point
      return $code;
    }

    // derive the sequence length from the lead byte and strip its length marker bits
    if (($code & 0xe0) === 0xc0) {
      // 110xxxxx
      $bytes = 2;
      $code &= ~0xc0;
    } elseif (($code & 0xf0) === 0xe0) {
      // 1110xxxx
      $bytes = 3;
      $code &= ~0xe0;
    } elseif (($code & 0xf8) === 0xf0) {
      // 11110xxx
      $bytes = 4;
      $code &= ~0xf0;
    } else {
      // no recognised lead byte pattern: nothing to append, the value is returned as-is
      $bytes = 1;
    }

    for ($i = 1; $i < $bytes; $i++) {
      // 10xxxxxx => append the payload bits of each continuation byte;
      // a missing byte (truncated sequence) contributes zero bits instead of
      // reading past the end of the string
      $payload = isset($char[$i]) ? (self::ord($char[$i]) & ~0x80) : 0;
      $code = ($code << 6) + $payload;
    }

    return $code;
  }
2008
2009
  /**
2010
   * Get hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
2011
   *
2012
   * @param    string $char The input character
2013
   * @param    string $pfix
2014
   *
2015
   * @return   string The code point encoded as U+xxxx
2016
   */
2017
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    // character => code point => prefixed hex notation
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
2021
2022
  /**
2023
   * Splits a string into smaller chunks and multiple lines, using the specified line ending character.
2024
   *
2025
   * @param    string $body     The original string to be split.
2026
   * @param    int    $chunklen The maximum character length of a chunk.
2027
   * @param    string $end      The character(s) to be inserted at the end of each chunk.
2028
   *
2029
   * @return   string The chunked string
2030
   */
2031 1
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    // cut the body into pieces of $chunklen ...
    $chunks = self::split($body, $chunklen);

    // ... and glue them back together with the line ending in between
    return implode($end, $chunks);
  }
2035
2036
  /**
2037
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
2038
   *
2039
   * @param string $str                     The string to be sanitized.
2040
   * @param bool   $remove_bom
2041
   * @param bool   $normalize_whitespace
2042
   * @param bool   $normalize_msword        e.g.: "…" => "..."
2043
   * @param bool   $keep_non_breaking_space set true, to keep non-breaking-spaces
2044
   *
2045
   * @return string Clean UTF-8 encoded string
2046
   */
2047 41
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // see: http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // NOTE: caused connection reset problem on larger strings

    $pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // every match is replaced by group 1 only, so bytes matched by group 2 or 3 disappear
    $str = preg_replace($pattern, '$1', $str);

    // always applied: drop the replacement character and invisible characters
    $str = self::remove_invisible_characters(
        self::replace_diamond_question_mark($str, '')
    );

    // optional steps, only for flags that are strictly "true"
    if ($normalize_whitespace === true) {
      $str = self::normalize_whitespace($str, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
      $str = self::normalize_msword($str);
    }

    if ($remove_bom === true) {
      $str = self::removeBOM($str);
    }

    return $str;
  }
2082
2083
  /**
2084
   * Clean-up a string and show only printable UTF-8 chars at the end + fix UTF-8 encoding.
2085
   *
2086
   * @param string $str
2087
   *
2088
   * @return string
2089
   */
2090 4
  public static function cleanup($str)
  {
    $input = (string)$str;

    // nothing to do for an empty string
    if ($input === '') {
      return '';
    }

    // repair ISO <-> UTF-8 mix-ups first
    $repaired = self::fix_simple_utf8($input);

    // then run clean() with: remove BOM = on, normalize whitespace = on,
    // normalize msword = off, keep non-breaking-spaces = on
    return (string)self::clean($repaired, true, true, false, true);
  }
2110
2111
  /**
2112
   * Accepts a string or an array of strings and returns an array of Unicode code points.
2113
   *
2114
   * INFO: opposite to UTF8::string()
2115
   *
2116
   * @param    string|string[] $arg     A UTF-8 encoded string or an array of such strings.
2117
   * @param    bool            $u_style If True, will return code points in U+xxxx format,
2118
   *                                    default, code points will be returned as integers.
2119
   *
2120
   * @return   array The array of code points
2121
   */
2122 5
  public static function codepoints($arg, $u_style = false)
  {
    $class = '\\voku\\helper\\UTF8';

    // a plain string is split into characters first, anything else is mapped as given
    $characters = is_string($arg) ? self::split($arg) : $arg;

    // character => integer code point
    $codePoints = array_map(array($class, 'ord'), $characters);

    if (!$u_style) {
      return $codePoints;
    }

    // integer code point => hex notation (int_to_hex() is called with its default prefix)
    return array_map(array($class, 'int_to_hex'), $codePoints);
  }
2148
2149
  /**
2150
   * Returns count of characters used in a string.
2151
   *
2152
   * @param    string $str       The input string.
2153
   * @param    bool   $cleanUtf8 Clean non UTF-8 chars from the string.
2154
   *
2155
   * @return   array An associative array of Character as keys and
2156
   *           their count as values.
2157
   */
2158 6
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // one array entry per character (chunk length 1) ...
    $characters = self::split($str, 1, $cleanUtf8);

    // ... then count how often each distinct character occurs
    return array_count_values($characters);
  }
2162
2163
  /**
2164
   * Get a UTF-8 character from its decimal code representation.
2165
   *
2166
   * @param   int $code Code.
2167
   *
2168
   * @return  string
2169
   */
2170 1
  public static function decimal_to_chr($code)
  {
    self::checkForSupport();

    // build a hexadecimal numeric entity and let mbstring decode it into UTF-8
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
2180
2181
  /**
2182
   * Encode a string with a new charset-encoding.
2183
   *
2184
   * INFO:  The difference from "UTF8::utf8_encode()" is that this function also tries to fix broken / double encoding,
2185
   *        so you can call this function also on a UTF-8 String and you don't mess the string.
2186
   *
2187
   * @param string $encoding e.g. 'UTF-8', 'ISO-8859-1', etc.
2188
   * @param string $str      the string
2189
   * @param bool   $force    force the new encoding (we try to fix broken / double encoding for UTF-8)<br />
2190
   *                         otherwise we auto-detect the current string-encoding
2191
   *
2192
   * @return string
2193
   */
2194 11
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    // without a string or without a target encoding there is nothing to convert
    if ($str === '' || $encoding === '') {
      return $str;
    }

    $encoding = self::normalizeEncoding($encoding);
    $encodingDetected = self::str_detect_encoding($str);
    $forced = ($force === true);

    // bail out if detection failed, or if (unforced) the string already is in the target encoding
    if (!$encodingDetected || (!$forced && $encodingDetected === $encoding)) {
      return $str;
    }

    self::checkForSupport();

    // target UTF-8: handled by to_utf8() when forced or when the source is one of the listed encodings
    if (
        $encoding === 'UTF-8'
        &&
        (
            $forced
            || in_array($encodingDetected, array('UTF-8', 'WINDOWS-1252', 'ISO-8859-1'), true)
        )
    ) {
      return self::to_utf8($str);
    }

    // target ISO-8859-1: handled by to_win1252() when forced or when the source is one of the listed encodings
    if (
        $encoding === 'ISO-8859-1'
        &&
        (
            $forced
            || in_array($encodingDetected, array('ISO-8859-1', 'UTF-8'), true)
        )
    ) {
      return self::to_win1252($str);
    }

    // every other combination is delegated to mbstring
    $strEncoded = \mb_convert_encoding($str, $encoding, $encodingDetected);

    // a falsy conversion result falls back to the unconverted string
    return $strEncoded ? $strEncoded : $str;
  }
2255
2256
  /**
2257
   * Reads entire file into a string.
2258
   *
2259
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
2260
   *
2261
   * @link http://php.net/manual/en/function.file-get-contents.php
2262
   *
2263
   * @param string        $filename      <p>
2264
   *                                     Name of the file to read.
2265
   *                                     </p>
2266
   * @param int|null      $flags         [optional] <p>
2267
   *                                     Prior to PHP 6, this parameter is called
2268
   *                                     use_include_path and is a bool.
2269
   *                                     As of PHP 5 the FILE_USE_INCLUDE_PATH can be used
2270
   *                                     to trigger include path
2271
   *                                     search.
2272
   *                                     </p>
2273
   *                                     <p>
2274
   *                                     The value of flags can be any combination of
2275
   *                                     the following flags (with some restrictions), joined with the
2276
   *                                     binary OR (|)
2277
   *                                     operator.
2278
   *                                     </p>
2279
   *                                     <p>
2280
   *                                     <table>
2281
   *                                     Available flags
2282
   *                                     <tr valign="top">
2283
   *                                     <td>Flag</td>
2284
   *                                     <td>Description</td>
2285
   *                                     </tr>
2286
   *                                     <tr valign="top">
2287
   *                                     <td>
2288
   *                                     FILE_USE_INCLUDE_PATH
2289
   *                                     </td>
2290
   *                                     <td>
2291
   *                                     Search for filename in the include directory.
2292
   *                                     See include_path for more
2293
   *                                     information.
2294
   *                                     </td>
2295
   *                                     </tr>
2296
   *                                     <tr valign="top">
2297
   *                                     <td>
2298
   *                                     FILE_TEXT
2299
   *                                     </td>
2300
   *                                     <td>
2301
   *                                     As of PHP 6, the default encoding of the read
2302
   *                                     data is UTF-8. You can specify a different encoding by creating a
2303
   *                                     custom context or by changing the default using
2304
   *                                     stream_default_encoding. This flag cannot be
2305
   *                                     used with FILE_BINARY.
2306
   *                                     </td>
2307
   *                                     </tr>
2308
   *                                     <tr valign="top">
2309
   *                                     <td>
2310
   *                                     FILE_BINARY
2311
   *                                     </td>
2312
   *                                     <td>
2313
   *                                     With this flag, the file is read in binary mode. This is the default
2314
   *                                     setting and cannot be used with FILE_TEXT.
2315
   *                                     </td>
2316
   *                                     </tr>
2317
   *                                     </table>
2318
   *                                     </p>
2319
   * @param resource|null $context       [optional] <p>
2320
   *                                     A valid context resource created with
2321
   *                                     stream_context_create. If you don't need to use a
2322
   *                                     custom context, you can skip this parameter by &null;.
2323
   *                                     </p>
2324
   * @param int|null      $offset        [optional] <p>
2325
   *                                     The offset where the reading starts.
2326
   *                                     </p>
2327
   * @param int|null      $maxlen        [optional] <p>
2328
   *                                     Maximum length of data read. The default is to read until end
2329
   *                                     of file is reached.
2330
   *                                     </p>
2331
   * @param int           $timeout
2332
   *
2333
   * @param boolean       $convertToUtf8 WARNING: maybe you can't use this option for images or pdf, because they used
2334
   *                                     non default utf-8 chars
2335
   *
2336
   * @return string|false The function returns the read data or false on failure.
2337
   */
2338 2
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // normalize the input
    $timeout = (int)$timeout;
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    // only build a http-context with the timeout if the caller did not pass a context
    if ($context === null && $timeout) {
      $context = stream_context_create(
          array(
              'http' => array(
                  'timeout' => $timeout,
              ),
          )
      );
    }

    // $maxlen is only forwarded to the native function if it is a real integer
    $data = is_int($maxlen)
        ? file_get_contents($filename, $flags, $context, $offset, $maxlen)
        : file_get_contents($filename, $flags, $context, $offset);

    // reading failed: hand the "false" through to the caller
    if ($data === false) {
      return false;
    }

    // raw mode: return the bytes exactly as they were read
    if ($convertToUtf8 !== true) {
      return $data;
    }

    self::checkForSupport();

    // convert to UTF-8 (unforced) and clean the result afterwards
    return self::cleanup(self::encode('UTF-8', $data, false));
  }
2376
2377
  /**
2378
   * Checks if a file starts with BOM (Byte Order Mark) character.
2379
   *
2380
   * @param    string $file_path Path to a valid file.
2381
   *
2382
   * @return   bool True if the file has BOM at the start, False otherwise.
2383
   */
2384 1
  public static function file_has_bom($file_path)
  {
    $content = file_get_contents($file_path);

    // An unreadable file cannot be inspected, so report "no BOM" explicitly
    // instead of handing the boolean failure value to the string check.
    if ($content === false) {
      return false;
    }

    return self::string_has_bom($content);
  }
2388
2389
  /**
2390
   * Normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
2391
   *
2392
   * @param mixed  $var
2393
   * @param int    $normalization_form
2394
   * @param string $leading_combining
2395
   *
2396
   * @return mixed
2397
   */
2398 9
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    switch (gettype($var)) {
      case 'array':
        // filter every element recursively, keeping the keys
        foreach ($var as $key => $value) {
          /** @noinspection AlterInForeachInspection */
          $var[$key] = self::filter($value, $normalization_form, $leading_combining);
        }
        break;

      case 'object':
        // filter every iterable property recursively
        foreach ($var as $property => $value) {
          $var->{$property} = self::filter($value, $normalization_form, $leading_combining);
        }
        break;

      case 'string':
        if (strpos($var, "\r") !== false) {
          // Workaround https://bugs.php.net/65732
          $var = str_replace(array("\r\n", "\r"), "\n", $var);
        }

        // strings without any high byte need no further handling
        if (!preg_match('/[\x80-\xFF]/', $var)) {
          break;
        }

        // '-' marks "was already normalized"
        $normalized = '-';
        if (!\Normalizer::isNormalized($var, $normalization_form)) {
          $normalized = \Normalizer::normalize($var, $normalization_form);

          // fall back to a re-encoding when the normalizer gave no usable result
          $var = isset($normalized[0]) ? $normalized : self::encode('UTF-8', $var);
        }

        $startsWithCombiningMark = $var[0] >= "\x80"
                                   && isset($normalized[0], $leading_combining[0])
                                   && preg_match('/^\p{Mn}/u', $var);

        if ($startsWithCombiningMark) {
          // Prevent leading combining chars
          // for NFC-safe concatenations.
          $var = $leading_combining . $var;
        }
        break;
    }

    return $var;
  }
2441
2442
  /**
2443
   * "filter_input()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
2444
   *
2445
   * @param int    $type
2446
   * @param string $var
2447
   * @param int    $filter
2448
   * @param mixed  $option
2449
   *
2450
   * @return mixed
2451
   */
2452 View Code Duplication
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    // forward the options only when the caller really passed a fourth argument
    $filtered = func_num_args() >= 4
        ? filter_input($type, $var, $filter, $option)
        : filter_input($type, $var, $filter);

    return self::filter($filtered);
  }
2462
2463
  /**
2464
   * "filter_input_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
2465
   *
2466
   * @param int   $type
2467
   * @param mixed $definition
2468
   * @param bool  $add_empty
2469
   *
2470
   * @return mixed
2471
   */
2472 View Code Duplication
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    // with a single argument the native defaults are used untouched
    $filtered = func_num_args() >= 2
        ? filter_input_array($type, $definition, $add_empty)
        : filter_input_array($type);

    return self::filter($filtered);
  }
2482
2483
  /**
2484
   * "filter_var()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
2485
   *
2486
   * @param mixed $var
2487
   * @param int   $filter
2488
   * @param mixed $option
2489
   *
2490
   * @return mixed
2491
   */
2492 1 View Code Duplication
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    // forward the options only when the caller really passed a third argument
    $filtered = func_num_args() >= 3
        ? filter_var($var, $filter, $option)
        : filter_var($var, $filter);

    return self::filter($filtered);
  }
2502
2503
  /**
2504
   * "filter_var_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
2505
   *
2506
   * @param array $data
2507
   * @param mixed $definition
2508
   * @param bool  $add_empty
2509
   *
2510
   * @return mixed
2511
   */
2512 1 View Code Duplication
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    // with a single argument the native defaults are used untouched
    $filtered = func_num_args() >= 2
        ? filter_var_array($data, $definition, $add_empty)
        : filter_var_array($data);

    return self::filter($filtered);
  }
2522
2523
  /**
2524
   * Check if the number of unicode characters is not more than the specified integer.
2525
   *
2526
   * @param    string $str      The original string to be checked.
2527
   * @param    int    $box_size The size in number of chars to be checked against string.
2528
   *
2529
   * @return   bool true if string is less than or equal to $box_size, false otherwise.
2530
   */
2531 1
  public static function fits_inside($str, $box_size)
  {
    // length counted by the class' own strlen(), not the native byte count
    $length = self::strlen($str);

    return $length <= $box_size;
  }
2535
2536
  /**
2537
   * Try to fix simple broken UTF-8 strings.
2538
   *
2539
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
2540
   *
2541
   * @param string $str
2542
   *
2543
   * @return string
2544
   */
2545 7
  public static function fix_simple_utf8($str)
  {
    // search/replace lists, built once per request from the class' lookup table
    static $brokenSequences = null;
    static $fixedSequences = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($brokenSequences === null) {
      $brokenSequences = array_keys(self::$brokenUtf8ToUtf8);
      $fixedSequences = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($brokenSequences, $fixedSequences, $str);
  }
2563
2564
  /**
2565
   * Fix a double (or multiple) encoded UTF8 string.
2566
   *
2567
   * @param string|string[] $str You can use a string or an array of strings.
2568
   *
2569
   * @return mixed
2570
   */
2571 1
  public static function fix_utf8($str)
  {
    if (!is_array($str)) {
      // decode + re-encode until the value no longer changes
      $previous = '';
      while ($previous !== $str) {
        $previous = $str;
        $str = self::to_utf8(self::utf8_decode($str));
      }

      return $str;
    }

    // arrays are fixed element by element, keys are kept
    foreach ($str as $key => $value) {
      /** @noinspection AlterInForeachInspection */
      /** @noinspection OffsetOperationsInspection */
      $str[$key] = self::fix_utf8($value);
    }

    return $str;
  }
2592
2593
  /**
2594
   * Get the text direction of a specific character.
2595
   *
2596
   * @param   string $char Character.
2597
   *
2598
   * @return  string 'RTL' or 'LTR'
2599
   */
2600 1
  public static function getCharDirection($char)
  {
    // init
    self::checkForSupport();

    if (self::$support['intlChar'] === true) {
      $icuDirection = \IntlChar::charDirection($char);

      // direction codes from "IntlChar"-Class
      $directionCodes = array(
          'LTR' => array(0, 11, 12, 20),
          'RTL' => array(1, 13, 14, 15, 21),
      );

      foreach ($directionCodes as $label => $codes) {
        if (in_array($icuDirection, $codes, true)) {
          return $label;
        }
      }

      // any other code falls through to the manual lookup below
    }

    $c = static::chr_to_decimal($char);

    // everything outside of the overall RTL window is LTR
    if (0x5be > $c || 0x10b7f < $c) {
      return 'LTR';
    }

    // Pick the lookup table for the part of the window the code point is in.
    // Plain integers are single code points, arrays are inclusive ranges.
    if (0x85e >= $c) {

      $rtlEntries = array(
          0x5be,
          0x5c0,
          0x5c3,
          0x5c6,
          array(0x5d0, 0x5ea),
          array(0x5f0, 0x5f4),
          0x608,
          0x60b,
          0x60d,
          0x61b,
          array(0x61e, 0x64a),
          array(0x66d, 0x66f),
          array(0x671, 0x6d5),
          array(0x6e5, 0x6e6),
          array(0x6ee, 0x6ef),
          array(0x6fa, 0x70d),
          0x710,
          array(0x712, 0x72f),
          array(0x74d, 0x7a5),
          0x7b1,
          array(0x7c0, 0x7ea),
          array(0x7f4, 0x7f5),
          0x7fa,
          array(0x800, 0x815),
          0x81a,
          0x824,
          0x828,
          array(0x830, 0x83e),
          array(0x840, 0x858),
          0x85e,
      );

    } elseif (0x200f === $c) {

      return 'RTL';

    } elseif (0xfb1d <= $c) {

      $rtlEntries = array(
          0xfb1d,
          array(0xfb1f, 0xfb28),
          array(0xfb2a, 0xfb36),
          array(0xfb38, 0xfb3c),
          0xfb3e,
          array(0xfb40, 0xfb41),
          array(0xfb43, 0xfb44),
          array(0xfb46, 0xfbc1),
          array(0xfbd3, 0xfd3d),
          array(0xfd50, 0xfd8f),
          array(0xfd92, 0xfdc7),
          array(0xfdf0, 0xfdfc),
          array(0xfe70, 0xfe74),
          array(0xfe76, 0xfefc),
          array(0x10800, 0x10805),
          0x10808,
          array(0x1080a, 0x10835),
          array(0x10837, 0x10838),
          0x1083c,
          array(0x1083f, 0x10855),
          array(0x10857, 0x1085f),
          array(0x10900, 0x1091b),
          array(0x10920, 0x10939),
          0x1093f,
          0x10a00,
          array(0x10a10, 0x10a13),
          array(0x10a15, 0x10a17),
          array(0x10a19, 0x10a33),
          array(0x10a40, 0x10a47),
          array(0x10a50, 0x10a58),
          array(0x10a60, 0x10a7f),
          array(0x10b00, 0x10b35),
          array(0x10b40, 0x10b55),
          array(0x10b58, 0x10b72),
          array(0x10b78, 0x10b7f),
      );

    } else {

      return 'LTR';

    }

    foreach ($rtlEntries as $entry) {
      if (is_array($entry)) {
        if ($entry[0] <= $c && $entry[1] >= $c) {
          return 'RTL';
        }
      } elseif ($entry === $c) {
        return 'RTL';
      }
    }

    return 'LTR';
  }
2711
2712
  /**
2713
   * get data from "/data/*.php"
2714
   *
2715
   * @param string $file
2716
   *
2717
   * @return bool|string|array|int false on error
2718
   */
2719 1
  protected static function getData($file)
  {
    // data files live next to this class, one PHP file per data set
    $path = __DIR__ . '/data/' . $file . '.php';

    if (!file_exists($path)) {
      return false;
    }

    // the included file's return value is handed back as-is
    /** @noinspection PhpIncludeInspection */
    return require $path;
  }
2729
2730
  /**
2731
   * Converts hexadecimal U+xxxx code point representation to integer.
2732
   *
2733
   * INFO: opposite to UTF8::int_to_hex()
2734
   *
2735
   * @param    string $str The hexadecimal code point representation.
2736
   *
2737
   * @return   int|false The code point, or false on failure.
2738
   */
2739 2
  public static function hex_to_int($str)
  {
    // Optional "\u" or "U+" prefix, followed by 4 to 6 hexadecimal digits.
    // Only 0-9 and a-f are valid digits: other letters would be silently
    // truncated by intval() and yield a wrong code point instead of false.
    if (preg_match('/^(?:\\\u|U\+|)([a-f0-9]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
2747
2748
  /**
2749
   * alias for "UTF8::html_entity_decode()"
2750
   *
2751
   * @see UTF8::html_entity_decode()
2752
   *
2753
   * @param string $str
2754
   * @param int    $flags
2755
   * @param string $encoding
2756
   *
2757
   * @return string
2758
   */
2759 1
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    // pure alias: all arguments are handed over unchanged
    $decoded = self::html_entity_decode($str, $flags, $encoding);

    return $decoded;
  }
2763
2764
  /**
2765
   * Converts a UTF-8 string to a series of HTML numbered entities.
2766
   *
2767
   * INFO: opposite to UTF8::html_decode()
2768
   *
2769
   * @param  string $str            The Unicode string to be encoded as numbered entities.
2770
   * @param  bool   $keepAsciiChars Keep ASCII chars.
2771
   * @param  string $encoding
2772
   *
2773
   * @return string HTML numbered entities.
2774
   */
2775 2
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // first code point that gets converted: skip ASCII when requested
      $startCode = 0x00;
      if ($keepAsciiChars === true) {
        $startCode = 0x80;
      }

      $encoding = self::normalizeEncoding($encoding);

      // convmap = (start, end, offset, mask): cover the whole Unicode range up
      // to U+10FFFF (21 bits), so characters outside of the BMP, e.g. emoji,
      // are converted as well instead of being passed through untouched.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff,),
          $encoding
      );
    }

    // fallback without "mbstring": encode character by character
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
2803
2804
  /**
2805
   * UTF-8 version of html_entity_decode()
2806
   *
2807
   * The reason we are not using html_entity_decode() by itself is because
2808
   * while it is not technically correct to leave out the semicolon
2809
   * at the end of an entity most browsers will still interpret the entity
2810
   * correctly. html_entity_decode() does not convert entities without
2811
   * semicolons, so we are left with our own little solution here. Bummer.
2812
   *
2813
   * Convert all HTML entities to their applicable characters
2814
   *
2815
   * INFO: opposite to UTF8::html_encode()
2816
   *
2817
   * @link http://php.net/manual/en/function.html-entity-decode.php
2818
   *
2819
   * @param string $str      <p>
2820
   *                         The input string.
2821
   *                         </p>
2822
   * @param int    $flags    [optional] <p>
2823
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
2824
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
2825
   *                         <table>
2826
   *                         Available <i>flags</i> constants
2827
   *                         <tr valign="top">
2828
   *                         <td>Constant Name</td>
2829
   *                         <td>Description</td>
2830
   *                         </tr>
2831
   *                         <tr valign="top">
2832
   *                         <td><b>ENT_COMPAT</b></td>
2833
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
2834
   *                         </tr>
2835
   *                         <tr valign="top">
2836
   *                         <td><b>ENT_QUOTES</b></td>
2837
   *                         <td>Will convert both double and single quotes.</td>
2838
   *                         </tr>
2839
   *                         <tr valign="top">
2840
   *                         <td><b>ENT_NOQUOTES</b></td>
2841
   *                         <td>Will leave both double and single quotes unconverted.</td>
2842
   *                         </tr>
2843
   *                         <tr valign="top">
2844
   *                         <td><b>ENT_HTML401</b></td>
2845
   *                         <td>
2846
   *                         Handle code as HTML 4.01.
2847
   *                         </td>
2848
   *                         </tr>
2849
   *                         <tr valign="top">
2850
   *                         <td><b>ENT_XML1</b></td>
2851
   *                         <td>
2852
   *                         Handle code as XML 1.
2853
   *                         </td>
2854
   *                         </tr>
2855
   *                         <tr valign="top">
2856
   *                         <td><b>ENT_XHTML</b></td>
2857
   *                         <td>
2858
   *                         Handle code as XHTML.
2859
   *                         </td>
2860
   *                         </tr>
2861
   *                         <tr valign="top">
2862
   *                         <td><b>ENT_HTML5</b></td>
2863
   *                         <td>
2864
   *                         Handle code as HTML 5.
2865
   *                         </td>
2866
   *                         </tr>
2867
   *                         </table>
2868
   *                         </p>
2869
   * @param string $encoding [optional] <p>
2870
   *                         Encoding to use.
2871
   *                         </p>
2872
   *
2873
   * @return string the decoded string.
2874
   */
2875 17
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // nothing to decode in an empty string ...
    if ($str === '') {
      return '';
    }

    // ... or in a string without any ampersand
    if (strpos($str, '&') === false) {
      return $str;
    }

    self::checkForSupport();

    $encoding = self::normalizeEncoding($encoding);

    if ($flags === null) {
      // ENT_HTML5 is only referenced when the running PHP version has it
      $flags = (Bootup::is_php('5.4') === true) ? (ENT_COMPAT | ENT_HTML5) : ENT_COMPAT;
    }

    // decodes one decimal entity, but leaves encoded quotes untouched
    $decodeDecimalEntity = function ($matches) {
      $decoded = \mb_convert_encoding($matches[0], 'UTF-8', 'HTML-ENTITIES');

      if ($decoded === '"' || $decoded === "'") {
        return $matches[0];
      }

      return $decoded;
    };

    // repeat until a pass changes nothing, so nested encodings are resolved too
    do {
      $before = $str;

      $str = preg_replace_callback("/&#\d{2,5};/", $decodeDecimalEntity, $str);

      // decode numeric & UTF16 two byte entities
      // (entities without the trailing semicolon get one appended first)
      $withSemicolons = preg_replace('/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS', '$1;', $str);
      $str = html_entity_decode($withSemicolons, $flags, $encoding);

    } while ($before !== $str);

    return $str;
  }
2927
2928
  /**
2929
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
2930
   *
2931
   * @link http://php.net/manual/en/function.htmlentities.php
2932
   *
2933
   * @param string $str           <p>
2934
   *                              The input string.
2935
   *                              </p>
2936
   * @param int    $flags         [optional] <p>
2937
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2938
   *                              invalid code unit sequences and the used document type. The default is
2939
   *                              ENT_COMPAT | ENT_HTML401.
2940
   *                              <table>
2941
   *                              Available <i>flags</i> constants
2942
   *                              <tr valign="top">
2943
   *                              <td>Constant Name</td>
2944
   *                              <td>Description</td>
2945
   *                              </tr>
2946
   *                              <tr valign="top">
2947
   *                              <td><b>ENT_COMPAT</b></td>
2948
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2949
   *                              </tr>
2950
   *                              <tr valign="top">
2951
   *                              <td><b>ENT_QUOTES</b></td>
2952
   *                              <td>Will convert both double and single quotes.</td>
2953
   *                              </tr>
2954
   *                              <tr valign="top">
2955
   *                              <td><b>ENT_NOQUOTES</b></td>
2956
   *                              <td>Will leave both double and single quotes unconverted.</td>
2957
   *                              </tr>
2958
   *                              <tr valign="top">
2959
   *                              <td><b>ENT_IGNORE</b></td>
2960
   *                              <td>
2961
   *                              Silently discard invalid code unit sequences instead of returning
2962
   *                              an empty string. Using this flag is discouraged as it
2963
   *                              may have security implications.
2964
   *                              </td>
2965
   *                              </tr>
2966
   *                              <tr valign="top">
2967
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2968
   *                              <td>
2969
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2970
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2971
   *                              </td>
2972
   *                              </tr>
2973
   *                              <tr valign="top">
2974
   *                              <td><b>ENT_DISALLOWED</b></td>
2975
   *                              <td>
2976
   *                              Replace invalid code points for the given document type with a
2977
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2978
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2979
   *                              instance, to ensure the well-formedness of XML documents with
2980
   *                              embedded external content.
2981
   *                              </td>
2982
   *                              </tr>
2983
   *                              <tr valign="top">
2984
   *                              <td><b>ENT_HTML401</b></td>
2985
   *                              <td>
2986
   *                              Handle code as HTML 4.01.
2987
   *                              </td>
2988
   *                              </tr>
2989
   *                              <tr valign="top">
2990
   *                              <td><b>ENT_XML1</b></td>
2991
   *                              <td>
2992
   *                              Handle code as XML 1.
2993
   *                              </td>
2994
   *                              </tr>
2995
   *                              <tr valign="top">
2996
   *                              <td><b>ENT_XHTML</b></td>
2997
   *                              <td>
2998
   *                              Handle code as XHTML.
2999
   *                              </td>
3000
   *                              </tr>
3001
   *                              <tr valign="top">
3002
   *                              <td><b>ENT_HTML5</b></td>
3003
   *                              <td>
3004
   *                              Handle code as HTML 5.
3005
   *                              </td>
3006
   *                              </tr>
3007
   *                              </table>
3008
   *                              </p>
3009
   * @param string $encoding      [optional] <p>
3010
   *                              Like <b>htmlspecialchars</b>,
3011
   *                              <b>htmlentities</b> takes an optional third argument
3012
   *                              <i>encoding</i> which defines encoding used in
3013
   *                              conversion.
3014
   *                              Although this argument is technically optional, you are highly
3015
   *                              encouraged to specify the correct value for your code.
3016
   *                              </p>
3017
   * @param bool   $double_encode [optional] <p>
3018
   *                              When <i>double_encode</i> is turned off PHP will not
3019
   *                              encode existing html entities. The default is to convert everything.
3020
   *                              </p>
3021
   *
3022
   *
3023
   * @return string the encoded string.
3024
   * </p>
3025
   * <p>
3026
   * If the input <i>string</i> contains an invalid code unit
3027 2
   * sequence within the given <i>encoding</i> an empty string
3028
   * will be returned, unless either the <b>ENT_IGNORE</b> or
3029 2
   * <b>ENT_SUBSTITUTE</b> flags are set.
3030
   */
3031 2
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    $encoding = self::normalizeEncoding($encoding);

    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // the extra numeric-entity pass below is applied to UTF-8 only
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // Replace every remaining character of three or more bytes
    // with its numeric entity; each distinct character is encoded once.
    $byteLengths = self::chr_size_list($str);
    $search = array();
    $replacements = array();
    foreach ($byteLengths as $counter => $byteLength) {
      if ($byteLength < 3) {
        continue;
      }

      $char = self::access($str, $counter);

      // access() signals a failed lookup with false: skip it instead of
      // using it as an array key and handing it to html_encode()
      if ($char === false || isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $str);
  }
3057
3058
  /**
3059
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
3060
   *
3061
   * INFO: Take a look at "UTF8::htmlentities()"
3062
   *
3063
   * @link http://php.net/manual/en/function.htmlspecialchars.php
3064
   *
3065
   * @param string $str           <p>
3066
   *                              The string being converted.
3067
   *                              </p>
3068
   * @param int    $flags         [optional] <p>
3069
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
3070
   *                              invalid code unit sequences and the used document type. The default is
3071
   *                              ENT_COMPAT | ENT_HTML401.
3072
   *                              <table>
3073
   *                              Available <i>flags</i> constants
3074
   *                              <tr valign="top">
3075
   *                              <td>Constant Name</td>
3076
   *                              <td>Description</td>
3077
   *                              </tr>
3078
   *                              <tr valign="top">
3079
   *                              <td><b>ENT_COMPAT</b></td>
3080
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
3081
   *                              </tr>
3082
   *                              <tr valign="top">
3083
   *                              <td><b>ENT_QUOTES</b></td>
3084
   *                              <td>Will convert both double and single quotes.</td>
3085
   *                              </tr>
3086
   *                              <tr valign="top">
3087
   *                              <td><b>ENT_NOQUOTES</b></td>
3088
   *                              <td>Will leave both double and single quotes unconverted.</td>
3089
   *                              </tr>
3090
   *                              <tr valign="top">
3091
   *                              <td><b>ENT_IGNORE</b></td>
3092
   *                              <td>
3093
   *                              Silently discard invalid code unit sequences instead of returning
3094
   *                              an empty string. Using this flag is discouraged as it
3095
   *                              may have security implications.
3096
   *                              </td>
3097
   *                              </tr>
3098
   *                              <tr valign="top">
3099
   *                              <td><b>ENT_SUBSTITUTE</b></td>
3100
   *                              <td>
3101
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
3102
   *                              U+FFFD (UTF-8) or &amp;#xFFFD; (otherwise) instead of returning an empty string.
3103
   *                              </td>
3104
   *                              </tr>
3105
   *                              <tr valign="top">
3106
   *                              <td><b>ENT_DISALLOWED</b></td>
3107
   *                              <td>
3108
   *                              Replace invalid code points for the given document type with a
3109
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &amp;#xFFFD;
3110
   *                              (otherwise) instead of leaving them as is. This may be useful, for
3111
   *                              instance, to ensure the well-formedness of XML documents with
3112
   *                              embedded external content.
3113
   *                              </td>
3114
   *                              </tr>
3115
   *                              <tr valign="top">
3116
   *                              <td><b>ENT_HTML401</b></td>
3117
   *                              <td>
3118
   *                              Handle code as HTML 4.01.
3119
   *                              </td>
3120
   *                              </tr>
3121
   *                              <tr valign="top">
3122
   *                              <td><b>ENT_XML1</b></td>
3123
   *                              <td>
3124
   *                              Handle code as XML 1.
3125
   *                              </td>
3126
   *                              </tr>
3127
   *                              <tr valign="top">
3128
   *                              <td><b>ENT_XHTML</b></td>
3129
   *                              <td>
3130
   *                              Handle code as XHTML.
3131
   *                              </td>
3132
   *                              </tr>
3133
   *                              <tr valign="top">
3134
   *                              <td><b>ENT_HTML5</b></td>
3135
   *                              <td>
3136
   *                              Handle code as HTML 5.
3137
   *                              </td>
3138
   *                              </tr>
3139
   *                              </table>
3140
   *                              </p>
3141
   * @param string $encoding      [optional] <p>
3142
   *                              Defines encoding used in conversion.
3143
   *                              </p>
3144
   *                              <p>
3145
   *                              For the purposes of this function, the encodings
3146
   *                              ISO-8859-1, ISO-8859-15,
3147
   *                              UTF-8, cp866,
3148
   *                              cp1251, cp1252, and
3149
   *                              KOI8-R are effectively equivalent, provided the
3150
   *                              <i>string</i> itself is valid for the encoding, as
3151
   *                              the characters affected by <b>htmlspecialchars</b> occupy
3152
   *                              the same positions in all of these encodings.
3153
   *                              </p>
3154
   * @param bool   $double_encode [optional] <p>
3155
   *                              When <i>double_encode</i> is turned off PHP will not
3156
   *                              encode existing html entities, the default is to convert everything.
3157
   *                              </p>
3158
   *
3159
   * @return string The converted string.
3160
   * </p>
3161
   * <p>
3162
   * If the input <i>string</i> contains an invalid code unit
3163 1
   * sequence within the given <i>encoding</i> an empty string
3164
   * will be returned, unless either the <b>ENT_IGNORE</b> or
3165 1
   * <b>ENT_SUBSTITUTE</b> flags are set.
3166
   */
3167 1
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // Delegate to PHP's htmlspecialchars(), passing the encoding through
    // self::normalizeEncoding() first.
    return htmlspecialchars(
        $str,
        $flags,
        self::normalizeEncoding($encoding),
        $double_encode
    );
  }
3173
3174
  /**
   * Reports whether the "iconv" extension is loaded.
   *
   * @return bool true if extension_loaded('iconv') reports the extension, false otherwise
   */
  public static function iconv_loaded()
  {
    if (extension_loaded('iconv')) {
      return true;
    }

    return false;
  }
3183
3184
  /**
   * Converts an integer to its hexadecimal code point representation (e.g. "U+0041").
   *
   * The hex digits are lowercase (as produced by dechex()) and left-padded
   * with zeros to at least four digits.
   *
   * INFO: opposite to UTF8::hex_to_int()
   *
   * @param    int    $int  The integer to be converted to hexadecimal code point.
   * @param    string $pfix The prefix placed in front of the hex digits.
   *
   * @return   string The code point, or empty string on failure.
   */
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // Anything that is not a plain run of decimal digits (negative numbers,
    // floats, empty strings, ...) is rejected.
    if (!ctype_digit((string)$int)) {
      return '';
    }

    return $pfix . str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);
  }
3206
3207
  /**
   * Checks whether the "IntlChar" class can be used.
   *
   * Both conditions must hold: Bootup::is_php('7.0') returns true and
   * class_exists('IntlChar') succeeds.
   *
   * @return   bool True if available, False otherwise
   */
  public static function intlChar_loaded()
  {
    // "&&" instead of the low-precedence "and": same result here, but it does
    // not silently change meaning if this expression is ever assigned.
    return Bootup::is_php('7.0') === true && class_exists('IntlChar');
  }
3216
3217
  /**
   * Reports whether the "intl" extension is loaded.
   *
   * @return bool true if extension_loaded('intl') reports the extension, false otherwise
   */
  public static function intl_loaded()
  {
    if (extension_loaded('intl')) {
      return true;
    }

    return false;
  }
3226
3227
  /**
   * Alias of "UTF8::is_ascii()".
   *
   * @see UTF8::is_ascii()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_ascii().
   *
   * @return bool The result of UTF8::is_ascii().
   */
  public static function isAscii($str)
  {
    $result = self::is_ascii($str);

    return $result;
  }
3240
3241
  /**
   * Alias of "UTF8::is_base64()".
   *
   * @see UTF8::is_base64()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_base64().
   *
   * @return bool The result of UTF8::is_base64().
   */
  public static function isBase64($str)
  {
    $result = self::is_base64($str);

    return $result;
  }
3254
3255
  /**
   * Alias of "UTF8::is_binary()".
   *
   * @see UTF8::is_binary()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_binary().
   *
   * @return bool The result of UTF8::is_binary().
   */
  public static function isBinary($str)
  {
    $result = self::is_binary($str);

    return $result;
  }
3268
3269
  /**
   * Alias of "UTF8::is_bom()".
   *
   * @see UTF8::is_bom()
   *
   * @param string $utf8_chr The value forwarded unchanged to UTF8::is_bom().
   *
   * @return bool The result of UTF8::is_bom().
   */
  public static function isBom($utf8_chr)
  {
    $result = self::is_bom($utf8_chr);

    return $result;
  }
3282
3283
  /**
   * Alias of "UTF8::is_html()".
   *
   * @see UTF8::is_html()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_html().
   *
   * @return bool The result of UTF8::is_html().
   */
  public static function isHtml($str)
  {
    $result = self::is_html($str);

    return $result;
  }
3296
3297
  /**
   * Alias of "UTF8::is_json()".
   *
   * @see UTF8::is_json()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_json().
   *
   * @return bool The result of UTF8::is_json().
   */
  public static function isJson($str)
  {
    $result = self::is_json($str);

    return $result;
  }
3310
3311
  /**
   * Alias of "UTF8::is_utf16()".
   *
   * @see UTF8::is_utf16()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_utf16().
   *
   * @return int|false false if it's not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.
   */
  public static function isUtf16($str)
  {
    $result = self::is_utf16($str);

    return $result;
  }
3324
3325
  /**
   * Alias of "UTF8::is_utf32()".
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str The value forwarded unchanged to UTF8::is_utf32().
   *
   * @return int|false false if it's not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.
   */
  public static function isUtf32($str)
  {
    $result = self::is_utf32($str);

    return $result;
  }
3338
3339
  /**
   * Alias of "UTF8::is_utf8()".
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    The value forwarded unchanged to UTF8::is_utf8().
   * @param bool   $strict Forwarded unchanged as the second argument.
   *
   * @return bool The result of UTF8::is_utf8().
   */
  public static function isUtf8($str, $strict = false)
  {
    $result = self::is_utf8($str, $strict);

    return $result;
  }
3353
3354
  /**
   * Checks if a string is 7 bit ASCII, i.e. contains no byte in the range 0x80-0xFF.
   *
   * @param    string $str The string to check.
   *
   * @return   bool <strong>true</strong> if it is ASCII<br />
   *                <strong>false</strong> otherwise
   */
  public static function is_ascii($str)
  {
    // Truthy only when at least one high byte was found.
    $hasHighByte = preg_match('/[\x80-\xFF]/', $str);

    return !$hasHighByte;
  }
3366
3367
  /**
   * Returns true if the string is base64 encoded, false otherwise.
   *
   * The check is a round trip: the input is strictly decoded and encoded
   * again, and must come back byte-for-byte identical. An empty string is
   * never considered base64.
   *
   * @param string $str
   *
   * @return bool Whether or not $str is base64 encoded
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $roundTrip = base64_encode(base64_decode($str, true));

    return $roundTrip === $str;
  }
3388
3389
  /**
   * Heuristically checks whether the input looks like binary data.
   *
   * The input is reported as binary when it consists solely of the characters
   * "0" and "1", or when it contains at least one NUL byte.
   *
   * @param mixed $input The value to inspect; it is cast to string first.
   *
   * @return bool <strong>true</strong> if the input looks binary<br />
   *              <strong>false</strong> otherwise
   */
  public static function is_binary($input)
  {
    // Same cast the sibling is_*() checks apply; it also turns a failed
    // read (false) into an empty string.
    $input = (string)$input;

    // NOTE(review): a former third clause compared
    // substr_count($input, '^ -~') / strlen($input) against 0.3. It counted
    // the literal four-byte string "^ -~", so the ratio could never exceed
    // 0.25 and the clause never fired. It was removed as dead code. A real
    // "share of non-printable bytes" heuristic would need a decision on how
    // multi-byte UTF-8 text should be classified.
    return preg_match('~^[01]+$~', $input) === 1
           ||
           substr_count($input, "\x00") > 0;
  }
3413
3414
  /**
   * Check if the file is binary.
   *
   * Reads up to the first 512 bytes of the file and passes them to
   * UTF8::is_binary(). If the file cannot be opened or read, an empty string
   * is checked instead.
   *
   * @param string $file Path of the file to inspect.
   *
   * @return bool The result of UTF8::is_binary() for the bytes that were read.
   */
  public static function is_binary_file($file)
  {
    $block = '';
    $fp = false;

    try {
      $fp = fopen($file, 'rb');

      // fopen() reports failure by returning false, not by throwing.
      if ($fp !== false) {
        $data = fread($fp, 512);
        fclose($fp);
        $fp = false;

        if ($data !== false) {
          $block = $data;
        }
      }
    } catch (\Exception $e) {
      // Reached only when an error handler converts warnings to exceptions.
      $block = '';

      // Do not leak the handle if the failure happened after opening.
      if (is_resource($fp)) {
        fclose($fp);
      }
    }

    return self::is_binary($block);
  }
3433
3434
  /**
   * Checks if the given string is equal to any "Byte Order Mark".
   *
   * The whole string must be identical to one of the keys of self::$bom.
   *
   * WARNING: Use "UTF8::string_has_bom()" if you will check BOM in a string.
   *
   * @param    string $str The input string.
   *
   * @return   bool True if $str is a Byte Order Mark, False otherwise.
   */
  public static function is_bom($str)
  {
    // Strict comparison against every known BOM (the keys of self::$bom).
    return in_array($str, array_keys(self::$bom), true);
  }
3453
3454
  /**
   * Check if the string contains at least one HTML tag (opening, closing or
   * self-closing, with or without attributes).
   *
   * @param string $str
   *
   * @return bool <strong>true</strong> if a tag was found<br />
   *              <strong>false</strong> otherwise, including for an empty string
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $found = array();
    preg_match("/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/", $str, $found);

    // A non-empty match array means the pattern hit at least once.
    return !empty($found);
  }
3480
3481
  /**
   * Try to check if "$str" is a JSON string.
   *
   * The string counts as JSON when UTF8::json_decode() yields an object or an
   * array and json_last_error() reports no error. Bare scalars such as "123"
   * or "true" are still rejected.
   *
   * @param string $str
   *
   * @return bool
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    // A top-level JSON array ("[1,2]") decodes to a PHP array, not an object,
    // so both container types have to be accepted.
    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
3506
3507
  /**
   * Check if the string is UTF-16.
   *
   * Only input that UTF8::is_binary() reports as binary (after BOM removal)
   * is examined. Each byte order is scored separately; the byte order with
   * the higher score wins and a tie means "not UTF-16".
   *
   * @param string $str
   *
   * @return int|false false if it's not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    self::checkForSupport();

    $hits = array();
    foreach (array('UTF-16LE', 'UTF-16BE') as $candidate) {
      $hits[$candidate] = 0;

      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $candidate);
      if (!$asUtf8) {
        continue;
      }

      // A candidate can only score when the converted text survives a
      // UTF-8 -> candidate -> UTF-8 round trip unchanged.
      $backAgain = \mb_convert_encoding($asUtf8, $candidate, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backAgain, 'UTF-8', $candidate);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $roundTripChar => $ignored) {
        if (in_array($roundTripChar, $strChars, true) === true) {
          $hits[$candidate]++;
        }
      }
    }

    if ($hits['UTF-16LE'] === $hits['UTF-16BE']) {
      return false;
    }

    return $hits['UTF-16LE'] > $hits['UTF-16BE'] ? 1 : 2;
  }
3563
3564
  /**
   * Check if the string is UTF-32.
   *
   * Only input that UTF8::is_binary() reports as binary (after BOM removal)
   * is examined. Each byte order is scored separately; the byte order with
   * the higher score wins and a tie means "not UTF-32".
   *
   * @param string $str
   *
   * @return int|false false if it's not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    self::checkForSupport();

    $hits = array();
    foreach (array('UTF-32LE', 'UTF-32BE') as $candidate) {
      $hits[$candidate] = 0;

      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $candidate);
      if (!$asUtf8) {
        continue;
      }

      // A candidate can only score when the converted text survives a
      // UTF-8 -> candidate -> UTF-8 round trip unchanged.
      $backAgain = \mb_convert_encoding($asUtf8, $candidate, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backAgain, 'UTF-8', $candidate);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $roundTripChar => $ignored) {
        if (in_array($roundTripChar, $strChars, true) === true) {
          $hits[$candidate]++;
        }
      }
    }

    if ($hits['UTF-32LE'] === $hits['UTF-32BE']) {
      return false;
    }

    return $hits['UTF-32LE'] > $hits['UTF-32BE'] ? 1 : 2;
  }
3620
3621
  /**
   * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
   *
   * An empty string is considered valid. Overlong encodings, surrogates,
   * code points above U+10FFFF, 5- and 6-byte sequences and sequences that
   * are cut off at the end of the string are all rejected.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param  string $str    The string to be checked.
   * @param  bool   $strict Check also if the string is not UTF-16 or UTF-32.
   *
   * @return bool
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    // NOTE(review): assumes pcre_utf8_support() returns true when PCRE
    // understands the /u modifier - confirm against its definition. The
    // regex shortcut is only meaningful in that case; the condition used to
    // be inverted.
    if (self::pcre_utf8_support() === true) {

      // If even just the first character can be matched, when the /u
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
      // invalid, nothing at all will match, even if the string contains
      // some valid sequences
      return (preg_match('/^.{1}/us', $str) === 1);
    }

    // Fallback: byte-wise state machine that does not depend on PCRE.
    $mState = 0; // cached expected number of octets after the current octet
    // until the beginning of the next UTF8 character sequence
    $mUcs4 = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);

      if ($mState === 0) {
        // When mState is zero we expect either a US-ASCII character or the
        // first octet of a multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          /* First octet of 5 octet sequence.
           *
           * This is illegal because the encoded codepoint must be either
           * (a) not the shortest form or
           * (b) outside the Unicode range of 0-0x10FFFF.
           * Rather than trying to resynchronize, we will carry on until the end
           * of the sequence and let the later error handling code catch it.
           */
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          /* Current octet is neither in the US-ASCII range nor a legal first
           * octet of a multi-octet sequence.
           */
          return false;
        }

        continue;
      }

      // When mState is non-zero, we expect a continuation octet (10xxxxxx)
      // of the current multi-octet sequence.
      if (0x80 !== (0xC0 & $in)) {
        // Incomplete multi-octet sequence.
        return false;
      }

      // Legal continuation: merge its six payload bits into the code point.
      $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);

      if (0 === --$mState) {
        // End of the multi-octet sequence: mUcs4 now holds the complete
        // code point, so check for illegal sequences and code points.
        if (
            // From Unicode 3.1, non-shortest form is illegal
            (2 === $mBytes && $mUcs4 < 0x0080) ||
            (3 === $mBytes && $mUcs4 < 0x0800) ||
            (4 === $mBytes && $mUcs4 < 0x10000) ||
            (4 < $mBytes) ||
            // From Unicode 3.2, surrogate characters are illegal.
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)
        ) {
          return false;
        }

        // initialize UTF8 cache
        $mState = 0;
        $mUcs4 = 0;
        $mBytes = 1;
      }
    }

    // A non-zero state here means the string ended in the middle of a
    // multi-octet sequence, which is not valid UTF-8.
    return $mState === 0;
  }
3765
3766
  /**
3767
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
3768
   * Decodes a JSON string
3769
   *
3770
   * @link http://php.net/manual/en/function.json-decode.php
3771
   *
3772
   * @param string $json    <p>
3773
   *                        The <i>json</i> string being decoded.
3774
   *                        </p>
3775
   *                        <p>
3776
   *                        This function only works with UTF-8 encoded strings.
3777
   *                        </p>
3778
   *                        <p>PHP implements a superset of
3779
   *                        JSON - it will also encode and decode scalar types and <b>NULL</b>. The JSON standard
3780
   *                        only supports these values when they are nested inside an array or an object.
3781
   *                        </p>
3782
   * @param bool   $assoc   [optional] <p>
3783
   *                        When <b>TRUE</b>, returned objects will be converted into
3784
   *                        associative arrays.
3785
   *                        </p>
3786
   * @param int    $depth   [optional] <p>
3787
   *                        User specified recursion depth.
3788
   *                        </p>
3789
   * @param int    $options [optional] <p>
3790
   *                        Bitmask of JSON decode options. Currently only
3791
   *                        <b>JSON_BIGINT_AS_STRING</b>
3792
   *                        is supported (default is to cast large integers as floats)
3793
   *                        </p>
3794
   *
3795
   * @return mixed the value encoded in <i>json</i> in appropriate
3796
   * PHP type. Values true, false and
3797
   * null (case-insensitive) are returned as <b>TRUE</b>, <b>FALSE</b>
3798 2
   * and <b>NULL</b> respectively. <b>NULL</b> is returned if the
3799
   * <i>json</i> cannot be decoded or if the encoded
3800 2
   * data is deeper than the recursion limit.
3801
   */
3802 2
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    // The input is passed through self::filter() before decoding.
    $filtered = self::filter($json);

    // $options is only handed to json_decode() when Bootup::is_php('5.4')
    // returns true.
    if (Bootup::is_php('5.4') !== true) {
      return json_decode($filtered, $assoc, $depth);
    }

    return json_decode($filtered, $assoc, $depth, $options);
  }
3814
3815
  /**
   * Returns the JSON representation of a value.
   *
   * The value is passed through self::filter() first and then encoded with the native json_encode().
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The value being encoded. Can be any type except a resource.
   *                       All string data must be UTF-8 encoded.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask of JSON encode options, e.g. <b>JSON_HEX_QUOT</b>, <b>JSON_HEX_TAG</b>,
   *                       <b>JSON_HEX_AMP</b>, <b>JSON_HEX_APOS</b>, <b>JSON_NUMERIC_CHECK</b>,
   *                       <b>JSON_PRETTY_PRINT</b>, <b>JSON_UNESCAPED_SLASHES</b>, <b>JSON_FORCE_OBJECT</b>,
   *                       <b>JSON_UNESCAPED_UNICODE</b>.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Maximum nesting depth. Only forwarded to the native function on PHP >= 5.5.
   *                       </p>
   *
   * @return string a JSON encoded string on success or <b>FALSE</b> on failure.
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    // Older PHP versions do not know the "$depth" argument, so it is left out there.
    if (!Bootup::is_php('5.5')) {
      return json_encode($filtered, $options);
    }

    return json_encode($filtered, $options, $depth);
  }
3863
3864
  /**
   * Makes string's first char lowercase.
   *
   * @param    string $str The input string
   *
   * @return   string The resulting string
   */
  public static function lcfirst($str)
  {
    // self::substr() can also yield false; cast both parts so that
    // self::strtolower() and the concatenation always operate on strings.
    $firstChar = (string)self::substr($str, 0, 1);
    $remainder = (string)self::substr($str, 1);

    return self::strtolower($firstChar) . $remainder;
  }
3875
3876
  /**
   * Strip whitespace or other characters from beginning of a UTF-8 string.
   *
   * Without a character list, every leading character from the Unicode
   * "separator" (\pZ) and "other" (\pC) categories is removed.
   *
   * @param  string $str   The string to be trimmed
   * @param  string $chars Optional characters to be stripped (the character "0" is a valid list)
   *
   * @return string The string with unwanted characters stripped from the left
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Only a really missing list selects the default behaviour. A plain truthiness
    // check would also discard the legitimate character list "0".
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    if (
        $chars === INF
        ||
        $chars === null
        ||
        $chars === false
        ||
        $chars === ''
        ||
        $chars === array()
    ) {
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
    }

    $charClass = self::rxClass($chars);

    return preg_replace("/^{$charClass}+/u", '', $str);
  }
3901
3902
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * @param    mixed $arg A UTF-8 encoded string or an array of such strings.
   *
   * @return   string The character with the highest code point,
   *                  or an empty string if the input contains no characters.
   */
  public static function max($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codePoints = self::codepoints($arg);

    // The native max() must not be called with an empty list (warning / ValueError).
    if (empty($codePoints)) {
      return '';
    }

    return self::chr(max($codePoints));
  }
3917
3918
  /**
   * Determines the largest byte length used by any single
   * UTF-8 encoded character of the given string.
   *
   * @param  string $str The original Unicode string.
   *
   * @return int Highest value of the list returned by self::chr_size_list(), 0 if that list is empty.
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    if (count($sizes) === 0) {
      return 0;
    }

    return (int)max($sizes);
  }
3935
3936
  /**
   * Checks whether the "mbstring" extension is loaded.
   *
   * Side effect: when the extension is available, the internal
   * mbstring encoding is switched to UTF-8.
   *
   * @return   bool True if available, False otherwise
   */
  public static function mbstring_loaded()
  {
    if (extension_loaded('mbstring') !== true) {
      return false;
    }

    \mb_internal_encoding('UTF-8');

    return true;
  }
3951
3952
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * @param  mixed $arg A UTF-8 encoded string or an array of such strings.
   *
   * @return string The character with the lowest code point,
   *                or an empty string if the input contains no characters.
   */
  public static function min($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codePoints = self::codepoints($arg);

    // The native min() must not be called with an empty list (warning / ValueError).
    if (empty($codePoints)) {
      return '';
    }

    return self::chr(min($codePoints));
  }
3967
3968
  /**
   * Alias of "UTF8::normalize_encoding()".
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding The encoding name to normalize.
   *
   * @return string Whatever UTF8::normalize_encoding() returns for the given name.
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
3981
3982
  /**
   * Normalize the encoding-"name" input.
   *
   * "UTF-8" and every name listed in self::$iconvEncoding are returned untouched.
   * Other names are upper-cased and, if their alphanumeric form is a known alias,
   * mapped to the canonical name. Results are cached per input name.
   *
   * @param  string $encoding e.g.: ISO, UTF8, WINDOWS-1251 etc.
   *
   * @return string|false e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.,
   *                      false if an empty value was given.
   */
  public static function normalize_encoding($encoding)
  {
    static $resolvedNames = array();

    if (!$encoding) {
      return false;
    }

    // Fast paths: names that are already in their final form.
    if ('UTF-8' === $encoding || in_array($encoding, self::$iconvEncoding, true)) {
      return $encoding;
    }

    if (isset($resolvedNames[$encoding])) {
      return $resolvedNames[$encoding];
    }

    // alias (upper-case, without punctuation) => canonical name
    $aliases = array(
        'ISO88591'    => 'ISO-8859-1',
        'ISO8859'     => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        'WIN1252'     => 'ISO-8859-1',
        'WINDOWS1252' => 'ISO-8859-1',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    );

    $upperName = strtoupper($encoding);
    $aliasKey = preg_replace('/[^a-zA-Z0-9\s]/', '', $upperName);

    // Unknown names fall back to their upper-cased form.
    $normalized = empty($aliases[$aliasKey]) ? $upperName : $aliases[$aliasKey];

    $resolvedNames[$encoding] = $normalized;

    return $normalized;
  }
4038
4039
  /**
   * Normalize some MS Word special characters.
   *
   * Every key of self::$utf8MSWord found in the string is replaced by its value.
   *
   * @param string $str The string to be normalized.
   *
   * @return string
   */
  public static function normalize_msword($str)
  {
    // [0] => search strings, [1] => replacements; built once per request.
    static $replacementPair = null;

    if ($replacementPair === null) {
      $replacementPair = array(
          array_keys(self::$utf8MSWord),
          array_values(self::$utf8MSWord),
      );
    }

    return str_replace($replacementPair[0], $replacementPair[1], $str);
  }
4058
4059
  /**
   * Normalize the whitespace.
   *
   * Every entry of self::$whitespaceTable is replaced by a plain space and, unless requested
   * otherwise, every entry of self::$bidiUniCodeControlsTable is removed.
   *
   * @param string $str                     The string to be normalized.
   * @param bool   $keepNonBreakingSpace    Set to true, to keep non-breaking-spaces.
   * @param bool   $keepBidiUnicodeControls Set to true, to keep non-printable (for the web) bidirectional text chars.
   *
   * @return string
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // Only a strict boolean true keeps the NBSP. The cache key is derived from exactly
    // the same test, so that a truthy non-boolean argument can never store a table
    // under the key that is later used for "true" (or the other way around).
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = $keepNbsp ? 1 : 0;

    if (!isset($whitespaces[$cacheKey])) {

      $table = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
4097
4098
  /**
   * Format a number with grouped thousands.
   *
   * The native number_format() is used directly whenever it can handle the separators.
   * Only on PHP < 5.4, where just the first byte of a separator was used, multi-byte
   * separators are inserted afterwards.
   *
   * @param float  $number
   * @param int    $decimals
   * @param string $dec_point
   * @param string $thousands_sep
   *
   * @deprecated
   *
   * @return string
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    // A separator longer than one byte needs the workaround, no matter which of the two it is.
    $hasMultiByteSeparator = isset($thousands_sep[1]) || isset($dec_point[1]);

    if (
        $hasMultiByteSeparator
        &&
        Bootup::is_php('5.4') !== true
    ) {
      // strtr() replaces both placeholders in one pass, so characters of an already
      // inserted separator are never replaced a second time.
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
4135
4136
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * If "IntlChar" support is flagged in self::$support, its result is used whenever it is truthy.
   * Otherwise the code point is computed from (at most) the first four bytes,
   * depending on the value of the first byte.
   *
   * @param  string $chr The character of which to calculate code point.
   *
   * @return int Unicode code point of the given character,<br />
   *         0 if an empty value (anything falsy except "0") was given.
   */
  public static function ord($chr)
  {
    if (!$chr && $chr !== '0') {
      return 0;
    }

    // init
    self::checkForSupport();

    if (self::$support['intlChar'] === true) {
      $intlCodePoint = \IntlChar::ord($chr);
      if ($intlCodePoint) {
        return $intlCodePoint;
      }
    }

    // 1-based list of the byte values of the first (up to) four bytes
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    // first byte >= 0xF0 and four bytes available
    if ($lead >= 0xF0 && isset($bytes[4])) {
      $high = ($lead - 0xF0) << 18;
      $midHigh = ($bytes[2] - 0x80) << 12;
      $midLow = ($bytes[3] - 0x80) << 6;

      return $high + $midHigh + $midLow + $bytes[4] - 0x80;
    }

    // first byte >= 0xE0 and three bytes available
    if ($lead >= 0xE0 && isset($bytes[3])) {
      $high = ($lead - 0xE0) << 12;
      $mid = ($bytes[2] - 0x80) << 6;

      return $high + $mid + $bytes[3] - 0x80;
    }

    // first byte >= 0xC0 and two bytes available
    if ($lead >= 0xC0 && isset($bytes[2])) {
      $high = ($lead - 0xC0) << 6;

      return $high + $bytes[2] - 0x80;
    }

    // everything else: the value of the first byte itself
    return $lead;
  }
4179
4180
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike the native "parse_str()", this method never sets variables in the
   *          current scope; the result is always written into the second parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str     <p>
   *                        The input string; it is passed through UTF8::clean() before parsing.
   *                        </p>
   * @param array  $result  <p>
   *                        The result will be returned into this reference parameter.
   *                        </p>
   *
   * @return bool false if "mb_parse_str()" fails or the result is empty, true otherwise
   */
  public static function parse_str($str, &$result)
  {
    // init
    self::checkForSupport();

    $cleaned = self::clean($str);
    $parsed = \mb_parse_str($cleaned, $result);

    return $parsed !== false && !empty($result);
  }
4211
4212
  /**
   * Checks if the "u" pattern modifier, which enables Unicode support in PCRE, is usable.
   *
   * @return   bool True if support is available, false otherwise
   */
  public static function pcre_utf8_support()
  {
    // An empty pattern with the "u" modifier matches the empty string if (and only if)
    // the modifier is accepted; errors are silenced on purpose.
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $matched = @preg_match('//u', '');

    return (bool)$matched;
  }
4222
4223
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * Each bound is interpreted in this order: decimal digits => code point,
   * hexadecimal digits => code point via UTF8::hex_to_int(), anything else => UTF8::ord().
   *
   * @param  mixed $var1 Numeric or hexadecimal code points, or a UTF-8 character to start from.
   * @param  mixed $var2 Numeric or hexadecimal code points, or a UTF-8 character to end at.
   *
   * @return array The characters of the range, an empty array if a bound is empty or resolves to 0.
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    // Resolve both bounds with the same rules; [0] => start, [1] => end.
    $codePoints = array();
    foreach (array($var1, $var2) as $bound) {
      if (ctype_digit((string)$bound)) {
        $codePoint = (int)$bound;
      } elseif (ctype_xdigit($bound)) {
        $codePoint = (int)self::hex_to_int($bound);
      } else {
        $codePoint = self::ord($bound);
      }

      if (!$codePoint) {
        return array();
      }

      $codePoints[] = $codePoint;
    }

    $toChar = array(
        '\\voku\\helper\\UTF8',
        'chr',
    );

    return array_map($toChar, range($codePoints[0], $codePoints[1]));
  }
4269
4270
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of self::$bom (BOM string => byte length) is checked against the
   * start of the string and cut off if it matches.
   *
   * @param string $str
   *
   * @return string
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $bomString => $bomByteLength) {
      if (0 === strpos($str, $bomString)) {
        // substr() returns false on PHP < 8 if nothing is left after the BOM;
        // the cast keeps the documented string return type.
        $str = (string)substr($str, $bomByteLength);
      }
    }

    return $str;
  }
4287
4288
  /**
   * Alias of "UTF8::remove_bom()".
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str The string that may start with a BOM.
   *
   * @return string Whatever UTF8::remove_bom() returns for the given string.
   */
  public static function removeBOM($str)
  {
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
4301
4302
  /**
   * Removes duplicate occurrences of a string in another string.
   *
   * Every run of directly repeated occurrences of a search string is collapsed
   * into a single occurrence.
   *
   * @param    string       $str  The base string
   * @param    string|array $what String (or list of strings) to search for in the base string;
   *                              any other type leaves the base string untouched
   *
   * @return   string The result string with removed duplicates
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    if (is_string($what)) {
      $what = array($what);
    }

    if (is_array($what)) {
      foreach ($what as $item) {
        // The captured group always holds exactly one occurrence of $item. Using it as
        // replacement (instead of $item itself) prevents items such as "$0" or "\0"
        // from being interpreted as back-references by preg_replace().
        $str = preg_replace('/(' . preg_quote($item, '/') . ')+/', '${1}', $str);
      }
    }

    return $str;
  }
4324
4325
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * @param  string $str
   * @param  bool   $url_encoded Also remove the URL-encoded form ("%00" etc.) of these characters.
   * @param  string $replacement Inserted in place of every removed sequence.
   *
   * @return  string
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // Every control character is covered, except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09).
    if ($url_encoded) {
      // Hex digits of URL-encoded bytes may be upper- or lower-case, hence the "i" modifier.
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
      $non_displayables[] = '/%7f/i'; // url encoded 127
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // Repeat until nothing is replaced any more: removing one sequence
    // can make a new one appear (e.g. "%0%0bb").
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
4358
4359
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     The string to search in.
   * @param string $unknown The text that is inserted for every diamond question mark.
   *
   * @return string
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // escaped and literal form of the character
    $diamonds = array(
        "\xEF\xBF\xBD",
        '�',
    );

    // A string replacement is applied to every entry of the search list.
    return str_replace($diamonds, $unknown, $str);
  }
4381
4382
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * Without a character list, every trailing character from the Unicode
   * "separator" (\pZ) and "other" (\pC) categories is removed.
   *
   * @param    string $str   The string to be trimmed
   * @param    string $chars Optional characters to be stripped (the character "0" is a valid list)
   *
   * @return   string The string with unwanted characters stripped from the right
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Only a really missing list selects the default behaviour. A plain truthiness
    // check would also discard the legitimate character list "0".
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    if (
        $chars === INF
        ||
        $chars === null
        ||
        $chars === false
        ||
        $chars === ''
        ||
        $chars === array()
    ) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    $charClass = self::rxClass($chars);

    return preg_replace("/{$charClass}+$/u", '', $str);
  }
4407
4408
  /**
   * Builds a regex fragment that matches any one of the given characters.
   *
   * Elements returned by UTF8::str_split() are collected in a bracket expression where possible;
   * elements that are three or more bytes long and not exactly one character long become
   * separate alternatives. Results are cached per "$s . $class".
   *
   * @param string $s     The characters to match.
   * @param string $class Initial content of the bracket expression.
   *
   * @return string A bracket expression, or a non-capturing group of alternatives.
   */
  protected static function rxClass($s, $class = '')
  {
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // [0] => content of the bracket expression, [1..n] => additional alternatives
    $parts = array($class);

    foreach (self::str_split($s) as $element) {
      if ('-' === $element) {
        // a dash goes to the front of the bracket expression
        $parts[0] = '-' . $parts[0];
        continue;
      }

      if (!isset($element[2])) {
        // at most two bytes long: quote it for use inside the pattern
        $parts[0] .= preg_quote($element, '/');
        continue;
      }

      if (1 === self::strlen($element)) {
        $parts[0] .= $element;
        continue;
      }

      $parts[] = $element;
    }

    if ($parts[0]) {
      $parts[0] = '[' . $parts[0] . ']';
    }

    $pattern = (1 === count($parts)) ? $parts[0] : '(?:' . implode('|', $parts) . ')';

    $rxClassCache[$cacheKey] = $pattern;

    return $pattern;
  }
4455
4456
  /**
   * Prints every value of self::$support, each followed by a newline and "<br>", e.g. for debugging.
   */
  public static function showSupport()
  {
    foreach (self::$support as $supportValue) {
      echo $supportValue, "\n<br>";
    }
  }
4465
4466
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param    string $char           The Unicode character to be encoded as numbered entity.
   * @param    bool   $keepAsciiChars Keep ASCII chars.
   *
   * @return   string The HTML numbered entity, the unchanged character if it is ASCII and
   *                  $keepAsciiChars is true, or an empty string for an empty value.
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // The character "0" is falsy in PHP but nevertheless a valid character
    // (same guard as in UTF8::ord()).
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
4490
4491
  /**
   * Convert a string to an array of Unicode characters.
   *
   * With PCRE UTF-8 support the string is split by a regex; otherwise the bytes are
   * walked manually and incomplete or malformed sequences are skipped.
   *
   * @param    string  $str       The string to split into array.
   * @param    int     $length    Max character length of each array element.
   * @param    boolean $cleanUtf8 Clean non UTF-8 chars from the string (only applied on the PCRE path).
   *
   * @return   array An array containing chunks of the string.
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    // init
    self::checkForSupport();
    $ret = array();

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      preg_match_all('/./us', $str, $matches);
      if (isset($matches[0])) {
        $ret = $matches[0];
      }
      unset($matches);

    } else {

      // fallback: inspect the bit pattern of each byte

      $byteCount = strlen($str);

      /** @noinspection ForeachInvariantsInspection */
      for ($pos = 0; $pos < $byteCount; $pos++) {
        $lead = $str[$pos];

        // 0xxxxxxx: single byte character
        if (($lead & "\x80") === "\x00") {
          $ret[] = $lead;
          continue;
        }

        // 110xxxxx: two byte sequence
        if ((($lead & "\xE0") === "\xC0") && isset($str[$pos + 1])) {
          if (($str[$pos + 1] & "\xC0") === "\x80") {
            $ret[] = $lead . $str[$pos + 1];

            $pos++;
          }
          continue;
        }

        // 1110xxxx: three byte sequence
        if ((($lead & "\xF0") === "\xE0") && isset($str[$pos + 2])) {
          if ((($str[$pos + 1] & "\xC0") === "\x80") && (($str[$pos + 2] & "\xC0") === "\x80")) {
            $ret[] = $lead . $str[$pos + 1] . $str[$pos + 2];

            $pos += 2;
          }
          continue;
        }

        // 11110xxx: four byte sequence
        if ((($lead & "\xF8") === "\xF0") && isset($str[$pos + 3])) {
          if ((($str[$pos + 1] & "\xC0") === "\x80") && (($str[$pos + 2] & "\xC0") === "\x80") && (($str[$pos + 3] & "\xC0") === "\x80")) {
            $ret[] = $lead . $str[$pos + 1] . $str[$pos + 2] . $str[$pos + 3];

            $pos += 3;
          }
        }
      }
    }

    // group the single characters into chunks of "$length" characters
    if ($length > 1) {
      $ret = array_map('implode', array_chunk($ret, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($ret[0]) && $ret[0] === '') {
      return array();
    }

    return $ret;
  }
4570
4571
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * The checks run in this order: UTF-16 / UTF-32 (for binary strings), ASCII, UTF-8,
   * "\mb_detect_encoding()" with a fixed candidate list, and finally an "iconv()" round-trip
   * for every entry of self::$iconvEncoding.
   *
   * @param string $str
   *
   * @return false|string The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      if (self::is_utf16($str) === 1) {
        return 'UTF-16LE';
      }

      if (self::is_utf16($str) === 2) {
        return 'UTF-16BE';
      }

      if (self::is_utf32($str) === 1) {
        return 'UTF-32LE';
      }

      if (self::is_utf32($str) === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $candidates = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    self::checkForSupport();

    $detected = \mb_detect_encoding($str, $candidates, true);
    if ($detected) {
      return $detected;
    }

    //
    // 5.) check via "iconv()": an encoding matches if converting the string
    //     from the encoding to itself leaves the (hashed) content unchanged
    //

    $originalHash = md5($str);
    foreach (self::$iconvEncoding as $candidate) {
      // INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      $roundTripHash = md5(@iconv($candidate, $candidate, $str));

      if ($roundTripHash === $originalHash) {
        return $candidate;
      }
    }

    return false;
  }
4663
4664
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * Every needle is quoted and turned into a case-insensitive unicode pattern ("/ui"),
   * the replacement text is always inserted literally.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       The needle or an array of needles. Every replacement with search array is
   *                       performed on the result of previous replacement. An empty needle never matches.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement or an array of replacements. Characters like "$" and "\"
   *                       have no special meaning, they are inserted as-is.
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed A string or an array of replacements.
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // build one pattern per needle
    $patterns = array();
    foreach ((array)$search as $key => $needle) {
      $needle = (string)$needle;

      if ($needle === '') {
        // a pattern that can never match: start of the string preceded by a character
        $patterns[$key] = '/^(?<=.)$/';
      } else {
        $patterns[$key] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // "preg_replace()" would interpret "$1", "${1}" and "\1" inside the replacement as back-references,
    // so "$" and "\" are escaped to keep the literal semantic of "str_ireplace()"
    if (is_array($replace)) {
      $replacement = array();
      foreach ($replace as $key => $value) {
        $replacement[$key] = addcslashes((string)$value, '\\$');
      }
    } else {
      $replacement = addcslashes((string)$replace, '\\$');
    }

    return preg_replace($patterns, $replacement, $subject, -1, $count);
  }
4707
4708 1
  /**
   * Limit the number of characters in a string without cutting the last word in half.
   *
   * A string that is not longer than $length is returned unchanged. Otherwise the string is cut
   * at $length characters, the (possibly incomplete) last word is dropped and $strAddOn is appended.
   * If there is no space before the limit, the string is hard-cut at $length - 1 characters.
   *
   * @param  string $str      The input string.
   * @param  int    $length   Maximum number of characters to keep (without $strAddOn).
   * @param  string $strAddOn Text that is appended to a shortened string.
   *
   * @return string
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit is directly behind a word => cut in front of the space
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    // "substr()" can return false, keep working on a string in that case
    $str = (string)self::substr($str, 0, $length);

    // drop the last piece, it is the word that was cut by the limit
    $words = explode(' ', $str);
    array_pop($words);
    $shortened = implode(' ', $words);

    if ($shortened === '') {
      // no complete word in front of the limit => hard cut
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    return $shortened . $strAddOn;
  }
4748
4749
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * The input is returned unchanged if $pad_length is not an integer, is not positive,
   * is shorter than the input, or if $pad_string is empty.
   *
   * @param    string $str        The input string
   * @param    int    $pad_length The length of return string
   * @param    string $pad_string String to use for padding the input string
   * @param    int    $pad_type   can be STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH
   *
   * @return   string Returns the padded string
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (!is_int($pad_length) || $pad_length <= 0 || $pad_length < $str_length) {
      return $str;
    }

    $ps_length = self::strlen($pad_string);
    if ($ps_length < 1) {
      // nothing to pad with, this would end in a division by zero below
      return $str;
    }

    // number of characters that are missing
    $diff = $pad_length - $str_length;

    switch ($pad_type) {
      case STR_PAD_LEFT:
        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $pre = self::substr($pre, 0, $diff);
        $post = '';
        break;

      case STR_PAD_BOTH:
        // both sides are cut out of the same repeated pad-string,
        // the left side gets the rounded down half and the right side the rounded up half
        $half = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
        $pre = self::substr($half, 0, (int)($diff / 2));
        $post = self::substr($half, 0, (int)ceil($diff / 2));
        break;

      case STR_PAD_RIGHT:
      default:
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $post = self::substr($post, 0, $diff);
        $pre = '';
    }

    return $pre . $str . $post;
  }
4794 1
4795
  /**
   * Repeat a string.
   *
   * The input is passed through UTF8::filter() before it is handed to the native "str_repeat()".
   *
   * @param string $str        <p>
   *                           The string to be repeated.
   *                           </p>
   * @param int    $multiplier <p>
   *                           Number of time the input string should be
   *                           repeated.
   *                           </p>
   *                           <p>
   *                           multiplier has to be greater than or equal to 0.
   *                           If the multiplier is set to 0, the function
   *                           will return an empty string.
   *                           </p>
   *
   * @return string the repeated string.
   */
  public static function str_repeat($str, $multiplier)
  {
    return str_repeat(self::filter($str), $multiplier);
  }
4819
4820
  /**
   * INFO: this is only a wrapper for "str_replace()"  -> the original functions is already UTF-8 safe
   *
   * Replace all occurrences of the search string with the replacement string.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>
   *                       The needle, an array may be used to designate multiple needles.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement for found needles, an array may be used
   *                       to designate multiple replacements.
   *                       </p>
   * @param mixed $subject <p>
   *                       The haystack: a string or an array of strings.
   *                       </p>
   *                       <p>
   *                       For an array the replacement is performed on every entry
   *                       and an array is returned.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed This function returns a string or an array with the replaced values.
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    // $count is filled by reference by the native function
    $replaced = str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
4854
4855
  /**
   * Shuffles all the characters in the string.
   *
   * The string is split via UTF8::split() and the resulting pieces are
   * reordered with PHP's "shuffle()".
   *
   * @param    string $str The input string
   *
   * @return   string The shuffled string.
   */
  public static function str_shuffle($str)
  {
    $chars = self::split($str);
    shuffle($chars);

    return implode('', $chars);
  }
4870 1
4871 1
  /**
   * Sort all characters according to code points.
   *
   * @param    string $str    A UTF-8 string.
   * @param    bool   $unique Sort unique. If true, repeated characters are ignored.
   * @param    bool   $desc   If true, will sort characters in reverse code point order.
   *
   * @return   string String of sorted characters
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    if ($unique) {
      // flipping twice removes duplicated values
      $codePoints = array_flip(array_flip($codePoints));
    }

    if (!$desc) {
      asort($codePoints);
    } else {
      arsort($codePoints);
    }

    return self::string($codePoints);
  }
4896 20
4897 20
  /**
   * Split a string into an array.
   *
   * The string is split into grapheme clusters which are then grouped
   * into chunks of $len clusters each.
   *
   * @param string $str The input string.
   * @param int    $len Number of grapheme clusters per chunk; values below 1 are
   *                    delegated to the native "str_split()".
   *
   * @return array
   */
  public static function str_split($str, $len = 1)
  {
    // init
    self::checkForSupport();
    $chunkSize = (int)$len;

    if ($chunkSize < 1) {
      return str_split($str, $chunkSize);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $graphemes = $matches[0];

    if ($chunkSize === 1) {
      return $graphemes;
    }

    $chunks = array();
    $chunkIndex = -1;

    foreach ($graphemes as $position => $grapheme) {
      if ($position % $chunkSize === 0) {
        // every $chunkSize-th cluster opens a new chunk
        $chunks[++$chunkIndex] = $grapheme;
      } else {
        $chunks[$chunkIndex] .= $grapheme;
      }
    }

    return $chunks;
  }
4936
4937
  /**
   * Get a binary representation of a specific string.
   *
   * Every byte of the input is converted into 8 bits, leading zeros of the
   * whole result are removed and an empty result is returned as "0".
   *
   * @param  string $str The input string.
   *
   * @return string A string of "0" and "1" characters.
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    // INFO: convert byte by byte, "base_convert()" works with floats internally
    //       and loses precision for everything longer than a few bytes
    $bits = '';
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
      $bits .= str_pad(decbin(ord($str[$i])), 8, '0', STR_PAD_LEFT);
    }

    // same format as the former "base_convert()" result: no leading zeros, but at least "0"
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
4952
4953
  /**
   * alias for "UTF8::to_ascii()"
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str     The input string.
   * @param string $unknown Forwarded unchanged as second argument of "UTF8::to_ascii()".
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?')
  {
    $ascii = self::to_ascii($str, $unknown);

    return $ascii;
  }
4967 1
4968
  /**
   * Counts number of words in the UTF-8 string.
   *
   * The string is split with the words themselves as captured delimiters, so the
   * resulting list alternates between non-word parts (even indexes) and words (odd indexes).
   *
   * @param string $str      The input string.
   * @param int    $format   <strong>0</strong> => return a number of words<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with word-offset as key
   * @param string $charlist Additional chars that belong to words and do not start a new word (default: "'", "’")
   *
   * @return array|int The number of words in the string
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $wordChars = self::rxClass($charlist, '\pL');
    $strParts = \preg_split("/({$wordChars}+(?:[\p{Pd}’']{$wordChars}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    $partCount = count($strParts);

    if ($format === 1) {
      // collect the words only
      $words = array();
      for ($i = 1; $i < $partCount; $i += 2) {
        $words[] = $strParts[$i];
      }

      return $words;
    }

    if ($format === 2) {
      self::checkForSupport();

      // collect the words, indexed by their character-offset in the input
      $words = array();
      $offset = self::strlen($strParts[0]);
      for ($i = 1; $i < $partCount; $i += 2) {
        $words[$offset] = $strParts[$i];
        $offset += self::strlen($strParts[$i]) + self::strlen($strParts[$i + 1]);
      }

      return $words;
    }

    // every word is surrounded by two non-word parts
    return ($partCount - 1) / 2;
  }
5012
5013
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Case-insensitive version of UTF8::strcmp(), both strings are
   *       case-folded via UTF8::strtocasefold() before they are compared.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
5029
5030
  /**
   * Case-sensitive string comparison.
   *
   * Strings that are identical byte by byte are reported as equal right away,
   * everything else is compared in its NFD normalized form.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int  <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   */
  public static function strcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    $normalized1 = \Normalizer::normalize($str1, \Normalizer::NFD);
    $normalized2 = \Normalizer::normalize($str2, \Normalizer::NFD);

    return strcmp($normalized1, $normalized2);
  }
5047 2
5048 5
  /**
   * Find length of initial segment not matching mask.
   *
   * @param string $str      The input string.
   * @param string $charList The characters to look for.
   * @param int    $offset   Start of the examined part of $str.
   * @param int    $length   Length of the examined part of $str.
   *
   * @return int|null Number of characters in front of the first character from $charList
   *                  (the length of the examined part, if there is none),<br />
   *                  or <strong>null</strong> if $charList is empty.
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList = (string)$charList;
    if ($charList === '') {
      return null;
    }

    // only cut the string, if the caller asked for a part of it
    if ($offset || 2147483647 !== $length) {
      $str = (string)self::substr($str, $offset, $length);
    } else {
      $str = (string)$str;
    }

    // group 1 lazily captures everything in front of the first character from the list
    if (preg_match('/^(.*?)' . self::rxClass($charList) . '/us', $str, $matches)) {
      return self::strlen($matches[1]);
    }

    return self::strlen($str);
  }
5077 2
5078 2
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * @param  array $array Integer or Hexadecimal codepoints
   *
   * @return string UTF-8 encoded string
   */
  public static function string(array $array)
  {
    $result = '';

    // convert every code point via UTF8::chr() and glue the characters together
    foreach ($array as $codePoint) {
      $result .= self::chr($codePoint);
    }

    return $result;
  }
5099
5100
  /**
   * alias for "UTF8::string_has_bom()"
   *
   * @see UTF8::string_has_bom()
   *
   * @param string $str The input string.
   *
   * @return bool
   */
  public static function hasBom($str)
  {
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
5113
5114
  /**
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
   *
   * The known byte order marks are the keys of the static "$bom" list.
   *
   * @param    string $str The input string.
   *
   * @return   bool True if the string has BOM at the start, False otherwise.
   */
  public static function string_has_bom($str)
  {
    foreach (array_keys(self::$bom) as $bomString) {
      // position 0 => the string starts with this byte order mark
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
5131
5132 2
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * The string is passed through UTF8::clean() before the native "strip_tags()" is applied.
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>
   *                               The input string.
   *                               </p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped.
   *                               </p>
   *                               <p>
   *                               HTML comments and PHP tags are also stripped. This is hardcoded and
   *                               can not be changed with allowable_tags.
   *                               </p>
   *
   * @return string the stripped string.
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    // clean broken utf8, then strip the tags
    return strip_tags(self::clean($str), $allowable_tags);
  }
5158
5159
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>
   *                           The string from which to get the position of the first occurrence
   *                           of needle
   *                           </p>
   * @param string  $needle    <p>
   *                           The string to find in haystack
   *                           </p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack
   *                           to start searching, <strong>null</strong> is handled like 0
   *                           </p>
   * @param string  $encoding  The charset for "\mb_stripos()"; true / false are only accepted
   *                           as fallback for old versions and are handled like "UTF-8".
   * @param boolean $cleanUtf8 Clean non UTF-8 chars from the string.
   *
   * @return int|false Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found (or if haystack or needle is empty).
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    self::checkForSupport();

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    // INFO: this is only a fallback for old versions
    if ($encoding === true || $encoding === false) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalizeEncoding($encoding);
    }

    // the offset of "\mb_stripos()" is a non-nullable integer,
    // so the default (null) is converted into 0 before the call
    return \mb_stripos($haystack, $needle, (int)$offset, $encoding);
  }
5207 5
5208
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (case-insensitive).
   *
   * @param string $str           The haystack.
   * @param string $needle        The string to search for.
   * @param bool   $before_needle Forwarded unchanged as third argument of "\mb_stristr()".
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is not found or empty.
   */
  public static function stristr($str, $needle, $before_needle = false)
  {
    $needle = (string)$needle;
    if ($needle === '') {
      return false;
    }

    // init
    self::checkForSupport();

    return \mb_stristr($str, $needle, $before_needle, 'UTF-8');
  }
5228
5229
  /**
   * Get the string length, not the byte-length!
   *
   * INFO: for the encodings "ASCII" and "CP850" the native "strlen()" is used.
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       The string being checked for length.
   * @param string  $encoding  Set the charset for e.g. "\mb_" function; true / false are only accepted
   *                           as fallback for old versions and are handled like "UTF-8".
   * @param boolean $cleanUtf8 Clean non UTF-8 chars from the string (only used for "UTF-8")
   *
   * @return int the number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return 0;
    }

    // INFO: this is only a fallback for old versions
    $encoding = ($encoding === true || $encoding === false)
        ? 'UTF-8'
        : self::normalizeEncoding($encoding);

    switch ($encoding) {
      case 'ASCII':
      case 'CP850':
        return strlen($str);
    }

    self::checkForSupport();

    $needsCleaning = ($cleanUtf8 === true && $encoding === 'UTF-8');
    if ($needsCleaning) {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
5270
5271
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: natural order version of UTF8::strcasecmp(), both strings are
   *       case-folded via UTF8::strtocasefold() before they are compared.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
5287 2
5288
  /**
   * String comparisons using a "natural order" algorithm
   *
   * INFO: natural order version of UTF8::strcmp()
   *
   * Strings that are identical byte by byte are reported as equal right away, everything else
   * is passed through UTF8::strtonatfold() and compared with the native "strnatcmp()".
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>
   *                     The first string.
   *                     </p>
   * @param string $str2 <p>
   *                     The second string.
   *                     </p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    $folded1 = self::strtonatfold($str1);
    $folded2 = self::strtonatfold($str2);

    return strnatcmp($folded1, $folded2);
  }
5310
5311 1
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * Both strings are case-folded via UTF8::strtocasefold() and then compared with UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>
   *                     The first string.
   *                     </p>
   * @param string $str2 <p>
   *                     The second string.
   *                     </p>
   * @param int    $len  <p>
   *                     The length of strings to be used in the comparison.
   *                     </p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
5334
5335
  /**
   * String comparison of the first n characters.
   *
   * Both strings are cut to their first $len characters via UTF8::substr()
   * and then compared with UTF8::strcmp().
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>
   *                     The first string.
   *                     </p>
   * @param string $str2 <p>
   *                     The second string.
   *                     </p>
   * @param int    $len  <p>
   *                     Number of characters to use in the comparison.
   *                     </p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // "substr()" can return false, but "strcmp()" expects strings
    $head1 = (string)self::substr($str1, 0, $len);
    $head2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($head1, $head2);
  }
5361 1
5362
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>
   *                          The string in which the characters of char_list are looked for.
   *                          </p>
   * @param string $char_list <p>
   *                          The characters to look for. This parameter is case sensitive.
   *                          </p>
   *
   * @return string|false The part of haystack starting at the first character found,
   *                      or false if haystack / char_list is empty or nothing is found.
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    // nothing to search in, or nothing to search for
    if ($haystack === '' || $char_list === '') {
      return false;
    }

    // self::rxClass() supplies the pattern fragment that is built from $char_list
    $pattern = '/' . self::rxClass($char_list) . '/us';
    if (!preg_match($pattern, $haystack, $m)) {
      return false;
    }

    // cut the haystack at the byte position of the matched character
    return substr($haystack, strpos($haystack, $m[0]));
  }
5391
5392
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string     $haystack     <p>
   *                                 The string being checked.
   *                                 </p>
   * @param string|int $needle       <p>
   *                                 The string to find in haystack.
   *                                 Or a code point as int.
   *                                 </p>
   * @param int        $offset       [optional] <p>
   *                                 The search offset. If it is not specified, 0 is used.
   *                                 </p>
   * @param string     $encoding     [optional] Encoding handed to mb_strpos(); it is only used on the mbstring path.
   * @param boolean    $cleanUtf8    Clean non UTF-8 chars from the string.
   *
   * @return int|false The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found (or haystack / needle is empty) it returns false.
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // iconv and mbstring do not support integer $needle: convert a code point
    // into its character BEFORE the string cast below, otherwise the strict
    // comparison can never succeed (same order as in self::strrpos())
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    self::checkForSupport();
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strpos returns wrong position if invalid characters are found in $haystack before $needle
      // iconv_strpos is not tolerant to invalid characters

      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (self::$support['mbstring'] === true) {

      // INFO: this is only a fallback for old versions
      // (presumably older callers passed a boolean in this position - TODO confirm)
      if ($encoding === true || $encoding === false) {
        $encoding = 'UTF-8';
      } else {
        $encoding = self::normalizeEncoding($encoding);
      }

      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    // NOTE(review): the guard tests the "iconv" flag but the call below is the
    // grapheme function - confirm that this pairing is intended
    if (self::$support['iconv'] === true) {
      // ignore invalid negative offset to keep compatibility
      // with php < 5.5.35, < 5.6.21, < 7.0.6
      return \grapheme_strpos($haystack, $needle, $offset > 0 ? $offset : 0);
    }

    // fallback

    if ($offset > 0) {
      // self::substr() returns false when $offset lies behind the end of the string,
      // the cast keeps the native strpos() call below on a real string
      $haystack = (string)self::substr($haystack, $offset);
    }

    if (($pos = strpos($haystack, $needle)) !== false) {
      // $pos is a byte offset: count the characters in front of it instead
      $left = substr($haystack, 0, $pos);

      // negative offset not supported in PHP strpos(), ignoring
      return ($offset > 0 ? $offset : 0) + self::strlen($left);
    }

    return false;
  }
5470
5471
  /**
   * Finds the last occurrence of a character in a string within another.
   *
   * @link http://php.net/manual/en/function.mb-strrchr.php
   *
   * @param string $haystack <p>
   *                         The string from which to get the last occurrence
   *                         of needle.
   *                         </p>
   * @param string $needle   <p>
   *                         The string to find in haystack.
   *                         </p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack
   *                         this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; it is passed through
   *                         self::normalizeEncoding() first.
   *                         If it is omitted, "UTF-8" is used.
   *                         </p>
   *
   * @return string|false The portion of haystack or false if needle is not found.
   */
  public static function strrchr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    // init
    self::checkForSupport();

    return \mb_strrchr($haystack, $needle, $part, self::normalizeEncoding($encoding));
  }
5505
5506
  /**
   * Alias for "UTF8::strstr()": every argument is forwarded unchanged.
   *
   * @see UTF8::strstr()
   *
   * @param string $haystack
   * @param string $needle
   * @param bool   $before_needle
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false)
  {
    $result = self::strstr($haystack, $needle, $before_needle);

    return $result;
  }
5521
5522
  /**
   * Alias for "UTF8::stristr()": every argument is forwarded unchanged.
   *
   * @see UTF8::stristr()
   *
   * @param string $haystack
   * @param string $needle
   * @param bool   $before_needle
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false)
  {
    $result = self::stristr($haystack, $needle, $before_needle);

    return $result;
  }
5537
5538
  /**
   * Reverses characters order in the string.
   *
   * @param  string $str The input string
   *
   * @return string The string with characters in the reverse sequence; '' for empty input.
   */
  public static function strrev($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // split into pieces via self::split(), flip their order and glue them back together
    $chars = self::split($str);

    return implode(array_reverse($chars));
  }
5555
5556
  /**
   * Finds the last occurrence of a character in a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string $haystack <p>
   *                         The string from which to get the last occurrence
   *                         of needle.
   *                         </p>
   * @param string $needle   <p>
   *                         The string to find in haystack.
   *                         </p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack
   *                         this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; it is passed through
   *                         self::normalizeEncoding() first.
   *                         If it is omitted, "UTF-8" is used.
   *                         </p>
   *
   * @return string|false The portion of haystack or false if needle is not found.
   */
  public static function strrichr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    // init
    self::checkForSupport();

    return \mb_strrichr($haystack, $needle, $part, self::normalizeEncoding($encoding));
  }
5590
5591 10
  /**
   * Find position of last occurrence of a case-insensitive string.
   *
   * Both strings are lowercased with self::strtolower() and the search itself
   * is delegated to self::strrpos().
   *
   * @param string  $haystack  The string to look in
   * @param string  $needle    The string to look for
   * @param int     $offset    (Optional) Number of characters to ignore in the beginning or end
   * @param boolean $cleanUtf8 Clean non UTF-8 chars from the string
   *
   * @return int|false The numeric position of the last occurrence of needle in the haystack string.<br />If needle is
   *                   not found, it returns false.
   */
  public static function strripos($haystack, $needle, $offset = 0, $cleanUtf8 = false)
  {
    $haystackLower = self::strtolower($haystack);
    $needleLower = self::strtolower($needle);

    return self::strrpos($haystackLower, $needleLower, $offset, $cleanUtf8);
  }
5606
5607
  /**
   * Find position of last occurrence of a string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strrpos.php
   *
   * @param string     $haystack  <p>
   *                              The string being checked, for the last occurrence
   *                              of needle.
   *                              </p>
   * @param string|int $needle    <p>
   *                              The string to find in haystack.
   *                              Or a code point as int.
   *                              </p>
   * @param int        $offset    [optional] May be specified to begin searching an arbitrary number of characters into
   *                              the string. Negative values will stop searching at an arbitrary point
   *                              prior to the end of the string. It is cast to int, so the default (null) means 0.
   * @param boolean    $cleanUtf8 Clean non UTF-8 chars from the string
   *
   * @return int|false The numeric position of the last occurrence of needle in the haystack string.<br />If needle is
   *                   not found (or haystack / needle is empty), it returns false.
   */
  public static function strrpos($haystack, $needle, $offset = null, $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // a non-negative integer needle is a code point: turn it into its character first
    if ($needle === ((int)$needle) && $needle >= 0) {
      $needle = self::chr($needle);
    }

    $needle = (string)$needle;

    if ($haystack === '' || $needle === '') {
      return false;
    }

    // init
    self::checkForSupport();

    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters

      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    // NOTE(review): the guard tests the "iconv" flag but the call below is the
    // grapheme function - confirm that this pairing is intended
    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback

    // positive offset: drop the leading characters, negative offset: drop the trailing ones
    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $pos = strrpos($haystack, $needle);
    if ($pos === false) {
      return false;
    }

    // $pos is a byte offset: count the characters in front of it instead
    $left = substr($haystack, 0, $pos);

    // negative offset not supported in PHP strpos(), ignoring
    return ($offset > 0 ? $offset : 0) + self::strlen($left);
  }
5680
5681
  /**
   * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
   * mask.
   *
   * @param string $str    The string to examine.
   * @param string $mask   The characters that may appear in the initial segment.
   * @param int    $offset [optional] Start position inside $str (handed to self::substr()).
   * @param int    $length [optional] Length of the part of $str that is examined (handed to self::substr()).
   *
   * @return int|null The result of self::strlen() for the matched segment, or 0 if the string
   *                  does not start with a character from the mask.
   */
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // only slice the input when the caller narrowed the default window
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    // anchored at the start: one or more characters out of the mask
    $pattern = '/^' . self::rxClass($mask) . '+/u';
    if (!preg_match($pattern, $str, $matches)) {
      return 0;
    }

    return self::strlen($matches[0]);
  }
5700
5701
  /**
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
   *
   * @link http://php.net/manual/en/function.grapheme-strstr.php
   *
   * @param string $haystack      <p>
   *                              The input string. Must be valid UTF-8.
   *                              </p>
   * @param string $needle        <p>
   *                              The string to look for. Must be valid UTF-8.
   *                              </p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, the part of the haystack before the first
   *                              occurrence of the needle is returned (excluding the needle).
   *                              </p>
   *
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
   */
  public static function strstr($haystack, $needle, $before_needle = false)
  {
    // init
    self::checkForSupport();

    $part = \grapheme_strstr($haystack, $needle, $before_needle);

    return $part;
  }
5725 21
5726
  /**
   * Unicode transformation for case-less matching.
   *
   * Applies the replacements from self::$commonCaseFold, optionally the
   * "caseFolding_full" data set, then cleans and lowercases the result.
   *
   * @link http://unicode.org/reports/tr21/tr21-5.html
   *
   * @param string $str  The input string.
   * @param bool   $full [optional] Also apply the full case folding data; defaults to true.
   *
   * @return string
   */
  public static function strtocasefold($str, $full = true)
  {
    // lookup lists are built on the first call only and kept in static locals
    static $fullMap = null;
    static $commonSearch = null;
    static $commonReplace = null;

    if ($commonSearch === null) {
      $commonSearch = array_keys(self::$commonCaseFold);
      $commonReplace = array_values(self::$commonCaseFold);
    }

    $folded = str_replace($commonSearch, $commonReplace, $str);

    if ($full) {

      if ($fullMap === null) {
        $fullMap = self::getData('caseFolding_full');
      }

      // index 0 is used as the search list and index 1 as the replacement list
      /** @noinspection OffsetOperationsInspection */
      $folded = str_replace($fullMap[0], $fullMap[1], $folded);
    }

    return self::strtolower(self::clean($folded));
  }
5763
5764 16
  /**
   * Make a string lowercase.
   *
   * @link http://php.net/manual/en/function.mb-strtolower.php
   *
   * @param string $str      <p>
   *                         The string being lowercased.
   *                         </p>
   * @param string $encoding [optional] Encoding name; it is passed through self::normalizeEncoding()
   *                         before it is handed to mb_strtolower(). Defaults to "UTF-8".
   *
   * @return string str with all alphabetic characters converted to lowercase; '' for empty input.
   */
  public static function strtolower($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // init
    self::checkForSupport();

    return \mb_strtolower($str, self::normalizeEncoding($encoding));
  }
5791
5792
  /**
   * Generic case sensitive transformation for collation matching.
   *
   * The string is normalized to NFD and every run of characters in the
   * Unicode category "Mn" (\p{Mn}) is removed afterwards.
   *
   * @param string $s
   *
   * @return string
   */
  protected static function strtonatfold($s)
  {
    $decomposed = \Normalizer::normalize($s, \Normalizer::NFD);

    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
5803
5804
  /**
   * Make a string uppercase.
   *
   * @link http://php.net/manual/en/function.mb-strtoupper.php
   *
   * @param string $str      <p>
   *                         The string being uppercased.
   *                         </p>
   * @param string $encoding [optional] Encoding name; only used (after self::normalizeEncoding())
   *                         when mbstring support is flagged. Defaults to "UTF-8".
   *
   * @return string str with all alphabetic characters converted to uppercase; '' for empty input.
   */
  public static function strtoupper($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // init
    self::checkForSupport();

    if (self::$support['mbstring'] === true) {
      return \mb_strtoupper($str, self::normalizeEncoding($encoding));
    }

    // fallback: replace via the table from self::case_table(),
    // its key / value lists are built on the first call only

    static $search = null;
    static $replace = null;

    if ($search === null) {
      $table = self::case_table();
      $search = array_keys($table);
      $replace = array_values($table);
    }

    return str_replace($search, $replace, self::clean($str));
  }
5849
5850
  /**
   * Translate characters or replace sub-strings.
   *
   * @link  http://php.net/manual/en/function.strtr.php
   *
   * @param string       $str  <p>
   *                           The string being translated.
   *                           </p>
   * @param string|array $from <p>
   *                           The characters to replace (string), a list of tokens to replace (array),
   *                           or - when $to is omitted - an array of "search => replacement" pairs.
   *                           </p>
   * @param string|array $to   [optional] <p>
   *                           The replacements for $from, as a string of characters or as a list of tokens.
   *                           </p>
   *
   * @return string This function returns a copy of str,
   * translating all occurrences of each character in
   * from to the corresponding character in
   * to.
   */
  public static function strtr($str, $from, $to = INF)
  {
    if (INF !== $to) {
      // self::str_split() is only meant for strings; an array is already
      // a list of tokens and is used as it is (re-indexed)
      $from = is_array($from) ? array_values($from) : self::str_split($from);
      $to = is_array($to) ? array_values($to) : self::str_split($to);
      $countFrom = count($from);
      $countTo = count($to);

      // array_combine() needs lists of equal size: drop the surplus of the longer one
      if ($countFrom > $countTo) {
        $from = array_slice($from, 0, $countTo);
      } elseif ($countFrom < $countTo) {
        $to = array_slice($to, 0, $countFrom);
      }

      $from = array_combine($from, $to);
    }

    // two-argument form of the native strtr(): $from has to be an array of pairs here
    return strtr($str, $from);
  }
5889 45
5890 37
  /**
   * Return the width of a string, as reported by mb_strwidth() for "UTF-8".
   *
   * @param string $s
   *
   * @return int
   */
  public static function strwidth($s)
  {
    // init
    self::checkForSupport();

    $width = \mb_strwidth($s, 'UTF-8');

    return $width;
  }
5904
5905
  /**
   * Get part of a string.
   *
   * @link http://php.net/manual/en/function.mb-substr.php
   *
   * @param string  $str       <p>
   *                           The string being checked.
   *                           </p>
   * @param int     $start     <p>
   *                           The first position used in str.
   *                           </p>
   * @param int     $length    [optional] <p>
   *                           The maximum length of the returned string.
   *                           If it is omitted (null), the length of str is used.
   *                           </p>
   * @param string  $encoding  [optional] Encoding handed to mb_substr(); it is only used on the mbstring path.
   * @param boolean $cleanUtf8 Clean non UTF-8 chars from the string
   *
   * @return string|false Returns a sub-string specified by the start and length parameters,
   *                      '' for an empty input string, or false if start is greater than
   *                      the length of the string.
   */
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // init
    self::checkForSupport();

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr

      $str = self::clean($str);
    }

    // the length of the string is only needed for the range check and as default length
    $charCount = ($start || $length === null) ? (int)self::strlen($str) : 0;

    if ($start && $start > $charCount) {
      return false;
    }

    $length = ($length === null) ? $charCount : (int)$length;

    if (self::$support['mbstring'] === true) {

      // INFO: this is only a fallback for old versions
      // (presumably older callers passed a boolean in this position - TODO confirm)
      if ($encoding === true || $encoding === false) {
        $encoding = 'UTF-8';
      } else {
        $encoding = self::normalizeEncoding($encoding);
      }

      return \mb_substr($str, $start, $length, $encoding);
    }

    // NOTE(review): the guard tests the "iconv" flag but the call below is the
    // grapheme function - confirm that this pairing is intended
    if (self::$support['iconv'] === true) {
      return (string)\grapheme_substr($str, $start, $length);
    }

    // fallback

    // split to array, extract relevant part, and join to make string again
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
5981 1
5982 1
  /**
5983
   * Binary safe comparison of two strings from an offset, up to length characters.
5984 1
   *
5985 1
   * @param string  $main_str           The main string being compared.
5986
   * @param string  $str                The secondary string being compared.
5987
   * @param int     $offset             The start position for the comparison. If negative, it starts counting from the
5988 1
   *                                    end of the string.
5989 1
   * @param int     $length             The length of the comparison. The default value is the largest of the length of
5990
   *                                    the str compared to the length of main_str less the offset.
5991 1
   * @param boolean $case_insensitivity If case_insensitivity is TRUE, comparison is case insensitive.
5992
   *
5993 1
   * @return int
5994
   */
5995
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
5996
  {
5997
    $main_str = self::substr($main_str, $offset, $length);
5998
    $str = self::substr($str, 0, self::strlen($main_str));
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5997 can also be of type false; however, voku\helper\UTF8::strlen() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
5999
6000
    return $case_insensitivity === true ? self::strcasecmp($main_str, $str) : self::strcmp($main_str, $str);
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5997 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5998 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5997 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5998 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
6001
  }
6002
6003
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string   $haystack <p>
   *                           The string to search in
   *                           </p>
   * @param string   $needle   <p>
   *                           The substring to search for
   *                           </p>
   * @param int      $offset   [optional] <p>
   *                           The offset where to start counting
   *                           </p>
   * @param int|null $length   [optional] <p>
   *                           The maximum length after the specified offset to search for the
   *                           substring. When omitted (null), the haystack is cut with the
   *                           two-argument form of self::substr(), i.e. no length limit is passed.
   *                           </p>
   *
   * @return int|false The number of occurrences; false if $haystack or $needle is empty,
   *                   or if offset plus length is not greater than zero.
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      if ($length === null) {
        // Only an offset was given. Previously null was cast to 0 and passed on as
        // an explicit length, which cut the haystack down to nothing.
        if ($offset <= 0) {
          return false;
        }

        $haystack = self::substr($haystack, $offset);
      } else {
        $length = (int)$length;

        if ($length + $offset <= 0) {
          return false;
        }

        $haystack = self::substr($haystack, $offset, $length);
      }
    }

    self::checkForSupport();

    // self::substr() is reported to be able to return false; normalise it to a string.
    return \mb_substr_count((string)$haystack, $needle);
  }
6049 1
6050
  /**
   * Replace text within a portion of a string (multi-byte aware).
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * When $str is an array, every element is processed on its own and the other
   * arguments are expanded (or truncated) to the same number of elements.
   *
   * @param string|array   $str         The input string, or an array of input strings.
   * @param string|array   $replacement The replacement; for a scalar $str only the element
   *                                    at index 0 of an array replacement is used.
   * @param int|array      $start       Character offset where the replacement starts.
   * @param null|int|array $length      Number of characters to replace. For a scalar $str,
   *                                    null means "up to the end of the string".
   *
   * @return array|string An array if $str was an array, otherwise a string.
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // $replacement: one entry per input string
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start: non-integer entries fall back to 0
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length
      // NOTE(review): a missing $length becomes 0 for every element here (pure insertion),
      // while the scalar path below treats null as "to the end of the string" - confirm
      // whether this difference is intended.
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call, one element at a time
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    if (is_array($replacement)) {
      $replacement = count($replacement) > 0 ? $replacement[0] : '';
    }

    // split both strings into arrays of single UTF-8 characters
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    $strChars = isset($smatches[0]) ? $smatches[0] : array();
    $replacementChars = isset($rmatches[0]) ? $rmatches[0] : array();

    if ($length === null) {
      // "to the end": the character array is what gets spliced, so its own size is
      // the right upper bound regardless of the mbstring internal encoding
      $length = count($strChars);
    }

    array_splice($strChars, $start, $length, $replacementChars);

    // separator first, array second (the reversed order is deprecated)
    return implode('', $strChars);
  }
6127
6128
  /**
   * Swap the case of every non-whitespace character of the string.
   *
   * Each character is upper-cased first; if that leaves it unchanged,
   * the lower-cased form is used instead.
   *
   * @param string $str      The input string.
   * @param string $encoding The encoding handed to strtoupper() / strtolower()
   *                         after passing through normalizeEncoding().
   *
   * @return string The string with each character's case swapped, '' for empty input.
   */
  public static function swapCase($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $encoding = self::normalizeEncoding($encoding);
    $str = self::clean($str);

    $swapSingleChar = function ($match) use ($encoding) {
      $char = $match[0];
      $upper = UTF8::strtoupper($char, $encoding);

      // unchanged by upper-casing => use the lower-case form
      return $char === $upper ? UTF8::strtolower($char, $encoding) : $upper;
    };

    return preg_replace_callback('/[\S]/u', $swapSingleChar, $str);
  }
6163 13
6164
  /**
   * Alias of "UTF8::to_ascii()"; both arguments are forwarded unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         The input string, e.g. a UTF-8 string.
   * @param string $subst_chr Forwarded as the second argument of to_ascii().
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?')
  {
    $ascii = self::to_ascii($s, $subst_chr);

    return $ascii;
  }
6178
6179
  /**
   * Alias of "UTF8::to_latin1()"; the argument is forwarded unchanged.
   *
   * @see UTF8::to_latin1()
   *
   * @param string|array $str The input handed to to_latin1().
   *
   * @return string|array Whatever to_latin1() returns.
   */
  public static function toLatin1($str)
  {
    $latin1 = self::to_latin1($str);

    return $latin1;
  }
6192
6193
  /**
   * Alias of "UTF8::to_utf8()"; the argument is forwarded unchanged.
   *
   * @see UTF8::to_utf8()
   *
   * @param string|array $str The input handed to to_utf8().
   *
   * @return string|array Whatever to_utf8() returns.
   */
  public static function toUTF8($str)
  {
    $utf8 = self::to_utf8($str);

    return $utf8;
  }
6206 2
6207
  /**
   * Convert a string to ASCII.
   *
   * The string is cleaned first. If the intl extension is flagged as available
   * (and PHP >= 5.4), it is transliterated with "Any-Latin; Latin-ASCII;" and
   * returned directly when no byte >= 0x80 remains. Otherwise every multi-byte
   * character is decoded to its code point and looked up in the per-bank mapping
   * files under "data/" (file name "x" + the code point shifted right by 8 bits, in hex).
   *
   * @param string $str     The input string.
   * @param string $unknown Replacement used for characters without a mapping. (default is ?)
   *
   * @return string The converted string, '' for empty input or if the string
   *                cannot be split into UTF-8 characters.
   */
  public static function to_ascii($str, $unknown = '?')
  {
    // per-bank lookup tables, filled lazily by the required data files
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str);

    self::checkForSupport();
    if (self::$support['intl'] === true && Bootup::is_php('5.4')) {
      $transliterated = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

      // transliterator_transliterate() returns false on failure: keep the input then
      if (is_string($transliterated)) {
        $str = $transliterated;

        // check again, if we only have ASCII, now ...
        if (!preg_match("/[\x80-\xFF]/", $str)) {
          return $str;
        }
      }
    }

    // split into single UTF-8 characters; fails (false) on invalid UTF-8
    if (!preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar) || !isset($ar[0])) {
      return '';
    }

    $chars = $ar[0];
    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // Decode the code point. $ord is reset for every character so that a value
      // from a previous iteration can never leak into this one.
      // Only 2- to 4-byte sequences are decoded: UTF-8 is limited to four bytes
      // (RFC 3629) and the /u pattern above only yields valid sequences.
      $ord = null;
      $byteCount = strlen($c);

      if ($ordC0 >= 192 && $ordC0 <= 223 && $byteCount >= 2) {
        $ord = ($ordC0 - 192) * 64 + (ord($c[1]) - 128);
      } elseif ($ordC0 >= 224 && $ordC0 <= 239 && $byteCount >= 3) {
        $ord = ($ordC0 - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
      } elseif ($ordC0 >= 240 && $ordC0 <= 247 && $byteCount >= 4) {
        $ord = ($ordC0 - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
      }

      if ($ord === null) {
        $c = $unknown;
        continue;
      }

      // the high bits select the mapping file ("bank"), the low 8 bits the entry in it
      $bank = $ord >> 8;
      if (!isset($UTF8_TO_ASCII[$bank])) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          // presumably the file assigns $UTF8_TO_ASCII[$bank] in this scope - verify against data/
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        }

        // no file, or the file did not define this bank: remember an empty bank
        if (!isset($UTF8_TO_ASCII[$bank])) {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    // break the reference so later writes to $c cannot touch $chars
    unset($c);

    return implode('', $chars);
  }
6318
6319 2
  /**
   * Alias of "UTF8::to_win1252()"; the argument is forwarded unchanged.
   *
   * @see UTF8::to_win1252()
   *
   * @param string|array $str The input handed to to_win1252().
   *
   * @return array|string Whatever to_win1252() returns.
   */
  public static function to_iso8859($str)
  {
    $converted = self::to_win1252($str);

    return $converted;
  }
6332 19
6333
  /**
   * Alias of "UTF8::to_win1252()"; the argument is forwarded unchanged.
   *
   * @see UTF8::to_win1252()
   *
   * @param string|array $str The input handed to to_win1252().
   *
   * @return string|array Whatever to_win1252() returns.
   */
  public static function to_latin1($str)
  {
    $converted = self::to_win1252($str);

    return $converted;
  }
6346 16
6347 16
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - A byte in the range 0xC0-0xF7 that is followed by the right number of bytes in the
   *   range 0x80-0xBF (one for 0xC0-0xDF, two for 0xE0-0xEF, three for 0xF0-0xF7) is taken
   *   to be UTF-8 already and is copied unchanged. Single-byte text that happens to contain
   *   such a byte run is therefore NOT converted.
   *   For example:   %ABREPRESENT%C9%BB
   *   The "«" (%AB) is converted, but "É" followed by "»" (%C9%BB) is also a valid
   *   UTF-8 sequence and is left unchanged.
   *
   * - Afterwards "\uXXXX" escape sequences and decimal entities "&#NN;" (2-4 digits)
   *   found in the result are decoded as well.
   *
   * @param string|array $str Any string or array.
   *
   * @return string|array The same string, but UTF8 encoded; for an array input,
   *                      the array with every value converted.
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];

      if ($c1 >= "\xc0") { // should be converted to UTF8, if it's not UTF8 already

        // number of continuation bytes a UTF-8 sequence starting with $c1 would need
        if ($c1 <= "\xdf") {
          $expected = 1; // looks like 2 bytes UTF8
        } elseif ($c1 <= "\xef") {
          $expected = 2; // looks like 3 bytes UTF8
        } elseif ($c1 <= "\xf7") {
          $expected = 3; // looks like 4 bytes UTF8
        } else {
          $expected = 0; // doesn't look like UTF8, but should be converted
        }

        // all expected bytes must exist and lie in 0x80-0xBF
        $isUtf8 = $expected > 0 && $i + $expected < $max;
        for ($j = 1; $isUtf8 && $j <= $expected; $j++) {
          $cn = $str[$i + $j];
          $isUtf8 = $cn >= "\x80" && $cn <= "\xbf";
        }

        if ($isUtf8) { // yeah, almost sure it's UTF8 already
          $buf .= substr($str, $i, $expected + 1);
          $i += $expected;
        } else { // not valid UTF8 - convert this single byte
          $ordC1 = ord($c1);
          // integer shift instead of "/ 64": chr() must not be fed a fractional float
          $buf .= chr(0xc0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3f));
        }

      } elseif (($c1 & "\xc0") === "\x80") { // needs conversion

        $ordC1 = ord($c1);
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
        } else {
          $buf .= chr(0xc0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3f));
        }

      } else { // it doesn't need conversion
        $buf .= $c1;
      }
    }

    self::checkForSupport();

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode decimal html entities
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
6474
6475 16
  /**
   * Convert a string (or every value of an array, recursively) via "UTF8::utf8_decode()".
   *
   * @param  string|array $str The input string, or an array of inputs.
   *
   * @return string|array The converted string ('' for empty input); for an array,
   *                      an array with the same keys and converted values.
   */
  protected static function to_win1252($str)
  {
    if (is_array($str)) {
      $converted = array();

      foreach ($str as $key => $value) {
        $converted[$key] = self::to_win1252($value);
      }

      return $converted;
    }

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    return self::utf8_decode($str);
  }
6502
6503
  /**
   * Strip whitespace or other characters from the beginning and end of a UTF-8 string.
   *
   * INFO: This is slower than the native "trim()".
   *
   * Without a usable $chars value, all leading and trailing characters of the
   * Unicode categories Z (separators) and C (control / other) are removed.
   * Otherwise the work is delegated to ltrim() and rtrim() with $chars.
   *
   * @param    string $str   The string to be trimmed
   * @param    string $chars Optional characters to be stripped
   *
   * @return   string The trimmed string
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $hasCustomChars = $chars !== INF && $chars;

    if ($hasCustomChars) {
      return self::rtrim(self::ltrim($str, $chars), $chars);
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
  }
6531
6532
  /**
   * Makes string's first char uppercase.
   *
   * @param    string $str The input string
   *
   * @return   string The resulting string
   */
  public static function ucfirst($str)
  {
    // self::substr() is reported to be able to return false (e.g. nothing to cut);
    // cast both parts so that strtoupper() and the concatenation always see strings
    $firstChar = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtoupper($firstChar) . $rest;
  }
6543 7
6544
  /**
   * Alias of "UTF8::ucfirst()"; the argument is forwarded unchanged.
   *
   * @see UTF8::ucfirst()
   *
   * @param string $word The input handed to ucfirst().
   *
   * @return string
   */
  public static function ucword($word)
  {
    $capitalized = self::ucfirst($word);

    return $capitalized;
  }
6557
6558
  /**
   * Uppercase the first character of all words in the string.
   *
   * The string is split at single spaces. Words listed in $exceptions (strict,
   * case-sensitive comparison) are left as they are. Finally ucfirst() is applied
   * to the re-joined string, so its first character is upper-cased in any case.
   *
   * @param  string $str        The input string.
   * @param array   $exceptions Words that must not be changed.
   *
   * @return string The resulting string, '' for empty input.
   */
  public static function ucwords($str, $exceptions = array())
  {
    $str = (string)$str;

    // check for an empty string explicitly: a loose "!$str" would also drop "0"
    if (!isset($str[0])) {
      return '';
    }

    $useExceptions = count($exceptions) > 0;
    $newwords = array();

    foreach (explode(' ', $str) as $word) {
      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $word = self::ucfirst($word);
      }

      $newwords[] = $word;
    }

    return self::ucfirst(implode(' ', $newwords));
  }
6599
6600
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * Steps, in this order: url-decode, rewrite "%uXXXX" sequences to "&#xXXXX;"
   * entities, to_utf8(), html_entity_decode(), rawurldecode(), fix_simple_utf8().
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'Düsseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str The encoded input string.
   *
   * @return string The decoded string, '' for empty input.
   */
  public static function urldecode($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // "%uXXXX" => "&#xXXXX;" so that the entity decoder below can handle it
    $withEntities = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', urldecode($str));

    $flags = ENT_QUOTES;
    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    }

    $asUtf8 = self::to_utf8($withEntities);
    $entitiesDecoded = self::html_entity_decode($asUtf8, $flags);
    $rawDecoded = rawurldecode($entitiesDecoded);
    $fixed = self::fix_simple_utf8($rawDecoded);

    return (string)$fixed;
  }
6640
6641
  /**
6642
   * Return a array with "urlencoded"-win1252 -> UTF-8
6643
   *
6644
   * @return mixed
6645
   */
6646
  public static function urldecode_fix_win1252_chars()
6647
  {
6648
    static $array = array(
6649
        '%20' => ' ',
6650
        '%21' => '!',
6651
        '%22' => '"',
6652
        '%23' => '#',
6653
        '%24' => '$',
6654
        '%25' => '%',
6655
        '%26' => '&',
6656
        '%27' => "'",
6657
        '%28' => '(',
6658
        '%29' => ')',
6659
        '%2A' => '*',
6660
        '%2B' => '+',
6661
        '%2C' => ',',
6662
        '%2D' => '-',
6663
        '%2E' => '.',
6664
        '%2F' => '/',
6665
        '%30' => '0',
6666
        '%31' => '1',
6667
        '%32' => '2',
6668
        '%33' => '3',
6669
        '%34' => '4',
6670
        '%35' => '5',
6671
        '%36' => '6',
6672
        '%37' => '7',
6673
        '%38' => '8',
6674
        '%39' => '9',
6675
        '%3A' => ':',
6676
        '%3B' => ';',
6677
        '%3C' => '<',
6678
        '%3D' => '=',
6679
        '%3E' => '>',
6680
        '%3F' => '?',
6681
        '%40' => '@',
6682
        '%41' => 'A',
6683
        '%42' => 'B',
6684
        '%43' => 'C',
6685
        '%44' => 'D',
6686
        '%45' => 'E',
6687
        '%46' => 'F',
6688
        '%47' => 'G',
6689
        '%48' => 'H',
6690
        '%49' => 'I',
6691
        '%4A' => 'J',
6692
        '%4B' => 'K',
6693
        '%4C' => 'L',
6694
        '%4D' => 'M',
6695
        '%4E' => 'N',
6696
        '%4F' => 'O',
6697
        '%50' => 'P',
6698
        '%51' => 'Q',
6699
        '%52' => 'R',
6700
        '%53' => 'S',
6701
        '%54' => 'T',
6702
        '%55' => 'U',
6703
        '%56' => 'V',
6704
        '%57' => 'W',
6705
        '%58' => 'X',
6706
        '%59' => 'Y',
6707
        '%5A' => 'Z',
6708
        '%5B' => '[',
6709
        '%5C' => '\\',
6710
        '%5D' => ']',
6711
        '%5E' => '^',
6712
        '%5F' => '_',
6713
        '%60' => '`',
6714
        '%61' => 'a',
6715
        '%62' => 'b',
6716
        '%63' => 'c',
6717
        '%64' => 'd',
6718
        '%65' => 'e',
6719
        '%66' => 'f',
6720
        '%67' => 'g',
6721
        '%68' => 'h',
6722
        '%69' => 'i',
6723
        '%6A' => 'j',
6724
        '%6B' => 'k',
6725
        '%6C' => 'l',
6726
        '%6D' => 'm',
6727
        '%6E' => 'n',
6728
        '%6F' => 'o',
6729
        '%70' => 'p',
6730
        '%71' => 'q',
6731
        '%72' => 'r',
6732
        '%73' => 's',
6733
        '%74' => 't',
6734
        '%75' => 'u',
6735
        '%76' => 'v',
6736
        '%77' => 'w',
6737
        '%78' => 'x',
6738
        '%79' => 'y',
6739
        '%7A' => 'z',
6740
        '%7B' => '{',
6741
        '%7C' => '|',
6742
        '%7D' => '}',
6743
        '%7E' => '~',
6744
        '%7F' => '',
6745
        '%80' => '`',
6746
        '%81' => '',
6747
        '%82' => '‚',
6748
        '%83' => 'ƒ',
6749
        '%84' => '„',
6750
        '%85' => '…',
6751
        '%86' => '†',
6752
        '%87' => '‡',
6753
        '%88' => 'ˆ',
6754
        '%89' => '‰',
6755
        '%8A' => 'Š',
6756
        '%8B' => '‹',
6757
        '%8C' => 'Œ',
6758
        '%8D' => '',
6759
        '%8E' => 'Ž',
6760
        '%8F' => '',
6761
        '%90' => '',
6762
        '%91' => '‘',
6763
        '%92' => '’',
6764
        '%93' => '“',
6765
        '%94' => '”',
6766
        '%95' => '•',
6767
        '%96' => '–',
6768
        '%97' => '—',
6769
        '%98' => '˜',
6770
        '%99' => '™',
6771
        '%9A' => 'š',
6772
        '%9B' => '›',
6773
        '%9C' => 'œ',
6774
        '%9D' => '',
6775
        '%9E' => 'ž',
6776
        '%9F' => 'Ÿ',
6777
        '%A0' => '',
6778
        '%A1' => '¡',
6779
        '%A2' => '¢',
6780
        '%A3' => '£',
6781
        '%A4' => '¤',
6782
        '%A5' => '¥',
6783
        '%A6' => '¦',
6784
        '%A7' => '§',
6785
        '%A8' => '¨',
6786
        '%A9' => '©',
6787
        '%AA' => 'ª',
6788
        '%AB' => '«',
6789
        '%AC' => '¬',
6790
        '%AD' => '',
6791
        '%AE' => '®',
6792
        '%AF' => '¯',
6793
        '%B0' => '°',
6794
        '%B1' => '±',
6795
        '%B2' => '²',
6796
        '%B3' => '³',
6797
        '%B4' => '´',
6798
        '%B5' => 'µ',
6799
        '%B6' => '¶',
6800
        '%B7' => '·',
6801
        '%B8' => '¸',
6802
        '%B9' => '¹',
6803
        '%BA' => 'º',
6804
        '%BB' => '»',
6805
        '%BC' => '¼',
6806
        '%BD' => '½',
6807
        '%BE' => '¾',
6808
        '%BF' => '¿',
6809
        '%C0' => 'À',
6810
        '%C1' => 'Á',
6811
        '%C2' => 'Â',
6812
        '%C3' => 'Ã',
6813
        '%C4' => 'Ä',
6814
        '%C5' => 'Å',
6815
        '%C6' => 'Æ',
6816
        '%C7' => 'Ç',
6817
        '%C8' => 'È',
6818
        '%C9' => 'É',
6819 1
        '%CA' => 'Ê',
6820
        '%CB' => 'Ë',
6821 1
        '%CC' => 'Ì',
6822
        '%CD' => 'Í',
6823
        '%CE' => 'Î',
6824
        '%CF' => 'Ï',
6825
        '%D0' => 'Ð',
6826
        '%D1' => 'Ñ',
6827
        '%D2' => 'Ò',
6828
        '%D3' => 'Ó',
6829
        '%D4' => 'Ô',
6830
        '%D5' => 'Õ',
6831 6
        '%D6' => 'Ö',
6832
        '%D7' => '×',
6833 6
        '%D8' => 'Ø',
6834 6
        '%D9' => 'Ù',
6835
        '%DA' => 'Ú',
6836 6
        '%DB' => 'Û',
6837
        '%DC' => 'Ü',
6838 6
        '%DD' => 'Ý',
6839 3
        '%DE' => 'Þ',
6840
        '%DF' => 'ß',
6841
        '%E0' => 'à',
6842
        '%E1' => 'á',
6843 6
        '%E2' => 'â',
6844
        '%E3' => 'ã',
6845 6
        '%E4' => 'ä',
6846
        '%E5' => 'å',
6847 6
        '%E6' => 'æ',
6848 1
        '%E7' => 'ç',
6849 1
        '%E8' => 'è',
6850 1
        '%E9' => 'é',
6851
        '%EA' => 'ê',
6852 6
        '%EB' => 'ë',
6853
        '%EC' => 'ì',
6854
        '%ED' => 'í',
6855
        '%EE' => 'î',
6856
        '%EF' => 'ï',
6857
        '%F0' => 'ð',
6858
        '%F1' => 'ñ',
6859
        '%F2' => 'ò',
6860
        '%F3' => 'ó',
6861
        '%F4' => 'ô',
6862 6
        '%F5' => 'õ',
6863
        '%F6' => 'ö',
6864 6
        '%F7' => '÷',
6865
        '%F8' => 'ø',
6866 6
        '%F9' => 'ù',
6867 6
        '%FA' => 'ú',
6868
        '%FB' => 'û',
6869
        '%FC' => 'ü',
6870 5
        '%FD' => 'ý',
6871 5
        '%FE' => 'þ',
6872
        '%FF' => 'ÿ',
6873 5
    );
6874 1
6875 1
    return $array;
6876 1
  }
6877
6878 5
  /**
   * Converts an UTF-8 string into its single-byte (ISO-8859-1) form.
   *
   * The input is first passed through self::to_utf8(). Every sequence that
   * appears as a key of self::$utf8ToWin1252 is then replaced by its mapped
   * value, and the result is handed to Xml::utf8_decode().
   *
   * @param string $str The string to decode (cast to string internally).
   *
   * @return string The decoded string, or an empty string for empty input.
   */
  public static function utf8_decode($str)
  {
    // search/replace lists are built on first use and kept for later calls
    static $searchList = null;
    static $replaceList = null;

    $input = (string)$str;

    if ($input === '') {
      return '';
    }

    // init
    self::checkForSupport();

    $utf8 = self::to_utf8($input);

    if ($searchList === null) {
      $searchList = array_keys(self::$utf8ToWin1252);
      $replaceList = array_values(self::$utf8ToWin1252);
    }

    $mapped = str_replace($searchList, $replaceList, $utf8);

    return Xml::utf8_decode($mapped);
  }
6908
6909
  /**
   * Encodes a single-byte (ISO-8859-1) string to UTF-8.
   *
   * The string is encoded with PHP's native utf8_encode(). If the encoded
   * result contains the byte "\xC2", every sequence that appears as a key of
   * self::$cp1252ToUtf8 is additionally replaced by its mapped value.
   *
   * @param string $str The string to encode.
   *
   * @return string The encoded string.
   */
  public static function utf8_encode($str)
  {
    // search/replace lists are built on first use and kept for later calls
    static $searchList = null;
    static $replaceList = null;

    $encoded = \utf8_encode($str);

    // without a "\xC2" byte there is nothing to remap
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if ($searchList === null) {
      $searchList = array_keys(self::$cp1252ToUtf8);
      $replaceList = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($searchList, $replaceList, $encoded);
  }
6935 1
6936
  /**
   * Fix UTF-8 strings that contain wrongly converted Windows-1252 characters.
   *
   * Use this when you received an UTF-8 string that was converted from
   * Windows-1252 as if it were ISO-8859-1 (i.e. the Windows-1252 characters
   * from 80 to 9F were ignored).
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * This method only forwards to self::fix_simple_utf8().
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   *
   * @param   string $str The string to fix.
   *
   * @return  string The value returned by self::fix_simple_utf8().
   */
  public static function utf8_fix_win1252_chars($str)
  {
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
6953
6954
  /**
   * Gives access to the table of utf8 whitespace characters.
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E. [email protected]
   *
   * @return array the content of self::$whitespaceTable: all known whitespace characters as values,
   *         keyed by the type of whitespace as defined in the URL above
   */
  public static function whitespace_table()
  {
    $table = self::$whitespaceTable;

    return $table;
  }
6968
6969
  /**
   * Cuts a string down to a maximum number of words.
   *
   * A "word" is a run of non-whitespace characters. When the string holds
   * more words than allowed, the leading part is kept, right-trimmed and
   * followed by $strAddOn; otherwise the string is returned unchanged.
   *
   * @param  string $str      The input string.
   * @param  int    $words    Maximum number of words to keep; values below 1 yield ''.
   * @param  string $strAddOn Text appended when the string was shortened.
   *
   * @return string The (possibly shortened) string, or '' for empty input.
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $text = (string)$str;

    if ($text === '') {
      return '';
    }

    $limit = (int)$words;

    if ($limit <= 0) {
      return '';
    }

    // optional leading whitespace, then 1..$limit words with their trailing whitespace
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $limit . '}/u';
    preg_match($pattern, $text, $found);

    // only shorten when something matched and the match is not the whole string
    $wasShortened = isset($found[0])
                    &&
                    self::strlen($text) !== self::strlen($found[0]);

    if ($wasShortened) {
      return self::rtrim($found[0]) . $strAddOn;
    }

    return $text;
  }
7004 8
7005 8
  /**
   * Wraps a string to a given number of characters (multi-byte aware).
   *
   * The string is split into characters via self::split(). A single-byte
   * "mask" is built with one byte per character (' ' for a space, '?' for any
   * other character, '#' for an existing break) and wrapped with PHP's native
   * wordwrap(). The break positions found in the wrapped mask are then
   * applied to the real characters.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>
   *                      The input string.
   *                      </p>
   * @param int    $width [optional] <p>
   *                      The column width.
   *                      </p>
   * @param string $break [optional] <p>
   *                      The string used to break the line.
   *                      </p>
   * @param bool   $cut   [optional] <p>
   *                      If set to true, the string is always wrapped at or
   *                      before the specified width, so a word that is longer
   *                      than the given width is broken apart.
   *                      </p>
   *
   * @return string the given string wrapped at the specified column,
   *                or '' when $str or $break is empty.
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if ($str === '' || $break === '') {
      return '';
    }

    $segments = explode($break, $str);
    $segmentCount = count($segments);

    if ($segmentCount === 1 && $segments[0] === '') {
      return '';
    }

    // $chars holds the real characters (an existing break is ONE element),
    // $mask holds exactly one byte per element of $chars
    $chars = array();
    $mask = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($segmentIndex = 0; $segmentIndex < $segmentCount; ++$segmentIndex) {

      // segments after the first one were separated by a break in the input
      if ($segmentIndex > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      $segment = $segments[$segmentIndex];
      unset($segments[$segmentIndex]);

      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        $mask .= ($char === ' ') ? ' ' : '?';
      }
    }

    $wrappedMask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $charIndex = 0;  // read position in $chars
    $maskIndex = -1; // read position in $wrappedMask
    $breakPos = -1;  // position of the current '#' in $wrappedMask

    while (false !== ($breakPos = self::strpos($wrappedMask, '#', $breakPos + 1))) {

      // copy every character in front of the current break
      ++$maskIndex;
      while ($maskIndex < $breakPos) {
        $wrapped .= $chars[$charIndex];
        unset($chars[$charIndex]);
        ++$charIndex;
        ++$maskIndex;
      }

      // a break that replaced a space or an existing break consumes that element,
      // a break inserted by a forced cut consumes nothing
      if ($chars[$charIndex] === $break || $chars[$charIndex] === ' ') {
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      $wrapped .= $break;
    }

    // whatever is left belongs to the last line
    return $wrapped . implode('', $chars);
  }
7084
7085
  /**
   * Gives access to the list of Unicode White Space characters.
   *
   * @return   array The content of self::$whitespace: numeric code point as key, White Space Character as value.
   */
  public static function ws()
  {
    $whitespace = self::$whitespace;

    return $whitespace;
  }
7094
7095
}
7096