Completed
Push — master ( c50298...c96d38 )
by Lars
03:44
created

UTF8::chr()   C

Complexity

Conditions 9
Paths 15

Size

Total Lines 47
Code Lines 26

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 9.5338

Importance

Changes 10
Bugs 1 Features 2
Metric Value
c 10
b 1
f 2
dl 0
loc 47
ccs 13
cts 16
cp 0.8125
rs 5.2941
cc 9
eloc 26
nc 15
nop 1
crap 9.5338
1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
82
   * Bom => Byte-Length
83
   *
84
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
85
   *
86
   * @var array
87
   */
88
  private static $bom = array(
89
      "\xef\xbb\xbf"     => 3, // UTF-8 BOM
90
      ''              => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...)
91
      "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM
92
      "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM
93
      "\xfe\xff"         => 2, // UTF-16 (BE) BOM
94
      'þÿ'               => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
95
      "\xff\xfe"         => 2, // UTF-16 (LE) BOM
96
      'ÿþ'               => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
97
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    // HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
805
   * __construct()
806
   */
807 1
  public function __construct()
  {
    // Instantiating the helper triggers the (one-time) environment detection.
    UTF8::checkForSupport();
  }
811
812
  /**
813
   * Return the character at the specified position: $str[1] like functionality.
814
   *
815
   * @param string $str <p>A UTF-8 string.</p>
816
   * @param int    $pos <p>The position of character to return.</p>
817
   *
818
   * @return string <p>Single Multi-Byte character.</p>
819
   */
820 2
  public static function access($str, $pos)
  {
    // A slice of length one starting at $pos is delegated to self::substr().
    $character = self::substr($str, $pos, 1);

    return $character;
  }
824
825
  /**
826
   * Prepends UTF-8 BOM character to the string and returns the whole string.
827
   *
828
   * INFO: If BOM already existed there, the Input string is returned.
829
   *
830
   * @param string $str <p>The input string.</p>
831
   *
832
   * @return string <p>The output string that contains BOM.</p>
833
   */
834 1
  public static function add_bom_to_string($str)
  {
    // Anything other than an explicit "false" from the BOM check means
    // the input is handed back untouched.
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    // No BOM detected: prefix the string with the UTF-8 BOM.
    return self::bom() . $str;
  }
842
843
  /**
844
   * Convert a string of binary digits ("0" / "1") into the string it represents.
845
   *
846
   * @param mixed $bin 1|0
847
   *
848
   * @return string
849
   */
850 1
  public static function binary_to_str($bin)
  {
    // Keep only binary digits; every other character is ignored.
    $bits = preg_replace('/[^01]/', '', (string)$bin);

    // Nothing to convert (or the regex engine failed): no bits, no bytes.
    if ($bits === null || $bits === '') {
      return '';
    }

    // Left-pad with zero bits up to a whole number of bytes, so that dropped
    // leading zeros (e.g. "1010" for the byte 0x0A) do not shift the value.
    $remainder = strlen($bits) % 8;
    if ($remainder !== 0) {
      $bits = str_repeat('0', 8 - $remainder) . $bits;
    }

    // Convert one byte (8 bits) at a time: this avoids the precision loss of
    // base_convert() on long inputs and the odd-nibble padding of pack('H*').
    $bytes = '';
    foreach (str_split($bits, 8) as $octet) {
      $bytes .= chr(bindec($octet));
    }

    return $bytes;
  }
854
855
  /**
856
   * Returns the UTF-8 Byte Order Mark Character.
857
   *
858
   * @return string UTF-8 Byte Order Mark
859
   */
860 2
  public static function bom()
  {
    // The three bytes EF BB BF form the UTF-8 byte order mark.
    $utf8Bom = "\xEF\xBB\xBF";

    return $utf8Bom;
  }
864
865
  /**
866
   * @alias of UTF8::chr_map()
867
   * @see   UTF8::chr_map()
868
   *
869
   * @param string|array $callback
870
   * @param string       $str
871
   *
872
   * @return array
873
   */
874 1
  public static function callback($callback, $str)
  {
    // Pure alias: all the work is done by self::chr_map().
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
880
   * This method will auto-detect your server environment for UTF-8 support.
881
   *
882
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
883
   */
884 2
  public static function checkForSupport()
  {
    // The detection only has to run once; later calls are no-ops.
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      return;
    }

    // Set the marker first, before any of the individual checks run.
    self::$support['already_checked_via_portable_utf8'] = true;

    // Record the result of each capability check under its own key.
    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
899
   * Generates a UTF-8 encoded character from the given code point.
900
   *
901
   * INFO: opposite to UTF8::ord()
902
   *
903
   * @param int $code_point <p>The code point for which to generate a character.</p>
904
   *
905
   * @return string|null <p>Multi-Byte character, returns null on failure to encode.</p>
906
   */
907 9
  public static function chr($code_point)
  {
    // Only genuine integers are accepted: numeric strings, floats, etc.
    // fail the strict comparison and yield null.
    if ((int)$code_point !== $code_point) {
      return null;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // Prefer the intl implementation whenever it is available.
    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // Negative numbers are not code points. Passing them on to the native
    // chr() would wrap them modulo 256 and emit a stray byte, so they get the
    // same treatment as values above U+10FFFF: U+FFFD REPLACEMENT CHARACTER.
    // (Not cached, so bogus input cannot grow the cache.)
    if ($code_point < 0) {
      return "\xEF\xBF\xBD";
    }

    // use static cache, if there is no support for "IntlChar"
    static $cache = array();
    if (isset($cache[$code_point]) === true) {
      return $cache[$code_point];
    }

    // 1 byte: 0xxxxxxx
    if ($code_point <= 0x7f) {
      return $cache[$code_point] = chr($code_point);
    }

    // 2 bytes: 110xxxxx 10xxxxxx
    if ($code_point <= 0x7ff) {
      return $cache[$code_point] = chr(0xc0 | ($code_point >> 6)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0xffff) {
      return $cache[$code_point] = chr(0xe0 | ($code_point >> 12)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0x10ffff) {
      return $cache[$code_point] = chr(0xf0 | ($code_point >> 18)) .
                                   chr(0x80 | (($code_point >> 12) & 0x3f)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // Beyond the Unicode range: U+FFFD REPLACEMENT CHARACTER
    return $cache[$code_point] = "\xEF\xBF\xBD";
  }
954
955
  /**
956
   * Applies callback to all characters of a string.
957
   *
958 4
   * @param string|array $callback <p>The callback function.</p>
959
   * @param string       $str      <p>UTF-8 string to run callback on.</p>
960 4
   *
961 3
   * @return array <p>The outcome of callback.</p>
962
   */
963
  public static function chr_map($callback, $str)
  {
    // Split the string via self::split() and run the callback over each piece.
    return array_map($callback, self::split($str));
  }
969
970
  /**
971
   * Generates an array of byte length of each character of a Unicode string.
972
   *
973
   * 1 byte => U+0000  - U+007F
974 2
   * 2 byte => U+0080  - U+07FF
975
   * 3 byte => U+0800  - U+FFFF
976 2
   * 4 byte => U+10000 - U+10FFFF
977 2
   *
978 2
   * @param string $str <p>The original Unicode string.</p>
979
   *
980 2
   * @return array <p>An array of byte lengths of each character.</p>
981
   */
982 2
  public static function chr_size_list($str)
  {
    // Compare against the empty string explicitly: a plain truthiness test
    // would also discard the perfectly valid one-character string "0".
    $str = (string)$str;
    if ($str === '') {
      return array();
    }

    // Byte length of every piece returned by self::split().
    return array_map('strlen', self::split($str));
  }
990
991 1
  /**
992 1
   * Get a decimal code representation of a specific character.
993 1
   *
994
   * @param string $char <p>The input character.</p>
995
   *
996
   * @return int
997
   */
998
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;
    $codePoint = self::ord($char[0]);

    // 0xxxxxxx: high bit clear, the value of the first byte is returned as is.
    if (($codePoint & 0x80) === 0) {
      return $codePoint;
    }

    // Derive the sequence length from the lead byte and strip its marker bits.
    // A lead byte matching none of the patterns keeps a length of one.
    $sequenceLength = 1;
    switch (true) {
      case ($codePoint & 0xe0) === 0xc0:
        // 110xxxxx
        $sequenceLength = 2;
        $codePoint &= ~0xc0;
        break;

      case ($codePoint & 0xf0) === 0xe0:
        // 1110xxxx
        $sequenceLength = 3;
        $codePoint &= ~0xe0;
        break;

      case ($codePoint & 0xf8) === 0xf0:
        // 11110xxx
        $sequenceLength = 4;
        $codePoint &= ~0xf0;
        break;
    }

    // 10xxxxxx: shift in the payload of each following byte.
    for ($offset = 1; $offset < $sequenceLength; ++$offset) {
      $codePoint = ($codePoint << 6) + (self::ord($char[$offset]) & ~0x80);
    }

    return $codePoint;
  }
1030
1031 1
  /**
   * Get hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * @param string $char <p>The input character</p>
   * @param string $pfix [optional] <p>The prefix handed over to int_to_hex(), 'U+' by default.</p>
   *
   * @return string <p>The code point encoded as U+xxxx</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    // character -> integer code -> prefixed hex notation
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1043
1044
  /**
   * Splits a string into smaller chunks and multiple lines, using the specified line ending character.
   *
   * The chunks are joined with $end, so $end appears between the chunks only,
   * not after the last one.
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) to be inserted between the chunks.</p>
   *
   * @return string <p>The chunked string</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    return implode($end, $chunks);
  }
1057
1058
  /**
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
   *
   * Byte sequences shaped like UTF-8 characters are kept (capture group 1), every other
   * single byte is matched by one of the remaining alternatives and dropped, because only
   * group 1 is written back. Afterwards the diamond question mark and invisible characters
   * are removed, followed by the optional steps selected via the flags.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // the repetition is capped at 100, an unbounded one caused connection reset problems on larger strings
    $pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // mandatory steps
    $cleaned = preg_replace($pattern, '$1', $str);
    $cleaned = self::replace_diamond_question_mark($cleaned, '');
    $cleaned = self::remove_invisible_characters($cleaned);

    // optional steps, only executed for a strict boolean true
    if ($normalize_whitespace === true) {
      $cleaned = self::normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
      $cleaned = self::normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
      $cleaned = self::removeBOM($cleaned);
    }

    return $cleaned;
  }
1106 4
1107
  /**
   * Clean-up a string and show only printable UTF-8 chars at the end + fix UTF-8 encoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The cleaned string, '' for an empty input.</p>
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // step 1: fix ISO <-> UTF-8 errors
    $repaired = self::fix_simple_utf8($str);

    // step 2: clean() with
    //   - BOM removal switched on
    //   - whitespace normalization switched on (non-breaking spaces are kept)
    //   - MS Word normalization switched off
    // clean() additionally removes non UTF-8 symbols, the diamond question mark (�)
    // and invisible characters (e.g. "\0")
    return (string)self::clean($repaired, true, true, false, true);
  }
1134 5
1135
  /**
   * Accepts a string or a array of strings and returns an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If True, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    // a plain string is split into its characters first, an array is used as it is
    $chars = is_string($arg) ? self::split($arg) : $arg;

    $points = array_map(array('\\voku\\helper\\UTF8', 'ord'), $chars);

    if (!$u_style) {
      return $points;
    }

    // convert every integer via int_to_hex() (called with its default prefix)
    return array_map(array('\\voku\\helper\\UTF8', 'int_to_hex'), $points);
  }
1172 1
1173 1
  /**
   * Returns count of characters used in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array of Character as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // one array entry per character ...
    $chars = self::split($str, 1, $cleanUtf8);

    // ... then count how often each distinct entry occurs
    return array_count_values($chars);
  }
1186
1187
  /**
   * Get a UTF-8 character from its decimal code representation.
   *
   * The code is written as a hexadecimal HTML entity ("&#x...;") which mbstring
   * then converts from 'HTML-ENTITIES' to 'UTF-8'.
   *
   * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of PHP 8.2.
   *
   * @param int $code
   *
   * @return string
   */
  public static function decimal_to_chr($code)
  {
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1202 1
1203 1
  /**
   * Encode a string with a new charset-encoding.
   *
   * INFO:  The different to "UTF8::utf8_encode()" is that this function, try to fix also broken / double encoding,
   *        so you can call this function also on a UTF-8 String and you don't mess the string.
   *
   * The input is returned unchanged when it (or the target encoding) is empty, when no encoding
   * could be detected, when $force is off and the detected encoding already is the target
   * encoding, or when the generic conversion does not produce a non-empty string.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>Force the new encoding (we try to fix broken / double encoding for UTF-8)<br
   *                         /> otherwise we auto-detect the current string-encoding</p>
   *
   * @return string
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $encodingDetected = self::str_detect_encoding($str);

    // the detection yields false|string: test explicitly for a usable name
    // instead of relying on loose truthiness
    if (!is_string($encodingDetected) || $encodingDetected === '') {
      return $str;
    }

    // without $force there is nothing to do if the string already has the target encoding
    if ($force !== true && $encodingDetected === $encoding) {
      return $str;
    }

    if (
        $encoding === 'UTF-8'
        &&
        (
            $force === true
            || in_array($encodingDetected, array('UTF-8', 'WINDOWS-1252', 'ISO-8859-1'), true)
        )
    ) {
      return self::to_utf8($str);
    }

    if (
        $encoding === 'ISO-8859-1'
        &&
        (
            $force === true
            || in_array($encodingDetected, array('ISO-8859-1', 'UTF-8'), true)
        )
    ) {
      return self::to_iso8859($str);
    }

    // generic conversion for every other combination
    $strEncoded = \mb_convert_encoding(
        $str,
        $encoding,
        $encodingDetected
    );

    // strict check: a converted result of "0" is valid and must not be discarded
    if (is_string($strEncoded) && $strEncoded !== '') {
      return $strEncoded;
    }

    return $str;
  }
1279
1280
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>Name of the file to read. It is passed through
   *                                     filter_var(..., FILTER_SANITIZE_STRING) before it is used.</p>
   * @param int|null      $flags         [optional] <p>Handed over (as boolean) as the second argument of the
   *                                     native file_get_contents(), e.g. FILE_USE_INCLUDE_PATH to search
   *                                     for the file in the include path.</p>
   * @param resource|null $context       [optional] <p>
   *                                     A valid context resource created with
   *                                     stream_context_create. If it is null and $timeout is not 0,
   *                                     a context with the "http" timeout option is created.
   *                                     </p>
   * @param int|null      $offset        [optional] <p>
   *                                     The offset where the reading starts.
   *                                     </p>
   * @param int|null      $maxlen        [optional] <p>
   *                                     Maximum length of data read. The default is to read until end
   *                                     of file is reached. Only integer values are taken into account.
   *                                     </p>
   * @param int           $timeout       <p>The time in seconds for the timeout.</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The function returns the read data or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;
    // NOTE(review): FILTER_SANITIZE_STRING is deprecated as of PHP 8.1 and alters file names
    //               which contain quotes or tags - kept, because callers may rely on it
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    // the native function expects a bool and an int here, passing null to these
    // parameters is deprecated since PHP 8.1; the casts yield what null was coerced to anyway
    $useIncludePath = (bool)$flags;
    $offset = (int)$offset;

    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // the length argument is only handed over if the caller provided a real integer
    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1397 9
1398 9
  /**
   * Checks if a file starts with BOM (Byte Order Mark) character.
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise
   *              (also if the file could not be read).</p>
   */
  public static function file_has_bom($file_path)
  {
    $content = file_get_contents($file_path);

    // a file that cannot be read has no content to inspect
    if ($content === false) {
      return false;
    }

    return self::string_has_bom($content);
  }
1409 9
1410 8
  /**
   * Normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * - arrays and objects are processed recursively, element by element / property by property
   * - in strings "\r\n" and "\r" are replaced by "\n"; non-ASCII strings are additionally
   *   normalized via \Normalizer, with encode('UTF-8', ...) as fallback if the normalization
   *   does not return a non-empty string
   * - every other type is returned untouched
   *
   * @param mixed  $var
   * @param int    $normalization_form
   * @param string $leading_combining <p>Prefix for strings which would otherwise start with a combining mark.</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    switch (gettype($var)) {
      case 'array':
        foreach ($var as $key => $value) {
          /** @noinspection AlterInForeachInspection */
          $var[$key] = self::filter($value, $normalization_form, $leading_combining);
        }
        break;

      case 'object':
        foreach ($var as $property => $value) {
          $var->{$property} = self::filter($value, $normalization_form, $leading_combining);
        }
        break;

      case 'string':
        if (strpos($var, "\r") !== false) {
          // Workaround https://bugs.php.net/65732
          $var = str_replace(array("\r\n", "\r"), "\n", $var);
        }

        // pure ASCII needs neither normalization nor a prefix
        if (self::is_ascii($var) !== false) {
          break;
        }

        if (\Normalizer::isNormalized($var, $normalization_form)) {
          // non-empty marker: "the string is fine as it is"
          $normalized = '-';
        } else {
          $normalized = \Normalizer::normalize($var, $normalization_form);

          // use the normalized string if there is one, otherwise re-encode the input
          $var = isset($normalized[0]) ? $normalized : self::encode('UTF-8', $var);
        }

        $startsWithCombiningMark = $var[0] >= "\x80"
                                   && isset($normalized[0], $leading_combining[0])
                                   && preg_match('/^\p{Mn}/u', $var);

        if ($startsWithCombiningMark) {
          // Prevent leading combining chars
          // for NFC-safe concatenations.
          $var = $leading_combining . $var;
        }
        break;
    }

    return $var;
  }
1469
1470
  /**
   * "filter_input()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param int    $type
   * @param string $var
   * @param int    $filter
   * @param mixed  $option <p>Only handed over to filter_input() if the caller passed it explicitly.</p>
   *
   * @return mixed <p>The result of filter_input(), passed through UTF8::filter().</p>
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    // the argument count (not the value of $option) decides which native call is used
    $value = func_num_args() < 4
        ? filter_input($type, $var, $filter)
        : filter_input($type, $var, $filter, $option);

    return self::filter($value);
  }
1490
1491 1
  /**
   * "filter_input_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param int   $type
   * @param mixed $definition <p>Only handed over (together with $add_empty) if the caller passed it explicitly.</p>
   * @param bool  $add_empty
   *
   * @return mixed <p>The result of filter_input_array(), passed through UTF8::filter().</p>
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    // the argument count (not the value of $definition) decides which native call is used
    $values = func_num_args() < 2
        ? filter_input_array($type)
        : filter_input_array($type, $definition, $add_empty);

    return self::filter($values);
  }
1510
1511 1
  /**
   * "filter_var()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param mixed $var
   * @param int   $filter
   * @param mixed $option <p>Only handed over to filter_var() if the caller passed it explicitly.</p>
   *
   * @return mixed <p>The result of filter_var(), passed through UTF8::filter().</p>
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    // the argument count (not the value of $option) decides which native call is used
    $value = func_num_args() < 3
        ? filter_var($var, $filter)
        : filter_var($var, $filter, $option);

    return self::filter($value);
  }
1530 1
1531
  /**
   * "filter_var_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param array $data
   * @param mixed $definition <p>Only handed over (together with $add_empty) if the caller passed it explicitly.</p>
   * @param bool  $add_empty
   *
   * @return mixed <p>The result of filter_var_array(), passed through UTF8::filter().</p>
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    // the argument count (not the value of $definition) decides which native call is used
    $values = func_num_args() < 2
        ? filter_var_array($data)
        : filter_var_array($data, $definition, $add_empty);

    return self::filter($values);
  }
1550
1551 7
  /**
   * Check if the number of unicode characters are not more than the specified integer.
   *
   * @param string $str      The original string to be checked.
   * @param int    $box_size The size in number of chars to be checked against string.
   *
   * @return bool true if string is less than or equal to $box_size, false otherwise.
   */
  public static function fits_inside($str, $box_size)
  {
    // length as reported by UTF8::strlen(), not by the native strlen()
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1563
1564
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The input with every key of the "broken UTF-8" table replaced by its value.</p>
   */
  public static function fix_simple_utf8($str)
  {
    // search / replace lists, built once and kept for all further calls
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($search === null) {
      $search = array_keys(self::$brokenUtf8ToUtf8);
      $replace = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($search, $replace, $str);
  }
1595
1596
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * The value is decoded and re-encoded again and again until a pass no longer changes it.
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed <p>The fixed string, or an array (same keys) of fixed values for an array input.</p>
   */
  public static function fix_utf8($str)
  {
    if (is_array($str)) {
      // fix every element on its own (recursively), the keys are kept
      $fixed = array();
      foreach ($str as $key => $value) {
        $fixed[$key] = self::fix_utf8($value);
      }

      return $fixed;
    }

    $previous = '';
    while ($previous !== $str) {
      // remember the current state to be able to detect "nothing changed"
      $previous = $str;
      $str = self::to_utf8(self::utf8_decode($str));
    }

    return $str;
  }
1624
1625 1
  /**
1626 1
   * Get the text direction ('LTR' or 'RTL') of a specific character.
1627
   *
1628
   * @param string $char
1629 1
   *
1630
   * @return string <p>'RTL' or 'LTR'</p>
1631 1
   */
1632 1
  public static function getCharDirection($char)
  {
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      $intlDirection = \IntlChar::charDirection($char);

      // Direction codes taken from the "IntlChar" class; any other code
      // falls through to the manual code point tables below.
      if (in_array($intlDirection, array(0, 11, 12, 20), true)) {
        return 'LTR';
      }

      if (in_array($intlDirection, array(1, 13, 14, 15, 21), true)) {
        return 'RTL';
      }
    }

    $c = static::chr_to_decimal($char);

    // Everything outside 0x5be..0x10b7f is treated as left-to-right.
    if (!(0x5be <= $c && 0x10b7f >= $c)) {
      return 'LTR';
    }

    // Pick the lookup tables for the zone the code point falls into:
    // $singles are matched strictly (===), $ranges are inclusive [from, to].
    if (0x85e >= $c) {

      $singles = array(
          0x5be, 0x5c0, 0x5c3, 0x5c6,
          0x608, 0x60b, 0x60d, 0x61b,
          0x710, 0x7b1, 0x7fa,
          0x81a, 0x824, 0x828, 0x85e,
      );
      $ranges = array(
          array(0x5d0, 0x5ea),
          array(0x5f0, 0x5f4),
          array(0x61e, 0x64a),
          array(0x66d, 0x66f),
          array(0x671, 0x6d5),
          array(0x6e5, 0x6e6),
          array(0x6ee, 0x6ef),
          array(0x6fa, 0x70d),
          array(0x712, 0x72f),
          array(0x74d, 0x7a5),
          array(0x7c0, 0x7ea),
          array(0x7f4, 0x7f5),
          array(0x800, 0x815),
          array(0x830, 0x83e),
          array(0x840, 0x858),
      );

    } elseif (0x200f === $c) {

      return 'RTL';

    } elseif (0xfb1d <= $c) {

      $singles = array(
          0xfb1d, 0xfb3e,
          0x10808, 0x1083c, 0x1093f, 0x10a00,
      );
      $ranges = array(
          array(0xfb1f, 0xfb28),
          array(0xfb2a, 0xfb36),
          array(0xfb38, 0xfb3c),
          array(0xfb40, 0xfb41),
          array(0xfb43, 0xfb44),
          array(0xfb46, 0xfbc1),
          array(0xfbd3, 0xfd3d),
          array(0xfd50, 0xfd8f),
          array(0xfd92, 0xfdc7),
          array(0xfdf0, 0xfdfc),
          array(0xfe70, 0xfe74),
          array(0xfe76, 0xfefc),
          array(0x10800, 0x10805),
          array(0x1080a, 0x10835),
          array(0x10837, 0x10838),
          array(0x1083f, 0x10855),
          array(0x10857, 0x1085f),
          array(0x10900, 0x1091b),
          array(0x10920, 0x10939),
          array(0x10a10, 0x10a13),
          array(0x10a15, 0x10a17),
          array(0x10a19, 0x10a33),
          array(0x10a40, 0x10a47),
          array(0x10a50, 0x10a58),
          array(0x10a60, 0x10a7f),
          array(0x10b00, 0x10b35),
          array(0x10b40, 0x10b55),
          array(0x10b58, 0x10b72),
          array(0x10b78, 0x10b7f),
      );

    } else {

      // Between the two zones only 0x200f is right-to-left.
      return 'LTR';

    }

    if (in_array($c, $singles, true)) {
      return 'RTL';
    }

    foreach ($ranges as $range) {
      if ($range[0] <= $c && $range[1] >= $c) {
        return 'RTL';
      }
    }

    return 'LTR';
  }
1744 1
1745
  /**
1746
   * Get data from "/data/*.php".
1747 1
   *
1748
   * @param string $file
1749
   *
1750
   * @return bool|string|array|int <p>Will return false on error.</p>
1751
   */
1752
  private static function getData($file)
  {
    // Data files live next to this class, in "data/<name>.php".
    $path = __DIR__ . '/data/' . $file . '.php';

    if (!file_exists($path)) {
      return false;
    }

    // The data file's own "return" value is handed back to the caller.
    /** @noinspection PhpIncludeInspection */
    return require $path;
  }
1762
1763 1
  /**
1764
   * Converts hexadecimal U+xxxx code point representation to integer.
1765
   *
1766
   * INFO: opposite to UTF8::int_to_hex()
1767
   *
1768
   * @param string $str <p>The hexadecimal code point representation.</p>
1769
   *
1770
   * @return int|false <p>The code point, or false on failure.</p>
1771
   */
1772
  public static function hex_to_int($str)
  {
    // Accepted input: an optional "\u" or "U+" prefix followed by 4 to 6
    // hexadecimal digits (case-insensitive).
    //
    // The digit class is limited to [0-9a-f]. The former [a-z0-9] class also
    // matched non-hex letters, so e.g. "zzzz" was converted by intval() to 0
    // and "U+12gh" to 0x12 instead of reporting failure.
    if (preg_match('/^(?:\\\u|U\+|)([0-9a-f]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    // Not a valid code point representation.
    return false;
  }
1780 2
1781
  /**
1782 2
   * alias for "UTF8::html_entity_decode()"
1783 2
   *
1784 1
   * @see UTF8::html_entity_decode()
1785 1
   *
1786
   * @param string $str
1787 2
   * @param int    $flags
1788
   * @param string $encoding
1789
   *
1790
   * @return string
1791 2
   */
1792 2
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    // Pure alias: all arguments are forwarded unchanged.
    $decoded = self::html_entity_decode($str, $flags, $encoding);

    return $decoded;
  }
1796
1797
  /**
1798
   * Converts a UTF-8 string to a series of HTML numbered entities.
1799
   *
1800
   * INFO: opposite to UTF8::html_decode()
1801
   *
1802
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
1803
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
1804
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
1805
   *
1806
   * @return string <p>HTML numbered entities.</p>
1807
   */
1808
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // First code point to convert: 0x80 leaves the ASCII range untouched.
      $startCode = ($keepAsciiChars === true) ? 0x80 : 0x00;

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = (first code point, last code point, offset, mask).
      // The range ends at U+10FFFF (the last Unicode code point) and the
      // mask keeps 21 bits; the previous 0xffff limits left characters
      // above the Basic Multilingual Plane (e.g. emoji) unencoded.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // Fallback without mbstring: encode character by character.
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1838
1839
  /**
1840
   * UTF-8 version of html_entity_decode()
1841
   *
1842
   * The reason we are not using html_entity_decode() by itself is because
1843
   * while it is not technically correct to leave out the semicolon
1844
   * at the end of an entity most browsers will still interpret the entity
1845
   * correctly. html_entity_decode() does not convert entities without
1846
   * semicolons, so we are left with our own little solution here. Bummer.
1847
   *
1848
   * Convert all HTML entities to their applicable characters
1849
   *
1850
   * INFO: opposite to UTF8::html_encode()
1851
   *
1852
   * @link http://php.net/manual/en/function.html-entity-decode.php
1853
   *
1854
   * @param string $str      <p>
1855
   *                         The input string.
1856
   *                         </p>
1857
   * @param int    $flags    [optional] <p>
1858
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1859
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1860
   *                         <table>
1861
   *                         Available <i>flags</i> constants
1862
   *                         <tr valign="top">
1863
   *                         <td>Constant Name</td>
1864
   *                         <td>Description</td>
1865
   *                         </tr>
1866
   *                         <tr valign="top">
1867
   *                         <td><b>ENT_COMPAT</b></td>
1868
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1869
   *                         </tr>
1870
   *                         <tr valign="top">
1871
   *                         <td><b>ENT_QUOTES</b></td>
1872
   *                         <td>Will convert both double and single quotes.</td>
1873
   *                         </tr>
1874
   *                         <tr valign="top">
1875
   *                         <td><b>ENT_NOQUOTES</b></td>
1876
   *                         <td>Will leave both double and single quotes unconverted.</td>
1877 18
   *                         </tr>
1878
   *                         <tr valign="top">
1879 18
   *                         <td><b>ENT_HTML401</b></td>
1880
   *                         <td>
1881 18
   *                         Handle code as HTML 4.01.
1882 6
   *                         </td>
1883
   *                         </tr>
1884
   *                         <tr valign="top">
1885 18
   *                         <td><b>ENT_XML1</b></td>
1886 7
   *                         <td>
1887
   *                         Handle code as XML 1.
1888
   *                         </td>
1889 18
   *                         </tr>
1890 1
   *                         <tr valign="top">
1891 1
   *                         <td><b>ENT_XHTML</b></td>
1892
   *                         <td>
1893 18
   *                         Handle code as XHTML.
1894 4
   *                         </td>
1895 4
   *                         </tr>
1896 4
   *                         <tr valign="top">
1897
   *                         <td><b>ENT_HTML5</b></td>
1898
   *                         <td>
1899 4
   *                         Handle code as HTML 5.
1900
   *                         </td>
1901
   *                         </tr>
1902 18
   *                         </table>
1903
   *                         </p>
1904 18
   * @param string $encoding [optional] <p>Encoding to use.</p>
1905 18
   *
1906
   * @return string <p>The decoded string.</p>
1907 16
   */
1908
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Nothing to decode if the string is shorter than four bytes or holds
    // no "&" at all.
    //
    // A ";" is deliberately NOT required here: this method exists to decode
    // entities that lack the trailing semicolon too (see the regex below).
    // The former "no semicolon => return early" check made inputs such as
    // "&#38 x" come back untouched.
    if (
        !isset($str[3])
        ||
        strpos($str, '&') === false
    ) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($flags === null) {
      if (Bootup::is_php('5.4') === true) {
        $flags = ENT_COMPAT | ENT_HTML5;
      } else {
        $flags = ENT_COMPAT;
      }
    }

    // Repeat until a pass changes nothing, so nested (double-encoded)
    // entities are resolved as well.
    do {
      $str_compare = $str;

      // Decimal numeric entities with a semicolon. Entities that decode to a
      // quote character are left as they are at this step (presumably so the
      // quote handling selected via $flags is applied by html_entity_decode()
      // below).
      $str = preg_replace_callback(
          "/&#\d{2,5};/",
          function ($matches) {
            $returnTmp = \mb_convert_encoding($matches[0], 'UTF-8', 'HTML-ENTITIES');

            if ($returnTmp !== '"' && $returnTmp !== "'") {
              return $returnTmp;
            } else {
              return $matches[0];
            }
          },
          $str
      );

      // decode numeric & UTF16 two byte entities
      // (the inner preg_replace() appends the missing ";" to numeric entities
      // first, so the native function can handle them)
      $str = html_entity_decode(
          preg_replace('/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS', '$1;', $str),
          $flags,
          $encoding
      );

    } while ($str_compare !== $str);

    return $str;
  }
1968
1969
  /**
1970
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1971
   *
1972
   * @link http://php.net/manual/en/function.htmlentities.php
1973
   *
1974
   * @param string $str           <p>
1975
   *                              The input string.
1976
   *                              </p>
1977
   * @param int    $flags         [optional] <p>
1978
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
1979
   *                              invalid code unit sequences and the used document type. The default is
1980
   *                              ENT_COMPAT | ENT_HTML401.
1981
   *                              <table>
1982
   *                              Available <i>flags</i> constants
1983
   *                              <tr valign="top">
1984
   *                              <td>Constant Name</td>
1985
   *                              <td>Description</td>
1986
   *                              </tr>
1987
   *                              <tr valign="top">
1988
   *                              <td><b>ENT_COMPAT</b></td>
1989
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
1990
   *                              </tr>
1991
   *                              <tr valign="top">
1992
   *                              <td><b>ENT_QUOTES</b></td>
1993
   *                              <td>Will convert both double and single quotes.</td>
1994
   *                              </tr>
1995
   *                              <tr valign="top">
1996
   *                              <td><b>ENT_NOQUOTES</b></td>
1997
   *                              <td>Will leave both double and single quotes unconverted.</td>
1998
   *                              </tr>
1999
   *                              <tr valign="top">
2000
   *                              <td><b>ENT_IGNORE</b></td>
2001
   *                              <td>
2002
   *                              Silently discard invalid code unit sequences instead of returning
2003
   *                              an empty string. Using this flag is discouraged as it
2004
   *                              may have security implications.
2005
   *                              </td>
2006
   *                              </tr>
2007
   *                              <tr valign="top">
2008
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2009
   *                              <td>
2010
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2011
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2012
   *                              </td>
2013
   *                              </tr>
2014
   *                              <tr valign="top">
2015
   *                              <td><b>ENT_DISALLOWED</b></td>
2016
   *                              <td>
2017
   *                              Replace invalid code points for the given document type with a
2018
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2019
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2020
   *                              instance, to ensure the well-formedness of XML documents with
2021
   *                              embedded external content.
2022
   *                              </td>
2023
   *                              </tr>
2024
   *                              <tr valign="top">
2025
   *                              <td><b>ENT_HTML401</b></td>
2026
   *                              <td>
2027
   *                              Handle code as HTML 4.01.
2028
   *                              </td>
2029
   *                              </tr>
2030
   *                              <tr valign="top">
2031
   *                              <td><b>ENT_XML1</b></td>
2032
   *                              <td>
2033 2
   *                              Handle code as XML 1.
2034
   *                              </td>
2035 2
   *                              </tr>
2036 1
   *                              <tr valign="top">
2037 1
   *                              <td><b>ENT_XHTML</b></td>
2038
   *                              <td>
2039 2
   *                              Handle code as XHTML.
2040
   *                              </td>
2041 2
   *                              </tr>
2042 1
   *                              <tr valign="top">
2043
   *                              <td><b>ENT_HTML5</b></td>
2044
   *                              <td>
2045 2
   *                              Handle code as HTML 5.
2046 2
   *                              </td>
2047 2
   *                              </tr>
2048 2
   *                              </table>
2049 2
   *                              </p>
2050 1
   * @param string $encoding      [optional] <p>
2051
   *                              Like <b>htmlspecialchars</b>,
2052 1
   *                              <b>htmlentities</b> takes an optional third argument
2053 1
   *                              <i>encoding</i> which defines encoding used in
2054 1
   *                              conversion.
2055 1
   *                              Although this argument is technically optional, you are highly
2056 1
   *                              encouraged to specify the correct value for your code.
2057 2
   *                              </p>
2058
   * @param bool   $double_encode [optional] <p>
2059 2
   *                              When <i>double_encode</i> is turned off PHP will not
2060
   *                              encode existing html entities. The default is to convert everything.
2061
   *                              </p>
2062
   *
2063
   *
2064
   * @return string <p>
   *                The encoded string.
   *                </p>
   *                <p>
   *                If the input <i>string</i> contains an invalid code unit
   *                sequence within the given <i>encoding</i> an empty string
   *                will be returned, unless either the <b>ENT_IGNORE</b> or
   *                <b>ENT_SUBSTITUTE</b> flags are set.
   *                </p>
2071
   */
2072
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // The extra pass below is only done for UTF-8 output.
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // Replace every remaining character that is 3 or more bytes long with
    // its numeric entity; each distinct character is encoded only once.
    $byteLengths = self::chr_size_list($str);
    $search = array();
    $replacements = array();
    foreach ($byteLengths as $counter => $byteLength) {
      if ($byteLength < 3) {
        continue;
      }

      $char = self::access($str, $counter);

      // Static analysis reports that access() can return false. Such a value
      // must not be used as an array key or handed to html_encode(), so the
      // position is skipped.
      if ($char === false || isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $str);
  }
2100
2101
  /**
2102
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2103
   *
2104
   * INFO: Take a look at "UTF8::htmlentities()"
2105
   *
2106
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2107
   *
2108
   * @param string $str           <p>
2109
   *                              The string being converted.
2110
   *                              </p>
2111
   * @param int    $flags         [optional] <p>
2112
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2113
   *                              invalid code unit sequences and the used document type. The default is
2114
   *                              ENT_COMPAT | ENT_HTML401.
2115
   *                              <table>
2116
   *                              Available <i>flags</i> constants
2117
   *                              <tr valign="top">
2118
   *                              <td>Constant Name</td>
2119
   *                              <td>Description</td>
2120
   *                              </tr>
2121
   *                              <tr valign="top">
2122
   *                              <td><b>ENT_COMPAT</b></td>
2123
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2124
   *                              </tr>
2125
   *                              <tr valign="top">
2126
   *                              <td><b>ENT_QUOTES</b></td>
2127
   *                              <td>Will convert both double and single quotes.</td>
2128
   *                              </tr>
2129
   *                              <tr valign="top">
2130
   *                              <td><b>ENT_NOQUOTES</b></td>
2131
   *                              <td>Will leave both double and single quotes unconverted.</td>
2132
   *                              </tr>
2133
   *                              <tr valign="top">
2134
   *                              <td><b>ENT_IGNORE</b></td>
2135
   *                              <td>
2136
   *                              Silently discard invalid code unit sequences instead of returning
2137
   *                              an empty string. Using this flag is discouraged as it
2138
   *                              may have security implications.
2139
   *                              </td>
2140
   *                              </tr>
2141
   *                              <tr valign="top">
2142
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2143
   *                              <td>
2144
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2145
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2146
   *                              </td>
2147
   *                              </tr>
2148
   *                              <tr valign="top">
2149
   *                              <td><b>ENT_DISALLOWED</b></td>
2150
   *                              <td>
2151
   *                              Replace invalid code points for the given document type with a
2152
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2153
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2154
   *                              instance, to ensure the well-formedness of XML documents with
2155
   *                              embedded external content.
2156
   *                              </td>
2157
   *                              </tr>
2158
   *                              <tr valign="top">
2159
   *                              <td><b>ENT_HTML401</b></td>
2160
   *                              <td>
2161
   *                              Handle code as HTML 4.01.
2162
   *                              </td>
2163
   *                              </tr>
2164
   *                              <tr valign="top">
2165
   *                              <td><b>ENT_XML1</b></td>
2166
   *                              <td>
2167
   *                              Handle code as XML 1.
2168
   *                              </td>
2169
   *                              </tr>
2170
   *                              <tr valign="top">
2171 1
   *                              <td><b>ENT_XHTML</b></td>
2172
   *                              <td>
2173 1
   *                              Handle code as XHTML.
2174
   *                              </td>
2175
   *                              </tr>
2176
   *                              <tr valign="top">
2177 1
   *                              <td><b>ENT_HTML5</b></td>
2178
   *                              <td>
2179
   *                              Handle code as HTML 5.
2180
   *                              </td>
2181
   *                              </tr>
2182
   *                              </table>
2183
   *                              </p>
2184
   * @param string $encoding      [optional] <p>
2185 1
   *                              Defines encoding used in conversion.
2186
   *                              </p>
2187 1
   *                              <p>
2188
   *                              For the purposes of this function, the encodings
2189
   *                              ISO-8859-1, ISO-8859-15,
2190
   *                              UTF-8, cp866,
2191
   *                              cp1251, cp1252, and
2192
   *                              KOI8-R are effectively equivalent, provided the
2193
   *                              <i>string</i> itself is valid for the encoding, as
2194
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2195
   *                              the same positions in all of these encodings.
2196
   *                              </p>
2197
   * @param bool   $double_encode [optional] <p>
2198
   *                              When <i>double_encode</i> is turned off PHP will not
2199
   *                              encode existing html entities, the default is to convert everything.
2200 3
   *                              </p>
2201
   *
2202 3
   * @return string <p>
   *                The converted string.
   *                </p>
   *                <p>
   *                If the input <i>string</i> contains an invalid code unit
   *                sequence within the given <i>encoding</i> an empty string
   *                will be returned, unless either the <b>ENT_IGNORE</b> or
   *                <b>ENT_SUBSTITUTE</b> flags are set.
   *                </p>
2209
   */
2210
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // Only non-default encoding names go through normalize_encoding().
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return htmlspecialchars($str, $flags, $charset, $double_encode);
  }
2218 1
2219
  /**
2220 1
   * Checks whether iconv is available on the server.
2221
   *
2222
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2223
   */
2224
  public static function iconv_loaded()
  {
    // extension_loaded() already yields a boolean; compare strictly
    // instead of mapping it through a ternary.
    return extension_loaded('iconv') === true;
  }
2228 3
2229
  /**
2230 3
   * Converts Integer to hexadecimal U+xxxx code point representation.
2231
   *
2232
   * INFO: opposite to UTF8::hex_to_int()
2233
   *
2234
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
2235
   * @param string $pfix [optional]
2236
   *
2237
   * @return string <p>The code point, or empty string on failure.</p>
2238
   */
2239
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // Only non-negative integers (or strings made of digits) are accepted.
    if (!ctype_digit((string)$int)) {
      return '';
    }

    // Left-pad with zeros to at least four hexadecimal digits.
    $digits = str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);

    return $pfix . $digits;
  }
2251
2252
  /**
2253
   * Checks whether intl-char is available on the server.
2254
   *
2255
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2256 1
   */
2257
  public static function intlChar_loaded()
  {
    // The version check comes first; class_exists() is only consulted
    // when it passes.
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar') === true;
  }
2261
2262
  /**
2263
   * Checks whether intl is available on the server.
2264
   *
2265
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2266
   */
2267
  public static function intl_loaded()
  {
    // extension_loaded() already yields a boolean; compare strictly
    // instead of mapping it through a ternary.
    return extension_loaded('intl') === true;
  }
2271
2272
  /**
2273
   * alias for "UTF8::is_ascii()"
2274
   *
2275
   * @see UTF8::is_ascii()
2276
   *
2277
   * @param string $str
2278
   *
2279
   * @return boolean
2280
   */
2281
  public static function isAscii($str)
  {
    // Pure alias: the check itself lives in UTF8::is_ascii().
    $isAscii = self::is_ascii($str);

    return $isAscii;
  }
2285
2286
  /**
2287
   * alias for "UTF8::is_base64()"
2288
   *
2289
   * @see UTF8::is_base64()
2290
   *
2291
   * @param string $str
2292
   *
2293
   * @return bool
2294
   */
2295
  public static function isBase64($str)
  {
    // Pure alias: the check itself lives in UTF8::is_base64().
    $isBase64 = self::is_base64($str);

    return $isBase64;
  }
2299
2300 1
  /**
2301
   * alias for "UTF8::is_binary()"
2302
   *
2303
   * @see UTF8::is_binary()
2304
   *
2305
   * @param string $str
2306
   *
2307
   * @return bool
2308
   */
2309
  public static function isBinary($str)
  {
    // Pure alias: the check itself lives in UTF8::is_binary().
    $isBinary = self::is_binary($str);

    return $isBinary;
  }
2313
2314
  /**
2315
   * alias for "UTF8::is_bom()"
2316
   *
2317
   * @see UTF8::is_bom()
2318
   *
2319
   * @param string $utf8_chr
2320
   *
2321
   * @return boolean
2322
   */
2323
  public static function isBom($utf8_chr)
  {
    // Pure alias: the check itself lives in UTF8::is_bom().
    $isBom = self::is_bom($utf8_chr);

    return $isBom;
  }
2327
2328 1
  /**
2329
   * alias for "UTF8::is_html()"
2330
   *
2331
   * @see UTF8::is_html()
2332
   *
2333
   * @param string $str
2334
   *
2335
   * @return boolean
2336
   */
2337
  public static function isHtml($str)
  {
    // Pure alias: the check itself lives in UTF8::is_html().
    $isHtml = self::is_html($str);

    return $isHtml;
  }
2341
2342 1
  /**
2343
   * alias for "UTF8::is_json()"
2344
   *
2345
   * @see UTF8::is_json()
2346
   *
2347
   * @param string $str
2348
   *
2349
   * @return bool
2350
   */
2351
  public static function isJson($str)
  {
    // Pure alias: the check itself lives in UTF8::is_json().
    $isJson = self::is_json($str);

    return $isJson;
  }
2355 16
2356
  /**
2357 16
   * alias for "UTF8::is_utf16()"
2358
   *
2359
   * @see UTF8::is_utf16()
2360
   *
2361
   * @param string $str
2362
   *
2363
   * @return int|false false if it's not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.
2364
   */
2365
  public static function isUtf16($str)
2366
  {
2367
    return self::is_utf16($str);
2368
  }
2369
2370 14
  /**
   * Camel-case alias of "UTF8::is_utf32()".
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function isUtf32($str)
  {
    $utf32Type = self::is_utf32($str);

    return $utf32Type;
  }
2383
2384 1
  /**
   * Camel-case alias of "UTF8::is_utf8()".
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Forwarded unchanged to "UTF8::is_utf8()".</p>
   *
   * @return bool <p>Whatever "UTF8::is_utf8()" returns for the given arguments.</p>
   */
  public static function isUtf8($str, $strict = false)
  {
    $isUtf8 = self::is_utf8($str, $strict);

    return $isUtf8;
  }
2398
2399
  /**
   * Tells whether a string consists of 7 bit ASCII bytes only.
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool <p>
   *              <strong>false</strong> if any byte in the range 0x80-0xFF is found,<br />
   *              <strong>true</strong> otherwise (including for an empty string)
   *              </p>
   */
  public static function is_ascii($str)
  {
    // preg_match() yields 1 only when a high-bit byte was found;
    // 0 (no match) and false (error) both count as "ASCII", as before.
    $highBitFound = preg_match('/[\x80-\xFF]/', $str);

    return $highBitFound ? false : true;
  }
2413 16
2414 15
  /**
   * Tells whether a string is base64 encoded.
   *
   * The string is decoded in strict mode and encoded again; it only counts as
   * base64 if that round trip reproduces the input exactly.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p>Whether or not $str is base64 encoded; an empty string is not.</p>
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = base64_decode($str, true);
    $roundTrip = base64_encode($decoded);

    return $roundTrip === $str;
  }
2435
2436
  /**
   * Check if the input looks like binary data (heuristic).
   *
   * The input counts as binary when it either
   * - consists solely of the characters "0" and "1", or
   * - contains at least one NUL byte.
   *
   * @param mixed $input <p>The value to check; it is cast to a string first.</p>
   *
   * @return bool <p><strong>true</strong> if one of the heuristics matches, <strong>false</strong> otherwise
   *              (including for an empty input).</p>
   */
  public static function is_binary($input)
  {
    $input = (string)$input;

    if (!isset($input[0])) {
      return false;
    }

    // a string made only of the digits 0 and 1 is treated as a bit pattern
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    // The former third test, substr_count($input, '^ -~') / strlen($input) > 0.3,
    // was removed as dead code: substr_count() takes the needle literally, and a
    // 4-byte needle can cover at most 25% of the string, so it never exceeded 0.3.
    // The result is therefore unchanged for every input.
    return substr_count($input, "\x00") > 0;
  }
2460
2461
  /**
   * Check if the file is binary.
   *
   * Reads up to the first 512 bytes of the file and passes them to
   * "UTF8::is_binary()". If the file cannot be opened or read, an empty
   * string is checked instead.
   *
   * @param string $file <p>Path (or stream URL) handed to fopen().</p>
   *
   * @return bool <p>The result of "UTF8::is_binary()" for the bytes that were read.</p>
   */
  public static function is_binary_file($file)
  {
    $block = '';
    $fp = false;

    try {
      // fopen() signals failure by returning false, not by throwing
      $fp = fopen($file, 'rb');

      if ($fp !== false) {
        $chunk = fread($fp, 512);
        fclose($fp);
        $fp = false;

        if ($chunk !== false) {
          $block = $chunk;
        }
      }
    } catch (\Exception $e) {
      // only reached when an error handler turns warnings into exceptions
      if (is_resource($fp)) {
        fclose($fp);
      }

      $block = '';
    }

    return self::is_binary($block);
  }
2480 1
2481
  /**
   * Checks if the given string is exactly one of the known "Byte Order Marks".
   *
   * WARNING: Use "UTF8::string_has_bom()" if you will check BOM in a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is identical to a key of self::$bom,
   *              <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    // the BOM byte strings are the keys of self::$bom; compare strictly
    $knownBoms = array_keys(self::$bom);

    return in_array($str, $knownBoms, true);
  }
2500 1
2501
  /**
   * Check if the string contains at least one html-tag, e.g. "<lall>".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if a tag-like sequence was found,
   *              <strong>false</strong> otherwise (including for an empty string).</p>
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    // opening or closing tag, optional attributes, optional self-closing slash
    $tagPattern = "/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/";

    return preg_match($tagPattern, $str) === 1;
  }
2527
2528 4
  /**
   * Try to check if "$str" is a json-string.
   *
   * The string is decoded via "UTF8::json_decode()". It counts as JSON when
   * the decoded value is an object or an array and json_last_error() reports
   * no error. Scalar documents such as "1" or "true" are still rejected.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> for a JSON object or JSON array,
   *              <strong>false</strong> otherwise (including for an empty string).</p>
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    // a top-level JSON array ("[1,2]") decodes to a PHP array, not an object,
    // and is accepted as well
    if (!is_object($decoded) && !is_array($decoded)) {
      return false;
    }

    return json_last_error() === JSON_ERROR_NONE;
  }
2553 4
2554 4
  /**
   * Check if the string is UTF-16.
   *
   * After removing a BOM, the string is only examined further if
   * "UTF8::is_binary()" accepts it. For each byte order the string is converted
   * to UTF-8 and round-tripped; if the round trip is stable, a score is built
   * from the characters reported by "UTF8::count_chars()". The byte order with
   * the higher score wins; equal scores mean "not UTF-16".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-16,<br />
   *                   <strong>1</strong> for UTF-16LE,<br />
   *                   <strong>2</strong> for UTF-16BE.
   *                   </p>
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    $scores = array();

    // little endian is probed first, exactly like before
    foreach (array('UTF-16LE', 'UTF-16BE') as $encoding) {
      $scores[$encoding] = 0;

      $decoded = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$decoded) {
        continue;
      }

      $reEncoded = \mb_convert_encoding($decoded, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $encoding);
      if ($roundTrip !== $decoded) {
        continue;
      }

      $inputChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $inputChars, true) === true) {
          $scores[$encoding]++;
        }
      }
    }

    if ($scores['UTF-16BE'] === $scores['UTF-16LE']) {
      return false;
    }

    return $scores['UTF-16LE'] > $scores['UTF-16BE'] ? 1 : 2;
  }
2613 3
2614 3
  /**
   * Check if the string is UTF-32.
   *
   * After removing a BOM, the string is only examined further if
   * "UTF8::is_binary()" accepts it. For each byte order the string is converted
   * to UTF-8 and round-tripped; if the round trip is stable, a score is built
   * from the characters reported by "UTF8::count_chars()". The byte order with
   * the higher score wins; equal scores mean "not UTF-32".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    $scores = array();

    // little endian is probed first, exactly like before
    foreach (array('UTF-32LE', 'UTF-32BE') as $encoding) {
      $scores[$encoding] = 0;

      $decoded = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$decoded) {
        continue;
      }

      $reEncoded = \mb_convert_encoding($decoded, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($reEncoded, 'UTF-8', $encoding);
      if ($roundTrip !== $decoded) {
        continue;
      }

      $inputChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $inputChars, true) === true) {
          $scores[$encoding]++;
        }
      }
    }

    if ($scores['UTF-32BE'] === $scores['UTF-32LE']) {
      return false;
    }

    return $scores['UTF-32LE'] > $scores['UTF-32BE'] ? 1 : 2;
  }
2673 41
2674
  /**
   * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
   *
   * An empty string counts as valid. A string that ends in the middle of a
   * multi-octet sequence (e.g. a lone lead byte) is rejected.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
   *
   * @return bool <p><strong>true</strong> if $str is well-formed UTF-8, <strong>false</strong> otherwise.</p>
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    if (self::pcre_utf8_support() !== true) {
      // If even just the first character can be matched, when the /u
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
      // invalid, nothing at all will match, even if the string contains
      // some valid sequences
      //
      // NOTE(review): this branch runs when pcre_utf8_support() does NOT
      // report true, yet it relies on the /u modifier - confirm that the
      // condition is not inverted.
      return (preg_match('/^.{1}/us', $str) === 1);
    }

    $mState = 0; // number of continuation octets still expected for the current sequence
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // total number of octets announced by the current lead octet
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);

      if ($mState === 0) {
        // Between sequences: expect a US-ASCII octet or the lead octet of a
        // multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          /* First octet of 5 octet sequence.
           *
           * This is illegal because the encoded codepoint must be either
           * (a) not the shortest form or
           * (b) outside the Unicode range of 0-0x10FFFF.
           * Rather than trying to resynchronize, we will carry on until the end
           * of the sequence and let the later error handling code catch it.
           */
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
          return false;
        }

        continue;
      }

      // Inside a sequence: only a continuation octet (10xxxxxx) is legal.
      if (0x80 !== (0xC0 & $in)) {
        return false;
      }

      $shift = ($mState - 1) * 6;
      $mUcs4 |= ($in & 0x0000003F) << $shift;

      if (0 === --$mState) {
        // End of the multi-octet sequence: $mUcs4 holds the final code point.
        if (
            // From Unicode 3.1, non-shortest form is illegal
            (2 === $mBytes && $mUcs4 < 0x0080) ||
            (3 === $mBytes && $mUcs4 < 0x0800) ||
            (4 === $mBytes && $mUcs4 < 0x10000) ||
            (4 < $mBytes) ||
            // From Unicode 3.2, surrogate characters are illegal.
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)
        ) {
          return false;
        }

        // reset for the next character
        $mState = 0;
        $mUcs4 = 0;
        $mBytes = 1;
      }
    }

    // A non-zero state means the string ended before the announced
    // continuation octets arrived, i.e. the last sequence is truncated.
    return $mState === 0;
  }
2818 2
2819
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Decodes a JSON string after passing it through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>The <i>json</i> string being decoded.</p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into
   *                        associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>User specified recursion depth.</p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options; only forwarded to the native
   *                        function when Bootup::is_php('5.4') returns true.
   *                        </p>
   *
   * @return mixed <p>The value returned by the native json_decode() for the filtered input.</p>
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    // the 4-argument form of the native function is used only on PHP >= 5.4
    return Bootup::is_php('5.4') === true
        ? json_decode($filtered, $assoc, $depth, $options)
        : json_decode($filtered, $assoc, $depth);
  }
2867 2
2868
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Returns the JSON representation of a value after passing it through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>The <i>value</i> being encoded.</p>
   * @param int   $options [optional] <p>
   *                       Bitmask of JSON_* encode constants, forwarded to the native function.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Maximum depth; only forwarded to the native function when
   *                       Bootup::is_php('5.5') is truthy.
   *                       </p>
   *
   * @return string <p>A JSON encoded string on success or <b>FALSE</b> on failure.</p>
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    // the 3-argument form of the native function is used only on PHP >= 5.5
    return Bootup::is_php('5.5')
        ? json_encode($filtered, $options, $depth)
        : json_encode($filtered, $options);
  }
2916
2917
  /**
   * Makes string's first char lowercase.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string; an empty string for empty input.</p>
   */
  public static function lcfirst($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $first = self::substr($str, 0, 1);
    $rest = self::substr($str, 1);

    // UTF8::substr() may return false (e.g. nothing left after the first
    // character); the casts keep strtolower() and the concatenation on strings.
    return self::strtolower((string)$first) . (string)$rest;
  }
2928
2929 1
  /**
   * Strip whitespace or other characters from beginning of a UTF-8 string.
   *
   * Without $chars (INF, '', null, false, ...) leading characters of the
   * Unicode categories "Z" (separators) and "C" (control/other) are removed.
   * Otherwise $chars is turned into a character class by "UTF8::rxClass()".
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars <p>Optional characters to be stripped</p>
   *
   * @return string <p>The string with unwanted characters stripped from the left.</p>
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    // The string '0' is falsy in PHP but is a legitimate character to strip,
    // so it must not fall into the "no chars given" branch.
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
    }

    $chars = self::rxClass($chars);

    return preg_replace("/^{$chars}+/u", '', $str);
  }
2954
2955 2
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the highest code point, or an empty string
   *                if no code points were found.</p>
   */
  public static function max($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codePoints = self::codepoints($arg);

    // max() on an empty list emits a warning (PHP < 8) or throws a ValueError
    // (PHP 8+), so bail out early, as max_chr_width() does for its list
    if (!is_array($codePoints) || count($codePoints) === 0) {
      return '';
    }

    return self::chr(max($codePoints));
  }
2970
2971
  /**
   * Calculates and returns the maximum number of bytes taken by any
   * UTF-8 encoded character in the given string.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>The largest value reported by "UTF8::chr_size_list()", or 0 if that list is empty.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    return count($sizes) > 0 ? (int)max($sizes) : 0;
  }
2989
  /**
2990
   * Checks whether mbstring is available on the server.
2991 1
   *
2992
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2993 1
   */
2994
  public static function mbstring_loaded()
2995
  {
2996
    $return = extension_loaded('mbstring');
2997
2998
    if ($return === true) {
2999
      \mb_internal_encoding('UTF-8');
3000
    }
3001
3002
    return $return;
3003 4
  }
3004
3005 4
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the lowest code point, or an empty string
   *                if no code points were found.</p>
   */
  public static function min($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codePoints = self::codepoints($arg);

    // min() on an empty list emits a warning (PHP < 8) or throws a ValueError
    // (PHP 8+), so bail out early, as max_chr_width() does for its list
    if (!is_array($codePoints) || count($codePoints) === 0) {
      return '';
    }

    return self::chr(min($codePoints));
  }
3020 3
3021
  /**
   * Camel-case alias of "UTF8::normalize_encoding()".
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>The encoding name to normalize.</p>
   *
   * @return string <p>Whatever "UTF8::normalize_encoding()" returns for $encoding.</p>
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
3034 2
3035 2
  /**
   * Normalize the encoding-"name" input.
   *
   * "UTF-8" and names listed in self::$iconvEncoding are returned untouched.
   * Every other name is upper-cased and, if its alphanumeric form is a known
   * alias, replaced by the canonical name. Results are cached per input name.
   *
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
   *
   * @return string|false <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.;
   *                      <strong>false</strong> if $encoding is empty.</p>
   */
  public static function normalize_encoding($encoding)
  {
    static $cache = array();

    if (!$encoding) {
      return false;
    }

    if ('UTF-8' === $encoding || in_array($encoding, self::$iconvEncoding, true)) {
      return $encoding;
    }

    if (isset($cache[$encoding])) {
      return $cache[$encoding];
    }

    // alias (upper-case, punctuation removed) => canonical encoding name
    $aliases = array(
        'ISO88591'    => 'ISO-8859-1',
        'ISO8859'     => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'WIN1252'     => 'ISO-8859-1',
        'WINDOWS1252' => 'ISO-8859-1',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    );

    $normalized = strtoupper($encoding);
    $aliasKey = preg_replace('/[^a-zA-Z0-9\s]/', '', $normalized);

    if (!empty($aliases[$aliasKey])) {
      $normalized = $aliases[$aliasKey];
    }

    // cached under the name exactly as the caller passed it
    $cache[$encoding] = $normalized;

    return $normalized;
  }
3091
3092 2
  /**
   * Replace the special characters listed in self::$utf8MSWord (keys) with their
   * mapped replacements (values), e.g. typographic characters produced by MS Word.
   *
   * @param string $str <p>The string to be normalized.</p>
   *
   * @return string <p>The string after the replacement.</p>
   */
  public static function normalize_msword($str)
  {
    // split the lookup table only once per request: [0] => search strings, [1] => replacements
    static $lookup = null;

    if (null === $lookup) {
      $lookup = array(
          array_keys(self::$utf8MSWord),
          array_values(self::$utf8MSWord),
      );
    }

    return str_replace($lookup[0], $lookup[1], $str);
  }
3111
3112
  /**
   * Normalize the whitespace: every entry of self::$whitespaceTable is replaced by a plain space.
   *
   * @param string $str                     <p>The string to be normalized.</p>
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to (strict) true, to keep non-breaking-spaces.</p>
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
   *                                        bidirectional text chars.</p>
   *
   * @return string <p>The normalized string.</p>
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // The cache slot must be derived from the same strict comparison that decides the table content,
    // otherwise a truthy non-bool value (e.g. 1) would fill slot "1" with a table that still contains
    // the non-breaking space and every later call with "true" would reuse that wrong table.
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = (int)$keepNbsp;

    if (!isset($whitespaces[$cacheKey])) {
      $table = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      // bidi control chars are removed completely, not replaced by a space
      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3151
3152
  /**
   * Format a number with grouped thousands.
   *
   * Native "number_format()" only handles multi-byte separators since PHP 5.4, older versions use
   * just the first byte of each separator. For those versions the number is formatted with the
   * single-byte defaults first and the separators are swapped in afterwards.
   *
   * @param float  $number        <p>The number to format.</p>
   * @param int    $decimals      [optional] <p>Number of decimal digits.</p>
   * @param string $dec_point     [optional] <p>Separator for the decimal point.</p>
   * @param string $thousands_sep [optional] <p>Thousands separator.</p>
   *
   * @return string <p>The formatted number.</p>
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    if (
        PHP_VERSION_ID < 50400
        &&
        (isset($thousands_sep[1]) || isset($dec_point[1]))
    ) {
      // strtr() replaces both separators in one pass, so a "," inside $dec_point
      // is never mistaken for a thousands separator (chained str_replace() did that)
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3189
3190
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * Only the first (up to) four bytes of the input are decoded and the continuation
   * bytes are not validated.
   *
   * @param string $chr <p>The character of which to calculate code point.</p>
   *
   * @return int <p>
   *             Unicode code point of the given character,<br />
   *             0 if the input is empty.
   *             </p>
   */
  public static function ord($chr)
  {
    // results of the manual decoding, keyed by the original input
    static $cache = array();

    // "0" is falsy in PHP, but it is a real character
    if (!$chr && $chr !== '0') {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      $intlResult = \IntlChar::ord($chr);
      if ($intlResult) {
        return $intlResult;
      }
      // falsy result from "IntlChar" -> fall through to the manual decoding
    }

    if (isset($cache[$chr]) === true) {
      return $cache[$chr];
    }

    // 1-based array of the first (max.) four byte values
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if (0xF0 <= $lead && isset($bytes[4])) {
      // four byte sequence
      $codePoint = (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    } elseif (0xE0 <= $lead && isset($bytes[3])) {
      // three byte sequence
      $codePoint = (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    } elseif (0xC0 <= $lead && isset($bytes[2])) {
      // two byte sequence
      $codePoint = (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    } else {
      // single byte (or a lead byte without enough following bytes)
      $codePoint = $lead;
    }

    $cache[$chr] = $codePoint;

    return $codePoint;
  }
3243 1
3244
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike "parse_str()" this method never (re-)places variables in the current scope,
   *          the result is only available via the second parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str    <p>The input string, it is passed through "UTF8::clean()" before parsing.</p>
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
   *
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string or if $result is empty.</p>
   */
  public static function parse_str($str, &$result)
  {
    $cleaned = self::clean($str);

    $parsed = \mb_parse_str($cleaned, $result);

    return $parsed !== false && !empty($result);
  }
3269 1
3270
  /**
   * Checks if the "/u" pattern modifier is available, which enables Unicode (UTF-8) support in PCRE.
   *
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
   */
  public static function pcre_utf8_support()
  {
    // the warning about an unknown modifier is silenced on purpose, we only need the result
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $matched = @preg_match('//u', '');

    return (bool)$matched;
  }
3280
3281
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * Each boundary is resolved in this order: decimal digits -> integer code point,
   * hexadecimal digits -> "UTF8::hex_to_int()", anything else -> "UTF8::ord()".
   *
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
   *
   * @return array <p>The characters of the range, an empty array if a boundary is empty or resolves to 0.</p>
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    $codePoints = array();

    // start first, then end; bail out as soon as one boundary can't be resolved
    foreach (array($var1, $var2) as $boundary) {
      if (ctype_digit((string)$boundary)) {
        $codePoint = (int)$boundary;
      } elseif (ctype_xdigit($boundary)) {
        $codePoint = (int)self::hex_to_int($boundary);
      } else {
        $codePoint = self::ord($boundary);
      }

      if (!$codePoint) {
        return array();
      }

      $codePoints[] = $codePoint;
    }

    $toChar = array(
        '\\voku\\helper\\UTF8',
        'chr',
    );

    return array_map($toChar, range($codePoints[0], $codePoints[1]));
  }
3327
3328 1
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of self::$bom (BOM string => length in bytes) that the string starts with
   * is cut off, in the order of that table.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>String without UTF-BOM</p>
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $bomString => $bomByteLength) {
      if (0 === strpos($str, $bomString)) {
        // older PHP versions return false (not '') from substr() if nothing is left after the BOM,
        // the cast keeps the documented string return type for a string that consists of a BOM only
        $str = (string)substr($str, $bomByteLength);
      }
    }

    return $str;
  }
3345
3346
  /**
   * Alias of "UTF8::remove_bom()" (camelCase spelling).
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever "UTF8::remove_bom()" returns for the given string.</p>
   */
  public static function removeBOM($str)
  {
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
3359 43
3360 43
  /**
   * Removes duplicate occurrences of a string in another string,
   * e.g. collapses runs of "  " into a single " ".
   *
   * @param string          $str  <p>The base string.</p>
   * @param string|string[] $what <p>String (or list of strings) to search for in the base string.</p>
   *
   * @return string <p>
   *                The result string with removed duplicates,<br />
   *                the input is returned unchanged if $what is neither a string nor an array.
   *                </p>
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    if (is_string($what)) {
      $what = array($what);
    }

    if (is_array($what)) {
      foreach ($what as $item) {
        // The needle is also used as replacement: "\" and "$" must be escaped, otherwise
        // "preg_replace()" reads e.g. "$0" or "\1" inside the needle as a back-reference.
        $replacement = addcslashes((string)$item, '\\$');

        $str = preg_replace('/(' . preg_quote($item, '/') . ')+/', $replacement, $str);
      }
    }

    return $str;
  }
3382
3383 43
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * @param string $str         <p>The input string.</p>
   * @param bool   $url_encoded [optional] <p>Also remove the url encoded form (e.g. "%00") of these characters.</p>
   * @param string $replacement [optional] <p>The replacement for every removed sequence.</p>
   *
   * @return string <p>The string without the invisible characters.</p>
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // every control character except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09)
    if ($url_encoded) {
      // percent-encoded hex digits may be written in upper or lower case -> "i" modifier
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
      $non_displayables[] = '/%7f/i'; // url encoded 127
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // repeat until nothing is left to replace, a removal can reveal a new sequence (e.g. "%%0000")
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3416
3417 18
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>The replacement for every diamond question mark.</p>
   *
   * @return string <p>The string after the replacement.</p>
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // the same character twice: once as escaped byte sequence and once as literal
    $needles = array(
        "\xEF\xBF\xBD",
        '�',
    );

    $replacements = array(
        $unknown,
        $unknown,
    );

    return str_replace($needles, $replacements, $str);
  }
3439
3440
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * Without $chars every trailing character of the Unicode categories "separator" (\pZ)
   * and "other" (\pC, e.g. control characters) is stripped.
   *
   * @param string $str   <p>The string to be trimmed.</p>
   * @param string $chars <p>Optional characters to be stripped.</p>
   *
   * @return string <p>The string with unwanted characters stripped from the right.</p>
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    //
    // "0" is falsy in PHP but it is a valid character to strip, so it must not select the default
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    $chars = self::rxClass($chars);

    // "D" modifier: "$" must only match at the very end, without it the characters in front of
    // a trailing newline would be stripped (e.g. "abc\n" with the chars "c")
    return preg_replace("/{$chars}+$/uD", '', $str);
  }
3465 17
3466
  /**
   * Build a regex fragment that matches any single character of the given string.
   *
   * Characters are collected into one bracket expression, a "-" is always moved to the front of it.
   * Chunks from "UTF8::str_split()" that are reported as longer than one character are appended
   * as alternatives instead. The result is cached per input.
   *
   * @param string $s     <p>The characters to match.</p>
   * @param string $class [optional] <p>Initial content of the bracket expression.</p>
   *
   * @return string <p>A bracket expression like "[abc]" or a group like "(?:[abc]|xy)".</p>
   */
  private static function rxClass($s, $class = '')
  {
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // $parts[0] => members of the bracket expression, every further entry => one alternative
    $parts = array($class);

    foreach (self::str_split($s) as $char) {
      if ('-' === $char) {
        $parts[0] = '-' . $parts[0];
      } elseif (!isset($char[2])) {
        // one or two bytes long: quote it, it could be a regex meta character
        $parts[0] .= preg_quote($char, '/');
      } elseif (1 === self::strlen($char)) {
        $parts[0] .= $char;
      } else {
        $parts[] = $char;
      }
    }

    if ($parts[0]) {
      $parts[0] = '[' . $parts[0] . ']';
    }

    $pattern = (1 === count($parts)) ? $parts[0] : '(?:' . implode('|', $parts) . ')';

    $rxClassCache[$cacheKey] = $pattern;

    return $pattern;
  }
3513
3514 35
  /**
   * WARNING: Echoes every value of self::$support (native UTF8-Support libs), e.g. for debugging.
   *
   * Each value is followed by "\n<br>", the keys are not printed.
   */
  public static function showSupport()
  {
    foreach (self::$support as $supportValue) {
      echo $supportValue, "\n<br>";
    }
  }
3523
3524 35
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars [optional] <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, an empty string for empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // "0" is falsy in PHP but it is a real character (same special case as in "UTF8::ord()"),
    // without the second check the digit zero would silently be dropped
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3548
3549
  /**
   * Convert a string to an array of Unicode characters.
   *
   * With PCRE UTF-8 support the string is split via regex, otherwise the bytes are decoded manually.
   * The manual fallback skips bytes that do not form a well-formed 1-4 byte sequence
   * and it ignores $cleanUtf8.
   *
   * @param string  $str       <p>The string to split into array.</p>
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string[] <p>An array containing chunks of the string.</p>
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $input = (string)$str;

    if (!isset($input[0])) {
      return array();
    }

    // init
    $chars = array();

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $input = self::clean($input);
      }

      preg_match_all('/./us', $input, $matches);
      if (isset($matches[0])) {
        $chars = $matches[0];
      }

    } else {

      // fallback: decode byte by byte

      $byteCount = strlen($input);

      /** @noinspection ForeachInvariantsInspection */
      for ($pos = 0; $pos < $byteCount; $pos++) {
        $lead = $input[$pos];

        // single byte (ASCII)
        if (($lead & "\x80") === "\x00") {
          $chars[] = $lead;
          continue;
        }

        // the lead byte tells how many bytes the sequence should have
        if (($lead & "\xE0") === "\xC0") {
          $width = 2;
        } elseif (($lead & "\xF0") === "\xE0") {
          $width = 3;
        } elseif (($lead & "\xF8") === "\xF0") {
          $width = 4;
        } else {
          continue;
        }

        // truncated sequence at the end of the string
        if (!isset($input[$pos + $width - 1])) {
          continue;
        }

        // every following byte must be a continuation byte (10xxxxxx)
        $wellFormed = true;
        for ($offset = 1; $offset < $width; $offset++) {
          if (($input[$pos + $offset] & "\xC0") !== "\x80") {
            $wellFormed = false;
            break;
          }
        }

        if ($wellFormed) {
          $chars[] = substr($input, $pos, $width);

          // skip the continuation bytes, the loop itself moves one step further
          $pos += $width - 1;
        }
      }
    }

    if ($length > 1) {
      // glue $length characters together per element
      $chars = array_map('implode', array_chunk($chars, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($chars[0]) && $chars[0] === '') {
      return array();
    }

    return $chars;
  }
3631
3632
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * The checks run in this order and the first hit wins:
   * UTF-16 / UTF-32 (binary strings only), ASCII, UTF-8, "\mb_detect_encoding()", "iconv()".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return false|string <p>
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   *                      </p>
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      // run each detection only once and reuse the result for both byte orders
      $utf16 = self::is_utf16($str);
      if ($utf16 === 1) {
        return 'UTF-16LE';
      }
      if ($utf16 === 2) {
        return 'UTF-16BE';
      }

      $utf32 = self::is_utf32($str);
      if ($utf32 === 1) {
        return 'UTF-32LE';
      }
      if ($utf32 === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $encoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($encoding) {
      return $encoding;
    }

    //
    // 5.) check via "iconv()": an encoding fits if a round trip leaves the string untouched
    //

    $md5 = md5($str);
    foreach (self::$iconvEncoding as $encodingTmp) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      if (md5(@iconv($encodingTmp, $encodingTmp, $str)) === $md5) {
        return $encodingTmp;
      }
    }

    return false;
  }
3724
3725
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * The search strings are quoted and applied as regex with the modifiers "u" and "i",
   * the replacements are inserted literally.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       The value (or list of values) being searched for.
   *                       Every replacement with search array is
   *                       performed on the result of previous replacement.
   *                       An empty search string never matches.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement value (or list of values).
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed <p>A string or an array of replacements.</p>
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    $patterns = array();
    foreach ((array)$search as $key => $needle) {
      $needle = (string)$needle;

      if ('' === $needle) {
        // a pattern that can never match: start of the string with a character in front of it
        $patterns[$key] = '/^(?<=.)$/';
      } else {
        $patterns[$key] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // "str_ireplace()" inserts the replacement literally, so "\" and "$" must be escaped,
    // otherwise "preg_replace()" reads e.g. "$0" or "\1" as a back-reference
    if (is_array($replace)) {
      $replacements = array();
      foreach ($replace as $key => $value) {
        $replacements[$key] = addcslashes((string)$value, '\\$');
      }
    } else {
      $replacements = addcslashes((string)$replace, '\\$');
    }

    $subject = preg_replace($patterns, $replacements, $subject, -1, $replaced);
    $count = $replaced; // used as reference parameter

    return $subject;
  }
3768
3769
  /**
   * Limit the number of characters in a string, the cut is moved back to the last space
   * in front of the limit so that no word is split (if there is such a space).
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $length   [optional] <p>Max number of characters.</p>
   * @param string $strAddOn [optional] <p>Appended to the string, if it was shortened.</p>
   *
   * @return string <p>The unchanged string if it fits into the limit, otherwise the shortened string.</p>
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    // short enough, nothing to do
    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit falls right behind a word
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    $truncated = self::substr($str, 0, $length);

    // drop the last (possibly cut off) word
    $words = explode(' ', $truncated);
    $withoutLastWord = implode(' ', array_slice($words, 0, -1));

    if ($withoutLastWord !== '') {
      return $withoutLastWord . $strAddOn;
    }

    // no space at all: cut hard
    return self::substr($truncated, 0, $length - 1) . $strAddOn;
  }
3809
3810 2
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * The lengths are counted via "UTF8::strlen()" and not in bytes.
   *
   * @param string $str        <p>The input string.</p>
   * @param int    $pad_length <p>The length of return string.</p>
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
   * @param int    $pad_type   [optional] <p>
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
   *                           </p>
   *
   * @return string <p>
   *                The padded string,<br />
   *                the unchanged input if $pad_length is not a positive integer, if it is smaller
   *                than the length of the input or if $pad_string is empty.
   *                </p>
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (
        is_int($pad_length) !== true
        ||
        $pad_length <= 0
        ||
        $pad_length < $str_length
    ) {
      return $str;
    }

    $ps_length = self::strlen($pad_string);

    // nothing to pad with, this also prevents a division by zero below
    if (!$ps_length) {
      return $str;
    }

    $diff = $pad_length - $str_length;

    switch ($pad_type) {
      case STR_PAD_LEFT:
        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $pre = self::substr($pre, 0, $diff);
        $post = '';
        break;

      case STR_PAD_BOTH:
        // an odd difference puts the extra character on the right side;
        // floor() / ceil() keep both lengths real integers ("(int)$diff / 2" is a float for odd values)
        $repeats = (int)ceil($diff / $ps_length / 2);
        $pre = str_repeat($pad_string, $repeats);
        $pre = self::substr($pre, 0, (int)floor($diff / 2));
        $post = str_repeat($pad_string, $repeats);
        $post = self::substr($post, 0, (int)ceil($diff / 2));
        break;

      case STR_PAD_RIGHT:
      default:
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $post = self::substr($post, 0, $diff);
        $pre = '';
    }

    return $pre . $str . $post;
  }
3864
3865
  /**
3866
   * Repeat a string.
3867 12
   *
3868
   * @param string $str        <p>
3869 12
   *                           The string to be repeated.
3870
   *                           </p>
3871
   * @param int    $multiplier <p>
3872
   *                           Number of times the input string should be
3873
   *                           repeated.
3874
   *                           </p>
3875
   *                           <p>
3876
   *                           multiplier has to be greater than or equal to 0.
3877
   *                           If the multiplier is set to 0, the function
3878
   *                           will return an empty string.
3879 1
   *                           </p>
3880
   *
3881 1
   * @return string <p>The repeated string.</p>
3882
   */
3883 1
  public static function str_repeat($str, $multiplier)
  {
    // The input goes through self::filter() first; the repetition itself
    // is delegated to the native str_repeat().
    $filtered = self::filter($str);
    $repeated = str_repeat($filtered, $multiplier);

    return $repeated;
  }
3889
3890
  /**
3891
   * INFO: This is only a wrapper for "str_replace()"  -> the original function is already UTF-8 safe.
3892
   *
3893
   * Replace all occurrences of the search string with the replacement string
3894
   *
3895
   * @link http://php.net/manual/en/function.str-replace.php
3896
   *
3897 1
   * @param mixed $search  <p>
3898
   *                       The value being searched for, otherwise known as the needle.
3899 1
   *                       An array may be used to designate multiple needles.
3900
   *                       </p>
3901 1
   * @param mixed $replace <p>
3902 1
   *                       The replacement value that replaces found search
3903 1
   *                       values. An array may be used to designate multiple replacements.
3904
   *                       </p>
3905 1
   * @param mixed $subject <p>
3906 1
   *                       The string or array being searched and replaced on,
3907 1
   *                       otherwise known as the haystack.
3908 1
   *                       </p>
3909
   *                       <p>
3910
   *                       If subject is an array, then the search and
3911 1
   *                       replace is performed with every entry of
3912
   *                       subject, and the return value is an array as
3913
   *                       well.
3914
   *                       </p>
3915
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
3916
   *
3917
   * @return mixed <p>This function returns a string or an array with the replaced values.</p>
3918
   */
3919
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    // Plain delegation to the native function; $count is filled in by
    // reference with the number of replacements performed.
    $replaced = \str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
3923
3924
  /**
3925 20
   * Shuffles all the characters in the string.
3926
   *
3927 20
   * @param string $str <p>The input string</p>
3928
   *
3929
   * @return string <p>The shuffled string.</p>
3930
   */
3931 20
  public static function str_shuffle($str)
  {
    // Split into an array of pieces via self::split(), randomise the order
    // in place and join the pieces again.
    $pieces = self::split($str);
    shuffle($pieces);

    return implode('', $pieces);
  }
3939 1
3940
  /**
3941
   * Sort all characters according to code points.
3942 1
   *
3943 1
   * @param string $str    <p>A UTF-8 string.</p>
3944 1
   * @param bool   $unique <p>Sort unique. If <strong>true</strong>, repeated characters are ignored.</p>
3945 1
   * @param bool   $desc   <p>If <strong>true</strong>, will sort characters in reverse code point order.</p>
3946 1
   *
3947
   * @return string <p>String of sorted characters.</p>
3948 1
   */
3949
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    // Flipping twice leaves exactly one entry per distinct value.
    if ($unique) {
      $codePoints = array_flip(array_flip($codePoints));
    }

    // Order the values (keys are preserved by asort()/arsort()).
    if (!$desc) {
      asort($codePoints);
    } else {
      arsort($codePoints);
    }

    return self::string($codePoints);
  }
3965
3966 1
  /**
3967
   * Split a string into an array.
3968
   *
3969
   * @param string $str
3970
   * @param int    $len
3971
   *
3972
   * @return array
3973
   */
3974
  public static function str_split($str, $len = 1)
  {
    $chunkSize = (int)$len;

    // Chunk sizes below 1 are handed to the native function unchanged.
    if ($chunkSize < 1) {
      return str_split($str, $chunkSize);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $clusters = $matches[0];

    if ($chunkSize === 1) {
      return $clusters;
    }

    // Glue every $chunkSize consecutive matches into one output element.
    $chunks = array();
    $current = -1;
    foreach ($clusters as $index => $cluster) {
      if ($index % $chunkSize === 0) {
        $chunks[++$current] = $cluster;
      } else {
        $chunks[$current] .= $cluster;
      }
    }

    return $chunks;
  }
4004
4005 1
  /**
   * Get a binary representation of a specific string.
   *
   * Every byte of the input is rendered as eight bits (most significant bit first) and the
   * leading zero bits of the complete result are dropped, e.g. "a" (0x61) becomes "1100001".
   *
   * The conversion is done byte by byte instead of via base_convert(), because
   * base_convert() loses precision for large numbers and therefore returned wrong
   * bits for anything longer than a few bytes.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The bits of the string without leading zeros; "0" for an empty or all-zero input.</p>
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '0';
    }

    // 'C*' yields one unsigned byte value per input byte
    $bytes = unpack('C*', $str);
    if ($bytes === false) {
      return '0';
    }

    $bits = '';
    foreach ($bytes as $byte) {
      $bits .= sprintf('%08b', $byte);
    }

    // keep the output format of the former base_convert() based implementation
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
4020
4021 1
  /**
4022
   * alias for "UTF8::to_ascii()"
4023 1
   *
4024
   * @see UTF8::to_ascii()
4025
   *
4026
   * @param string $str
4027 1
   * @param string $unknown
4028
   * @param bool   $strict
4029
   *
4030
   * @return string
4031
   */
4032
  public static function str_transliterate($str, $unknown = '?', $strict = false)
  {
    // Pure alias: all arguments are forwarded to to_ascii() unchanged.
    $ascii = self::to_ascii($str, $unknown, $strict);

    return $ascii;
  }
4036
4037
  /**
4038
   * Counts number of words in the UTF-8 string.
4039
   *
4040
   * @param string $str      <p>The input string.</p>
4041
   * @param int    $format   [optional] <p>
4042
   *                         <strong>0</strong> => return a number of words (default)<br />
4043
   *                         <strong>1</strong> => return an array of words<br />
4044 9
   *                         <strong>2</strong> => return an array of words with word-offset as key
4045
   *                         </p>
4046 9
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word
4047
   *                         (default: "'", "’")</p>
4048
   *
4049
   * @return array|int <p>The number of words in the string</p>
4050
   */
4051
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $wordClass = self::rxClass($charlist, '\pL');
    $pieces = \preg_split("/({$wordClass}+(?:[\p{Pd}’']{$wordClass}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    $pieceCount = count($pieces);

    // Because of PREG_SPLIT_DELIM_CAPTURE the captured words sit on the odd
    // indexes and the text between them on the even ones.

    // format 1: plain list of words
    if ($format === 1) {
      $words = array();
      for ($i = 1; $i < $pieceCount; $i += 2) {
        $words[] = $pieces[$i];
      }

      return $words;
    }

    // format 2: words keyed by their character offset
    if ($format === 2) {
      $words = array();
      $position = self::strlen($pieces[0]);
      for ($i = 1; $i < $pieceCount; $i += 2) {
        $words[$position] = $pieces[$i];
        $position += self::strlen($pieces[$i]) + self::strlen($pieces[$i + 1]);
      }

      return $words;
    }

    // any other format: number of words
    return ($pieceCount - 1) / 2;
  }
4082 1
4083
  /**
4084
   * Case-insensitive string comparison.
4085 7
   *
4086 2
   * INFO: Case-insensitive version of UTF8::strcmp()
4087 2
   *
4088 5
   * @param string $str1
4089
   * @param string $str2
4090
   *
4091 7
   * @return int <p>
4092
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
4093 7
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
4094
   *             <strong>0</strong> if they are equal.
4095 1
   *             </p>
4096
   */
4097
  public static function strcasecmp($str1, $str2)
  {
    // Case-fold both operands, then compare them with the case-sensitive strcmp().
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4101
4102
  /**
4103
   * Case-sensitive string comparison.
4104
   *
4105
   * @param string $str1
4106
   * @param string $str2
4107
   *
4108 2
   * @return int  <p>
4109
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
4110 2
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
4111 2
   *              <strong>0</strong> if they are equal.
4112
   *              </p>
4113 2
   */
4114 2
  public static function strcmp($str1, $str2)
  {
    // Operands with identical string representations need no normalization.
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    // Otherwise compare the NFD-normalized forms byte-wise.
    $decomposed1 = \Normalizer::normalize($str1, \Normalizer::NFD);
    $decomposed2 = \Normalizer::normalize($str2, \Normalizer::NFD);

    return strcmp($decomposed1, $decomposed2);
  }
4121
4122
  /**
4123
   * Find length of initial segment not matching mask.
4124
   *
4125
   * @param string $str
4126
   * @param string $charList
4127
   * @param int    $offset
4128
   * @param int    $length
4129
   *
4130
   * @return int|null
4131
   */
4132
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    // An empty mask yields null.
    $charList .= '';
    if ($charList === '') {
      return null;
    }

    // Only cut the subject when an offset or a non-default length was given.
    if (!$offset && 2147483647 === $length) {
      $subject = (string)$str;
    } else {
      $subject = (string)self::substr($str, $offset, $length);
    }

    // Lazily capture everything in front of the first character of the mask.
    $pattern = '/^(.*?)' . self::rxClass($charList) . '/us';
    if (preg_match($pattern, $subject, $match)) {
      return self::strlen($match[1]);
    }

    // No character of the mask occurs: the whole subject counts.
    return self::strlen($subject);
  }
4151
4152
  /**
4153
   * Create a UTF-8 string from code points.
4154
   *
4155
   * INFO: opposite to UTF8::codepoints()
4156
   *
4157
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
4158
   *
4159
   * @return string <p>UTF-8 encoded string.</p>
4160
   */
4161
  public static function string(array $array)
  {
    // Convert each entry with UTF8::chr() and concatenate the results in array order.
    $result = '';
    foreach ($array as $codePoint) {
      $result .= self::chr($codePoint);
    }

    return $result;
  }
4173
4174
  /**
4175 2
   * alias for "UTF8::string_has_bom()"
4176
   *
4177 2
   * @see UTF8::string_has_bom()
4178
   *
4179
   * @param string $str
4180
   *
4181
   * @return bool
4182
   */
4183
  public static function hasBom($str)
  {
    // Pure alias of string_has_bom().
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
4187
4188
  /**
4189
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
4190
   *
4191
   * @param string $str <p>The input string.</p>
4192
   *
4193
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
4194
   */
4195
  public static function string_has_bom($str)
  {
    // Only the keys of self::$bom are needed here: each one is tested as a
    // prefix of the input.
    foreach (array_keys(self::$bom) as $bomString) {
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4205
4206 8
  /**
4207 8
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
4208
   *
4209 8
   * @link http://php.net/manual/en/function.strip-tags.php
4210 3
   *
4211
   * @param string $str            <p>
4212
   *                               The input string.
4213 7
   *                               </p>
4214 1
   * @param string $allowable_tags [optional] <p>
4215 1
   *                               You can use the optional second parameter to specify tags which should
4216 1
   *                               not be stripped.
4217
   *                               </p>
4218
   *                               <p>
4219
   *                               HTML comments and PHP tags are also stripped. This is hardcoded and
4220 7
   *                               can not be changed with allowable_tags.
4221 1
   *                               </p>
4222 7
   *
4223 7
   * @return string <p>The stripped string.</p>
4224 7
   */
4225
  public static function strip_tags($str, $allowable_tags = null)
  {
    // Run the input through self::clean() first, then let the native
    // strip_tags() remove the markup.
    $cleaned = self::clean($str);

    return strip_tags($cleaned, $allowable_tags);
  }
4232
4233
  /**
4234
   * Finds position of first occurrence of a string within another, case insensitive.
4235
   *
4236
   * @link http://php.net/manual/en/function.mb-stripos.php
4237
   *
4238
   * @param string  $haystack  <p>
4239
   *                           The string from which to get the position of the first occurrence
4240
   *                           of needle
4241
   *                           </p>
4242
   * @param string  $needle    <p>
4243
   *                           The string to find in haystack
4244 8
   *                           </p>
4245
   * @param int     $offset    [optional] <p>
4246 8
   *                           The position in haystack
4247 2
   *                           to start searching
4248
   *                           </p>
4249
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4250 6
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4251
   *
4252
   * @return int|false <p>
4253
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
4254 6
   *                   or false if needle is not found.
4255
   *                   </p>
4256
   */
4257
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    // nothing can be found in (or with) an empty string
    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // The default offset is null, but mb_stripos() expects an integer: passing null
    // is deprecated as of PHP 8.1. Normalize it the same way UTF8::strpos() does.
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_stripos($haystack, $needle, $offset, $encoding);
  }
4283 61
4284 1
  /**
4285
   * Returns all of haystack starting from and including the first occurrence of needle to the end.
4286
   *
4287
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
4288 61
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
4289 61
   * @param bool   $before_needle [optional] <p>
4290
   *                              If <b>TRUE</b>, grapheme_strstr() returns the part of the
4291
   *                              haystack before the first occurrence of the needle (excluding the needle).
4292
   *                              </p>
4293 61
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
4294 2
   *
4295 2
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is not found.
4296
   */
4297 61
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    // An empty needle can never be found.
    $needle .= '';
    if ($needle === '') {
      return false;
    }

    // Only non-default encodings are passed through normalize_encoding().
    $charset = ($encoding !== 'UTF-8') ? self::normalize_encoding($encoding) : $encoding;

    return \mb_stristr($haystack, $needle, $before_needle, $charset);
  }
4309
4310
  /**
4311
   * Get the string length, not the byte-length!
4312 1
   *
4313
   * @link     http://php.net/manual/en/function.mb-strlen.php
4314 1
   *
4315
   * @param string  $str       <p>The string being checked for length.</p>
4316
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4317
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4318
   *
4319
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
4320
   *             character counted as +1)</p>
4321
   */
4322
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // empty input: nothing to count
    if (!isset($str[0])) {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $useUtf8 = $encoding === 'UTF-8' || $encoding === true || $encoding === false;
    $encoding = $useUtf8 ? 'UTF-8' : self::normalize_encoding($encoding);

    // for these two encodings the byte length is returned directly
    if (in_array($encoding, array('ASCII', 'CP850'))) {
      return strlen($str);
    }

    if ($cleanUtf8 === true && $encoding === 'UTF-8') {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4352
4353
  /**
4354
   * Case insensitive string comparisons using a "natural order" algorithm.
4355
   *
4356
   * INFO: natural order version of UTF8::strcasecmp()
4357
   *
4358
   * @param string $str1 <p>The first string.</p>
4359
   * @param string $str2 <p>The second string.</p>
4360
   *
4361
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
4362
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
4363
   *             <strong>0</strong> if they are equal
4364
   */
4365
  public static function strnatcasecmp($str1, $str2)
  {
    // Case-fold both operands, then compare them with strnatcmp().
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4369 2
4370 2
  /**
4371
   * String comparisons using a "natural order" algorithm
4372 2
   *
4373
   * INFO: natural order version of UTF8::strcmp()
4374
   *
4375
   * @link  http://php.net/manual/en/function.strnatcmp.php
4376
   *
4377
   * @param string $str1 <p>The first string.</p>
4378
   * @param string $str2 <p>The second string.</p>
4379
   *
4380
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
4381
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
4382
   *             <strong>0</strong> if they are equal
4383
   */
4384
  public static function strnatcmp($str1, $str2)
  {
    // Operands with identical string representations are equal by definition.
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    // Otherwise run the native natural-order comparison on the strtonatfold() results.
    $natural1 = self::strtonatfold($str1);
    $natural2 = self::strtonatfold($str2);

    return strnatcmp($natural1, $natural2);
  }
4388 1
4389
  /**
4390 1
   * Case-insensitive string comparison of the first n characters.
4391 1
   *
4392
   * @link  http://php.net/manual/en/function.strncasecmp.php
4393
   *
4394 1
   * @param string $str1 <p>The first string.</p>
4395 1
   * @param string $str2 <p>The second string.</p>
4396
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
4397
   *
4398
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
4399
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
4400
   *             <strong>0</strong> if they are equal
4401
   */
4402
  public static function strncasecmp($str1, $str2, $len)
  {
    // Case-fold both operands, then compare their first $len characters.
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4406
4407
  /**
4408
   * String comparison of the first n characters.
4409
   *
4410
   * @link  http://php.net/manual/en/function.strncmp.php
4411
   *
4412
   * @param string $str1 <p>The first string.</p>
4413
   * @param string $str2 <p>The second string.</p>
4414
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
4415
   *
4416
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
4417 15
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
4418
   *             <strong>0</strong> if they are equal
4419 15
   */
4420 15
  public static function strncmp($str1, $str2, $len)
  {
    // Cut both operands down to their first $len characters and compare the results.
    return self::strcmp(
        self::substr($str1, 0, $len),
        self::substr($str2, 0, $len)
    );
  }
4427 14
4428
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>The string where char_list is looked for.</p>
   * @param string $char_list <p>The characters to look for. This parameter is case sensitive.</p>
   *
   * @return string|false <p>
   *                      The rest of the haystack starting from the first character found,<br />
   *                      or <strong>false</strong> if none is found or one of the arguments is empty.
   *                      </p>
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    // empty haystack or empty list: nothing to find
    if (!isset($haystack[0], $char_list[0])) {
      return false;
    }

    if (!preg_match('/' . self::rxClass($char_list) . '/us', $haystack, $m)) {
      return false;
    }

    // Byte-based functions on purpose: the byte position of the match is
    // used to cut the haystack byte-wise.
    $bytePosition = strpos($haystack, $m[0]);

    return substr($haystack, $bytePosition);
  }
4453 14
4454
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string     $haystack  <p>The string being checked.</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found it returns false.
   *                   </p>
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // iconv and mbstring do not support integer $needle, so a non-negative integer is
    // converted via UTF8::chr(). This has to happen before the string cast: once
    // $needle is a string, the integer check can never succeed.
    if (is_int($needle) && $needle >= 0) {
      $needle = (string)self::chr($needle);
    } else {
      $needle = (string)$needle;
    }

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    if (self::$support['iconv'] === true) {
      // ignore invalid negative offset to keep compatibility
      // with php < 5.5.35, < 5.6.21, < 7.0.6
      return \iconv_strpos($haystack, $needle, $offset > 0 ? $offset : 0, $encoding);
    }

    // Fallback without mbstring / iconv: search byte-wise in the part behind
    // the offset and translate the byte position back into a character position.
    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    }

    $pos = strpos($haystack, $needle);
    if ($pos === false) {
      return false;
    }

    $left = substr($haystack, 0, $pos);

    // negative offset not supported in PHP strpos(), ignoring
    return ($offset > 0 ? $offset : 0) + self::strlen($left);
  }
4532
4533
  /**
4534
   * Finds the last occurrence of a character in a string within another.
4535
   *
4536
   * @link http://php.net/manual/en/function.mb-strrchr.php
4537
   *
4538
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
4539
   * @param string $needle   <p>The string to find in haystack</p>
4540
   * @param bool   $part     [optional] <p>
4541
   *                         Determines which portion of haystack
4542
   *                         this function returns.
4543
   *                         If set to true, it returns all of haystack
4544
   *                         from the beginning to the last occurrence of needle.
4545
   *                         If set to false, it returns all of haystack
4546
   *                         from the last occurrence of needle to the end,
4547
   *                         </p>
4548
   * @param string $encoding [optional] <p>
4549
   *                         Character encoding name to use.
4550
   *                         If it is omitted, internal character encoding is used.
4551
   *                         </p>
4552
   *
4553
   * @return string|false The portion of haystack or false if needle is not found.
4554
   */
4555 4 View Code Duplication
  public static function strrchr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    // Only non-default encodings are passed through normalize_encoding().
    $charset = $encoding;
    if ($charset !== 'UTF-8') {
      $charset = self::normalize_encoding($charset);
    }

    return \mb_strrchr($haystack, $needle, $part, $charset);
  }
4563 3
4564
  /**
4565
   * alias for "UTF8::strstr()"
4566
   *
4567
   * @see UTF8::strstr()
4568
   *
4569
   * @param string $haystack
4570
   * @param string $needle
4571
   * @param bool   $before_needle
4572
   *
4573
   * @return string|false
4574
   */
4575
  public static function strchr($haystack, $needle, $before_needle = false)
  {
    // Pure alias of strstr().
    $result = self::strstr($haystack, $needle, $before_needle);

    return $result;
  }
4579
4580
  /**
4581
   * alias for "UTF8::stristr()"
4582
   *
4583
   * @see UTF8::stristr()
4584
   *
4585
   * @param string $haystack
4586
   * @param string $needle
4587
   * @param bool   $before_needle
4588 1
   *
4589
   * @return string|false
4590 1
   */
4591
  public static function strichr($haystack, $needle, $before_needle = false)
  {
    // Pure alias of stristr().
    $result = self::stristr($haystack, $needle, $before_needle);

    return $result;
  }
4595
4596
  /**
4597
   * Reverses characters order in the string.
4598
   *
4599
   * @param string $str The input string
4600
   *
4601
   * @return string The string with characters in the reverse sequence
4602
   */
4603
  public static function strrev($str)
  {
    $str = (string)$str;

    // nothing to reverse
    if ($str === '') {
      return '';
    }

    // Split via self::split(), flip the order of the pieces and join them again.
    $pieces = self::split($str);

    return implode('', array_reverse($pieces));
  }
4613
4614
  /**
   * Case-insensitive search for the last occurrence of a string within another string.
   *
   * Thin wrapper around "\mb_strrichr()" that normalizes the encoding name first.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string $haystack <p>The string to search in.</p>
   * @param string $needle   <p>The string to search for.</p>
   * @param bool   $part     [optional] <p>
   *                         <b>true</b>: return the part of haystack before the last occurrence of needle.<br />
   *                         <b>false</b> (default): return the part of haystack from the last occurrence of needle
   *                         up to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name, default is "UTF-8". Every other value is passed through
   *                         "UTF8::normalize_encoding()" before it is used.
   *                         </p>
   *
   * @return string|false <p>The requested portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrichr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    // "UTF-8" is used as it is, every other encoding name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrichr($haystack, $needle, $part, $charset);
  }
4644
4645
  /**
   * Find the position of the last occurrence of a string, ignoring the case.
   *
   * Both operands are lower-cased via "UTF8::strtolower()" and then handed to the
   * case-sensitive "UTF8::strrpos()".
   *
   * @param string  $haystack  <p>The string to look in.</p>
   * @param string  $needle    <p>The string to look for.</p>
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   false if needle is not found.
   *                   </p>
   */
  public static function strripos($haystack, $needle, $offset = 0, $cleanUtf8 = false)
  {
    $haystackLower = self::strtolower($haystack);
    $needleLower = self::strtolower($needle);

    return self::strrpos($haystackLower, $needleLower, $offset, $cleanUtf8);
  }
4662 10
4663
  /**
   * Find the position of the last occurrence of a string in a string.
   *
   * The work is delegated to "\mb_strrpos()" or "\grapheme_strrpos()" depending on the detected
   * support flags; if neither flag is set, a byte-wise "strrpos()" is used and its result is
   * converted into a character position.
   *
   * @link http://php.net/manual/en/function.mb-strrpos.php
   *
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle.</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />A non-negative integer is treated as a
   *                              code point and converted via "UTF8::chr()".</p>
   * @param int        $offset    [optional] <p>May be specified to begin searching an arbitrary number of
   *                              characters into the string. Negative values will stop searching at an arbitrary
   *                              point prior to the end of the string. The value is cast to int, so the default
   *                              (null) behaves like 0.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   false if needle is not found or if haystack / needle is empty.</p>
   */
  public static function strrpos($haystack, $needle, $offset = null, $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // a non-negative integer needle is a code point, turn it into the matching character
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }
    $needle = (string)$needle;

    if ($haystack === '' || $needle === '') {
      return false;
    }

    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    // NOTE(review): the "iconv" flag gates a grapheme_* call here - presumably on purpose, confirm
    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback: cut the haystack according to the offset and search byte-wise

    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $bytePos = strrpos($haystack, $needle);
    if ($bytePos === false) {
      return false;
    }

    // everything in front of the match, counted in characters, is the character position;
    // a positive offset was cut off above and has to be added again
    $left = substr($haystack, 0, $bytePos);

    return max($offset, 0) + self::strlen($left);
  }
4734
4735
  /**
   * Get the length of the leading part of a string that consists only of characters from a mask.
   *
   * @param string $str    <p>The input string.</p>
   * @param string $mask   <p>The mask of chars.</p>
   * @param int    $offset [optional] <p>Start position, applied via "UTF8::substr()".</p>
   * @param int    $length [optional] <p>Maximum length to inspect, applied via "UTF8::substr()".</p>
   *
   * @return int <p>Length of the matching initial segment, 0 if the string does not start with a mask char.</p>
   */
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // only cut the string if the caller asked for a sub-range
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    // anchored character class built from the mask
    $pattern = '/^' . self::rxClass($mask) . '+/u';

    if (!preg_match($pattern, $str, $matches)) {
      return 0;
    }

    return self::strlen($matches[0]);
  }
4754 11
4755
  /**
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
   *
   * For every encoding other than "UTF-8", and for "UTF-8" when the "mbstring" support flag is set,
   * "\mb_strstr()" is used; otherwise the call is handed to "\grapheme_strstr()".
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, strstr() returns the part of the
   *                              haystack before the first occurrence of the needle (excluding the needle).
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
   */
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // INFO: use "mb_"-function (with polyfill) also if we need another encoding
    if ($encoding !== 'UTF-8') {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    // the support flags are read below, so make sure that the detection has been run
    // (same guard as in "UTF8::strrpos()" and "UTF8::substr()")
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4784
4785
  /**
   * Unicode transformation for case-less matching.
   *
   * Steps: replace the entries of "self::$commonCaseFold", optionally replace the entries of the
   * "caseFolding_full" data set, clean the string and finally lower-case it.
   *
   * @link http://unicode.org/reports/tr21/tr21-5.html
   *
   * @param string $str  <p>The input string.</p>
   * @param bool   $full <p>
   *                     <b>true</b> === replace full case folding chars + strtolower (default)<br />
   *                     <b>false</b> use only $commonCaseFold +  strtolower
   *                     </p>
   *
   * @return string
   */
  public static function strtocasefold($str, $full = true)
  {
    // lookup tables are built once and kept for all following calls
    static $foldFrom = null;
    static $foldTo = null;
    static $fullFoldTable = null;

    if ($foldFrom === null) {
      $foldFrom = array_keys(self::$commonCaseFold);
      $foldTo = array_values(self::$commonCaseFold);
    }

    $folded = str_replace($foldFrom, $foldTo, $str);

    if ($full) {
      if ($fullFoldTable === null) {
        $fullFoldTable = self::getData('caseFolding_full');
      }

      // index 0 is used as the search list, index 1 as the replacement list
      /** @noinspection OffsetOperationsInspection */
      $folded = str_replace($fullFoldTable[0], $fullFoldTable[1], $folded);
    }

    return self::strtolower(self::clean($folded));
  }
4825
4826 16
  /**
   * Convert a string to lowercase.
   *
   * Wrapper around "\mb_strtolower()" with an empty-string shortcut and encoding-name normalization.
   *
   * @link http://php.net/manual/en/function.mb-strtolower.php
   *
   * @param string $str      <p>The string being lowercased.</p>
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function, default is "UTF-8".</p>
   *
   * @return string <p>The string with all alphabetic characters converted to lowercase.</p>
   */
  public static function strtolower($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // nothing to convert
    if ($str === '') {
      return '';
    }

    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtolower($str, $charset);
  }
4851
4852
  /**
   * Generic case sensitive transformation for collation matching.
   *
   * The string is normalized to NFD and afterwards every run of characters from the
   * Unicode category "Mn" is removed.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string
   */
  private static function strtonatfold($str)
  {
    // step 1: canonical decomposition
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    // step 2: drop everything that matches \p{Mn}
    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4863 1
4864 1
  /**
   * Convert a string to uppercase.
   *
   * Wrapper around "\mb_strtoupper()" with an empty-string shortcut and encoding-name normalization.
   *
   * @link http://php.net/manual/en/function.mb-strtoupper.php
   *
   * @param string $str      <p>The string being uppercased.</p>
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function, default is "UTF-8".</p>
   *
   * @return string <p>The string with all alphabetic characters converted to uppercase.</p>
   */
  public static function strtoupper($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // nothing to convert
    if ($str === '') {
      return '';
    }

    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtoupper($str, $charset);
  }
4888 1
4889
  /**
   * Translate characters or replace sub-strings.
   *
   * Three-argument form: "$from" and "$to" are turned into lists (strings are split into characters via
   * "UTF8::str_split()", arrays are used element by element), the longer list is truncated to the size of
   * the shorter one and the resulting pairs are applied with the native "strtr()".
   *
   * Two-argument form ("$to" omitted): "$from" is handed to the native "strtr()" unchanged, so it has to be
   * an array of "search => replace" pairs.
   *
   * @link  http://php.net/manual/en/function.strtr.php
   *
   * @param string          $str  <p>The string being translated.</p>
   * @param string|string[] $from <p>The string replacing from.</p>
   * @param string|string[] $to   <p>The string being translated to. The default (INF) only marks
   *                              "not given".</p>
   *
   * @return string <p>
   *                This function returns a copy of str, translating all occurrences of each character in from to the
   *                corresponding character in to.
   *                </p>
   */
  public static function strtr($str, $from, $to = INF)
  {
    if (INF !== $to) {
      // only strings have to be split into characters, arrays already are lists of search / replace items
      $from = is_array($from) ? array_values($from) : self::str_split($from);
      $to = is_array($to) ? array_values($to) : self::str_split($to);

      // use only as many pairs as both lists can provide
      $pairCount = min(count($from), count($to));

      if ($pairCount === 0) {
        // array_combine() fails for empty input on PHP < 5.4, an empty map simply changes nothing
        $from = array();
      } else {
        $from = array_combine(
            array_slice($from, 0, $pairCount),
            array_slice($to, 0, $pairCount)
        );
      }
    }

    return strtr($str, $from);
  }
4922 45
4923
  /**
   * Return the width of a string, as reported by "\mb_strwidth()".
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int
   */
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    // iconv and mbstring are not tolerant to invalid encoding,
    // so the input can be cleaned on request before it is measured
    if ($cleanUtf8 === true) {
      $str = self::clean($str);
    }

    return \mb_strwidth($str, $charset);
  }
4947
4948
  /**
   * Get part of a string.
   *
   * Depending on the detected support flags the work is done by "\mb_substr()", by "\iconv_substr()" or,
   * if neither flag is set, by slicing the result of "UTF8::split()".
   *
   * @link http://php.net/manual/en/function.mb-substr.php
   *
   * @param string  $str       <p>The string being checked.</p>
   * @param int     $start     <p>The first position used in str.</p>
   * @param int     $length    [optional] <p>The maximum length of the returned string. If it is null, the
   *                           length of the (optionally cleaned) string is used.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false <p>The sub-string specified by the start and length parameters, an empty string for
   *                      empty input, or false if start is greater than the length of the string.</p>
   */
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    // the character count is only needed for the range check and as default for the length
    $charCount = ($start || $length === null) ? (int)self::strlen($str) : 0;

    if ($start && $start > $charCount) {
      return false;
    }

    $length = ($length === null) ? $charCount : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $isUtf8 = ($encoding === 'UTF-8' || $encoding === true || $encoding === false);
    $encoding = $isUtf8 ? 'UTF-8' : self::normalize_encoding($encoding);

    if (self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return \iconv_substr($str, $start, $length, $encoding);
    }

    // fallback: split into characters, take the requested slice and join it into a string again
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
5022
5023 1
  /**
   * Binary safe comparison of two strings from an offset, up to length characters.
   *
   * The requested part of "$main_str" is cut out via "UTF8::substr()", "$str" is shortened to the same
   * number of characters and both parts are compared with "UTF8::strcmp()" or "UTF8::strcasecmp()".
   *
   * @param string  $main_str           <p>The main string being compared.</p>
   * @param string  $str                <p>The secondary string being compared.</p>
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts counting from
   *                                    the end of the string.</p>
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the largest of
   *                                    the length of the str compared to the length of main_str less the offset.</p>
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
   *                                    insensitive.</p>
   *
   * @return int <p>The result of the underlying comparison method.</p>
   */
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
  {
    // "UTF8::substr()" answers with false if the offset lies behind the end of the string;
    // the cast turns that into an empty string, so that only strings are handed on
    $mainPart = (string)self::substr($main_str, $offset, $length);
    $strPart = (string)self::substr($str, 0, self::strlen($mainPart));

    if ($case_insensitivity === true) {
      return self::strcasecmp($mainPart, $strPart);
    }

    return self::strcmp($mainPart, $strPart);
  }
5044
5045
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string $haystack  <p>The string to search in.</p>
   * @param string $needle    <p>The substring to search for.</p>
   * @param int    $offset    [optional] <p>The offset where to start counting.</p>
   * @param int    $length    [optional] <p>
   *                          The maximum length after the specified offset to search for the
   *                          substring. If it is omitted (null), the search runs up to the end of the haystack.
   *                          </p>
   * @param string $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return int|false <p>
   *                   The number of occurrences, or false if haystack or needle is empty or if
   *                   offset + length is not greater than zero.
   *                   </p>
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8')
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      // keep "no length given" (null) apart from an explicit length: casting null to 0 would make
      // "UTF8::substr()" return an empty string, so an offset-only call would always count 0
      if ($length !== null) {
        $length = (int)$length;
      }

      if ((int)$length + $offset <= 0) {
        return false;
      }

      // "UTF8::substr()" may answer with false (offset behind the end), which counts as empty haystack
      $haystack = (string)self::substr($haystack, $offset, $length, $encoding);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5088 1
5089
  /**
   * Replace text within a portion of a string.
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * If "$str" is an array, the method calls itself for every element; scalar "$replacement", "$start" and
   * "$length" values are repeated for every element, array values are used element by element.
   *
   * Note: an omitted "$length" means "up to the end of the string" for a string input, but for an
   * array input it is filled with 0 (the replacement is inserted and nothing is removed).
   *
   * @param string|string[] $str         <p>The input string or an array of strings.</p>
   * @param string|string[] $replacement <p>The replacement string or an array of strings. For a string input
   *                                     only the element with index 0 of an array is used.</p>
   * @param int|int[]       $start       <p>Start position(s); non-integer array items are replaced by 0.</p>
   * @param int|int[]|null  $length      [optional] <p>Number of characters to replace.</p>
   *
   * @return string|string[]
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // nothing to do, and array_fill() does not accept a count of 0 on old PHP versions
      if ($num === 0) {
        return array();
      }

      // $replacement
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // string input: only the first replacement is relevant
    if (is_array($replacement)) {
      $replacement = count($replacement) > 0 ? $replacement[0] : '';
    }

    // split both strings into UTF-8 characters
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    if ($length === null) {
      // replace up to the end; the count of the split characters does not depend on the
      // internal mbstring encoding (unlike "mb_strlen()" without an encoding argument)
      $length = count($smatches[0]);
    }

    array_splice($smatches[0], $start, $length, $rmatches[0]);

    return implode('', $smatches[0]);
  }
5164
5165
  /**
   * Returns a case swapped version of the string.
   *
   * The string is cleaned via "UTF8::clean()"; afterwards every non-whitespace character that is equal
   * to its upper-case form is lower-cased, every other one is replaced by its upper-case form.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $encoding [optional] <p>Default is UTF-8</p>
   *
   * @return string <p>Each character's case swapped.</p>
   */
  public static function swapCase($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $cleaned = self::clean($str);

    return preg_replace_callback(
        '/[\S]/u',
        function ($match) use ($encoding) {
          $char = $match[0];
          $upper = UTF8::strtoupper($char, $encoding);

          // unchanged by strtoupper => lower it, otherwise take the upper-case form
          return $char === $upper ? UTF8::strtolower($char, $encoding) : $upper;
        },
        $cleaned
    );
  }
5203
5204
  /**
   * Alias of "UTF8::to_ascii()", all arguments are passed on unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         <p>Passed as first argument ("$str").</p>
   * @param string $subst_chr <p>Passed as second argument ("$unknown").</p>
   * @param bool   $strict    <p>Passed as third argument ("$strict").</p>
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?', $strict = false)
  {
    $ascii = self::to_ascii($s, $subst_chr, $strict);

    return $ascii;
  }
5219 3
5220
  /**
   * Alias of "UTF8::to_latin1()", the argument is passed on unchanged.
   *
   * @see UTF8::to_latin1()
   *
   * @param string $str
   *
   * @return string
   */
  public static function toLatin1($str)
  {
    $latin1 = self::to_latin1($str);

    return $latin1;
  }
5233 11
5234
  /**
   * Alias of "UTF8::to_utf8()", the argument is passed on unchanged.
   *
   * @see UTF8::to_utf8()
   *
   * @param string $str
   *
   * @return string
   */
  public static function toUTF8($str)
  {
    $utf8 = self::to_utf8($str);

    return $utf8;
  }
5247 1
5248
  /**
   * Convert a string into ASCII.
   *
   * The input is cleaned first; if it is pure ASCII afterwards it is returned as it is. Otherwise every
   * non-ASCII character is decoded to its code point and replaced via the lookup tables stored in
   * "data/x??.php" (one file per block of 256 code points). Characters without a replacement, stray
   * continuation bytes and invalid lead bytes are substituted by "$unknown".
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
   * @param bool   $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad
   *                        performance</p>
   *
   * @return string
   *
   * @throws \Exception <p>If "$strict" is true, but Intl is not supported or PHP is older than 5.4.</p>
   */
  public static function to_ascii($str, $unknown = '?', $strict = false)
  {
    // Cache of the replacement tables, keyed by bank (code point >> 8).
    // NOTE: do not rename - the required bank file is expected to populate "$UTF8_TO_ASCII[$bank]".
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str, false, true, true);

    // check if we only have ASCII
    if (self::is_ascii($str) === true) {
      return $str;
    }

    if ($strict === true) {
      if (!isset(self::$support['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
      }

      if (self::$support['intl'] == true && Bootup::is_php('5.4')) {
        $str = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

        // check again, if we only have ASCII, now ...
        if (self::is_ascii($str) === true) {
          return $str;
        }

      } else {
        throw new \Exception('Intl is not supported or you use PHP < 5.4!');
      }
    }

    // split the string into single characters
    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = $ar[0];
    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // The lead byte tells how many bytes the sequence has and carries the highest payload bits.
      // "$ord" is assigned in every branch, so a value from a previous character can never leak in.
      if ($ordC0 >= 192 && $ordC0 <= 223) {
        $length = 2;
        $ord = $ordC0 - 192;
      } elseif ($ordC0 >= 224 && $ordC0 <= 239) {
        $length = 3;
        $ord = $ordC0 - 224;
      } elseif ($ordC0 >= 240 && $ordC0 <= 247) {
        $length = 4;
        $ord = $ordC0 - 240;
      } elseif ($ordC0 >= 248 && $ordC0 <= 251) {
        $length = 5;
        $ord = $ordC0 - 248;
      } elseif ($ordC0 >= 252 && $ordC0 <= 253) {
        $length = 6;
        $ord = $ordC0 - 252;
      } else {
        // stray continuation byte (128 - 191) or invalid lead byte (254 - 255)
        $c = $unknown;
        continue;
      }

      // truncated sequence: do not read beyond the end of the character
      if (strlen($c) < $length) {
        $c = $unknown;
        continue;
      }

      // every continuation byte contributes its lower 6 bits
      for ($k = 1; $k < $length; $k++) {
        $ord = $ord * 64 + (ord($c[$k]) - 128);
      }

      // load the replacement table of this bank on first use
      $bank = $ord >> 8;
      if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        } else {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      // the lower 8 bits select the entry within the bank
      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    // break the reference, so that "$c" can not modify the last element by accident
    unset($c);

    return implode('', $chars);
  }
5376 2
5377
  /**
   * Alias of "UTF8::to_iso8859()": the argument is forwarded unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input string (or an array of strings).</p>
   *
   * @return string|string[]
   */
  public static function toIso8859($str)
  {
    $iso8859 = self::to_iso8859($str);

    return $iso8859;
  }
5390
5391 19
  /**
   * Alias of "UTF8::to_iso8859()": the argument is forwarded unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input string (or an array of strings).</p>
   *
   * @return string|string[]
   */
  public static function to_latin1($str)
  {
    $latin1 = self::to_iso8859($str);

    return $latin1;
  }
5404 16
5405 5
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It decodes unicode escape sequences ("\uXXXX") and decimal numeric entities with 2 - 4 digits ("&#NNN;").
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   * is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * @param string|string[] $str <p>Any string or array.</p>
   *
   * @return string|string[] <p>The UTF-8 encoded string (arrays are converted element by element).</p>
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];

      // 7-bit byte: it doesn't need conversion
      if ($c1 < "\x80") {
        $buf .= $c1;
        continue;
      }

      if ($c1 >= "\xc0") { // should be converted to UTF8, if it's not UTF8 already

        // number of continuation bytes a UTF-8 sequence with this lead byte must have
        if ($c1 <= "\xdf") {
          $tail = 1; // looks like 2 bytes UTF8
        } elseif ($c1 <= "\xef") {
          $tail = 2; // looks like 3 bytes UTF8
        } elseif ($c1 <= "\xf7") {
          $tail = 3; // looks like 4 bytes UTF8
        } else {
          $tail = 0; // doesn't look like UTF8, but should be converted
        }

        // collect the sequence as long as every following byte is a continuation byte (0x80 - 0xbf),
        // bytes beyond the end of the string count as "\x00"
        $sequence = $c1;
        $isUtf8 = $tail > 0;
        for ($j = 1; $j <= $tail; $j++) {
          $next = $i + $j >= $max ? "\x00" : $str[$i + $j];
          if ($next < "\x80" || $next > "\xbf") {
            $isUtf8 = false;
            break;
          }
          $sequence .= $next;
        }

        if ($isUtf8) { // yeah, almost sure it's UTF8 already
          $buf .= $sequence;
          $i += $tail;
          continue;
        }

      } else { // 0x80 - 0xbf: needs conversion

        $ordC1 = ord($c1);
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
          continue;
        }
      }

      // not valid UTF8 - convert the single byte from ISO-8859-1 into its 2 bytes UTF8 sequence;
      // the integer shift avoids handing a fractional float to "chr()"
      $cc1 = (chr(ord($c1) >> 6) | "\xc0");
      $cc2 = (($c1 & "\x3f") | "\x80");
      $buf .= $cc1 . $cc2;
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode UTF-8 codepoints
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5533
5534
  /**
   * Convert a string into "ISO-8859"-encoding (Latin-1).
   *
   * Arrays are converted element by element (recursively), everything else is cast to string and
   * decoded via "UTF8::utf8_decode()". An empty string results in ''.
   *
   * @param string|string[] $str <p>The input string (or an array of strings).</p>
   *
   * @return string|string[]
   */
  public static function to_iso8859($str)
  {
    if (!is_array($str)) {
      $str = (string)$str;

      return isset($str[0]) ? self::utf8_decode($str) : '';
    }

    foreach ($str as $key => $value) {
      /** @noinspection AlterInForeachInspection */
      /** @noinspection OffsetOperationsInspection */
      $str[$key] = self::to_iso8859($value);
    }

    return $str;
  }
5562
5563
  /**
   * Strip whitespace or other characters from beginning or end of a UTF-8 string.
   *
   * INFO: This is slower than "trim()"
   *
   * We can only use the original-function, if we use <= 7-Bit in the string / chars
   * but the check for ASCII (7-Bit) costs more time than we can save here.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars [optional] <p>Optional characters to be stripped; if omitted or empty, the unicode
   *                      categories "Z" (separators) and "C" (control characters) are stripped.</p>
   *
   * @return string <p>The trimmed string.</p>
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // "0" is a legitimate character list, although PHP treats that string as falsy
    $useDefaultChars = $chars === INF || (!$chars && $chars !== '0');

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    if ($useDefaultChars) {
      return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
    }

    return self::rtrim(self::ltrim($str, $chars), $chars);
  }
5591 1
5592 7
  /**
   * Makes string's first char uppercase.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The resulting string, '' for an empty input.</p>
   */
  public static function ucfirst($str)
  {
    $str = (string)$str;

    // an empty input has neither a first char nor a rest, so do not hand
    // a possible "false" from "substr()" over to "strtoupper()"
    if (!isset($str[0])) {
      return '';
    }

    return self::strtoupper(self::substr($str, 0, 1)) . self::substr($str, 1);
  }
5603
5604
  /**
   * Alias of "UTF8::ucfirst()": the argument is forwarded unchanged.
   *
   * @see UTF8::ucfirst()
   *
   * @param string $word <p>The input string.</p>
   *
   * @return string
   */
  public static function ucword($word)
  {
    $result = self::ucfirst($word);

    return $result;
  }
5617
5618 1
  /**
   * Uppercase for all words in the string.
   *
   * The string is split at single spaces (" "), the first character of every word is upper-cased via
   * "UTF8::ucfirst()" and the words are joined with single spaces again.
   *
   * @param string   $str        <p>The input string.</p>
   * @param string[] $exceptions [optional] <p>Exclusion for some words (compared strictly, case-sensitive).</p>
   *
   * @return string
   */
  public static function ucwords($str, $exceptions = array())
  {
    $str = (string)$str;

    // check the length instead of the truthiness, otherwise the string "0" would be lost
    if (!isset($str[0])) {
      return '';
    }

    // init
    $words = explode(' ', $str);
    $newwords = array();
    $useExceptions = count($exceptions) > 0;

    foreach ($words as $word) {
      // words from the exception list are kept as they are
      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $word = self::ucfirst($word);
      }
      $newwords[] = $word;
    }

    return implode(' ', $newwords);
  }
5659
5660
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'DÃ¼sseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function urldecode($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // first pass with the native function, then turn "%uXXXX" escapes into hex entities
    $percentDecoded = urldecode($str);
    $str = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', $percentDecoded);

    // "ENT_HTML5" is only referenced on PHP >= 5.4
    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    } else {
      $flags = ENT_QUOTES;
    }

    // pipeline: to UTF-8 -> decode entities -> raw url decode -> fix simple utf8 problems
    $utf8 = self::to_utf8($str);
    $entityDecoded = self::html_entity_decode($utf8, $flags);
    $rawDecoded = rawurldecode($entityDecoded);
    $str = self::fix_simple_utf8($rawDecoded);

    return (string)$str;
  }
5700
5701
  /**
5702
   * Returns an array which maps "urlencoded" (%XX) win1252 bytes to the corresponding UTF-8 characters.
5703
   *
5704
   * @return mixed
5705
   */
5706
  public static function urldecode_fix_win1252_chars()
5707
  {
5708
    static $array = array(
5709
        '%20' => ' ',
5710
        '%21' => '!',
5711
        '%22' => '"',
5712
        '%23' => '#',
5713
        '%24' => '$',
5714
        '%25' => '%',
5715
        '%26' => '&',
5716
        '%27' => "'",
5717
        '%28' => '(',
5718
        '%29' => ')',
5719
        '%2A' => '*',
5720
        '%2B' => '+',
5721
        '%2C' => ',',
5722
        '%2D' => '-',
5723
        '%2E' => '.',
5724
        '%2F' => '/',
5725
        '%30' => '0',
5726
        '%31' => '1',
5727
        '%32' => '2',
5728
        '%33' => '3',
5729
        '%34' => '4',
5730
        '%35' => '5',
5731
        '%36' => '6',
5732
        '%37' => '7',
5733
        '%38' => '8',
5734
        '%39' => '9',
5735
        '%3A' => ':',
5736
        '%3B' => ';',
5737
        '%3C' => '<',
5738
        '%3D' => '=',
5739
        '%3E' => '>',
5740
        '%3F' => '?',
5741
        '%40' => '@',
5742
        '%41' => 'A',
5743
        '%42' => 'B',
5744
        '%43' => 'C',
5745
        '%44' => 'D',
5746
        '%45' => 'E',
5747
        '%46' => 'F',
5748
        '%47' => 'G',
5749
        '%48' => 'H',
5750
        '%49' => 'I',
5751
        '%4A' => 'J',
5752
        '%4B' => 'K',
5753
        '%4C' => 'L',
5754
        '%4D' => 'M',
5755
        '%4E' => 'N',
5756
        '%4F' => 'O',
5757
        '%50' => 'P',
5758
        '%51' => 'Q',
5759
        '%52' => 'R',
5760
        '%53' => 'S',
5761
        '%54' => 'T',
5762
        '%55' => 'U',
5763
        '%56' => 'V',
5764
        '%57' => 'W',
5765
        '%58' => 'X',
5766
        '%59' => 'Y',
5767
        '%5A' => 'Z',
5768
        '%5B' => '[',
5769
        '%5C' => '\\',
5770
        '%5D' => ']',
5771
        '%5E' => '^',
5772
        '%5F' => '_',
5773
        '%60' => '`',
5774
        '%61' => 'a',
5775
        '%62' => 'b',
5776
        '%63' => 'c',
5777
        '%64' => 'd',
5778
        '%65' => 'e',
5779
        '%66' => 'f',
5780
        '%67' => 'g',
5781
        '%68' => 'h',
5782
        '%69' => 'i',
5783
        '%6A' => 'j',
5784
        '%6B' => 'k',
5785
        '%6C' => 'l',
5786
        '%6D' => 'm',
5787
        '%6E' => 'n',
5788
        '%6F' => 'o',
5789
        '%70' => 'p',
5790
        '%71' => 'q',
5791
        '%72' => 'r',
5792
        '%73' => 's',
5793
        '%74' => 't',
5794
        '%75' => 'u',
5795
        '%76' => 'v',
5796
        '%77' => 'w',
5797
        '%78' => 'x',
5798
        '%79' => 'y',
5799
        '%7A' => 'z',
5800
        '%7B' => '{',
5801
        '%7C' => '|',
5802
        '%7D' => '}',
5803
        '%7E' => '~',
5804
        '%7F' => '',
5805
        '%80' => '`',
5806
        '%81' => '',
5807
        '%82' => '‚',
5808
        '%83' => 'ƒ',
5809
        '%84' => '„',
5810
        '%85' => '…',
5811
        '%86' => '†',
5812
        '%87' => '‡',
5813
        '%88' => 'ˆ',
5814
        '%89' => '‰',
5815
        '%8A' => 'Š',
5816
        '%8B' => '‹',
5817
        '%8C' => 'Œ',
5818
        '%8D' => '',
5819
        '%8E' => 'Ž',
5820
        '%8F' => '',
5821
        '%90' => '',
5822
        '%91' => '‘',
5823
        '%92' => '’',
5824
        '%93' => '“',
5825
        '%94' => '”',
5826
        '%95' => '•',
5827
        '%96' => '–',
5828
        '%97' => '—',
5829
        '%98' => '˜',
5830
        '%99' => '™',
5831
        '%9A' => 'š',
5832
        '%9B' => '›',
5833
        '%9C' => 'œ',
5834
        '%9D' => '',
5835
        '%9E' => 'ž',
5836
        '%9F' => 'Ÿ',
5837
        '%A0' => '',
5838
        '%A1' => '¡',
5839
        '%A2' => '¢',
5840
        '%A3' => '£',
5841
        '%A4' => '¤',
5842
        '%A5' => '¥',
5843
        '%A6' => '¦',
5844
        '%A7' => '§',
5845
        '%A8' => '¨',
5846
        '%A9' => '©',
5847
        '%AA' => 'ª',
5848
        '%AB' => '«',
5849
        '%AC' => '¬',
5850
        '%AD' => '',
5851
        '%AE' => '®',
5852
        '%AF' => '¯',
5853
        '%B0' => '°',
5854
        '%B1' => '±',
5855
        '%B2' => '²',
5856
        '%B3' => '³',
5857
        '%B4' => '´',
5858
        '%B5' => 'µ',
5859
        '%B6' => '¶',
5860
        '%B7' => '·',
5861
        '%B8' => '¸',
5862
        '%B9' => '¹',
5863
        '%BA' => 'º',
5864
        '%BB' => '»',
5865
        '%BC' => '¼',
5866
        '%BD' => '½',
5867
        '%BE' => '¾',
5868
        '%BF' => '¿',
5869
        '%C0' => 'À',
5870
        '%C1' => 'Á',
5871
        '%C2' => 'Â',
5872
        '%C3' => 'Ã',
5873
        '%C4' => 'Ä',
5874 1
        '%C5' => 'Å',
5875
        '%C6' => 'Æ',
5876 1
        '%C7' => 'Ç',
5877
        '%C8' => 'È',
5878
        '%C9' => 'É',
5879
        '%CA' => 'Ê',
5880
        '%CB' => 'Ë',
5881
        '%CC' => 'Ì',
5882
        '%CD' => 'Í',
5883
        '%CE' => 'Î',
5884
        '%CF' => 'Ï',
5885
        '%D0' => 'Ð',
5886 6
        '%D1' => 'Ñ',
5887
        '%D2' => 'Ò',
5888 6
        '%D3' => 'Ó',
5889 6
        '%D4' => 'Ô',
5890
        '%D5' => 'Õ',
5891 6
        '%D6' => 'Ö',
5892
        '%D7' => '×',
5893 6
        '%D8' => 'Ø',
5894 3
        '%D9' => 'Ù',
5895
        '%DA' => 'Ú',
5896
        '%DB' => 'Û',
5897
        '%DC' => 'Ü',
5898 6
        '%DD' => 'Ý',
5899
        '%DE' => 'Þ',
5900 6
        '%DF' => 'ß',
5901 1
        '%E0' => 'à',
5902 1
        '%E1' => 'á',
5903 1
        '%E2' => 'â',
5904
        '%E3' => 'ã',
5905 6
        '%E4' => 'ä',
5906
        '%E5' => 'å',
5907
        '%E6' => 'æ',
5908
        '%E7' => 'ç',
5909
        '%E8' => 'è',
5910
        '%E9' => 'é',
5911
        '%EA' => 'ê',
5912
        '%EB' => 'ë',
5913
        '%EC' => 'ì',
5914
        '%ED' => 'í',
5915 6
        '%EE' => 'î',
5916
        '%EF' => 'ï',
5917 6
        '%F0' => 'ð',
5918
        '%F1' => 'ñ',
5919 6
        '%F2' => 'ò',
5920 6
        '%F3' => 'ó',
5921
        '%F4' => 'ô',
5922
        '%F5' => 'õ',
5923 5
        '%F6' => 'ö',
5924 5
        '%F7' => '÷',
5925
        '%F8' => 'ø',
5926 5
        '%F9' => 'ù',
5927 1
        '%FA' => 'ú',
5928 1
        '%FB' => 'û',
5929 1
        '%FC' => 'ü',
5930
        '%FD' => 'ý',
5931 5
        '%FE' => 'þ',
5932
        '%FF' => 'ÿ',
5933
    );
5934
5935
    return $array;
5936
  }
5937
5938
  /**
   * Decodes an UTF-8 string to ISO-8859-1.
   *
   * The input is passed through "UTF8::to_utf8()" first, then the keys of the "$utf8ToWin1252" table are
   * replaced by their values, before the polyfill does the decoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_decode($str)
  {
    // search / replace lists, built once from "self::$utf8ToWin1252"
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // init
    $str = self::to_utf8($str);

    if ($search === null) {
      $search = array_keys(self::$utf8ToWin1252);
      $replace = array_values(self::$utf8ToWin1252);
    }

    $prepared = str_replace($search, $replace, $str);

    return Xml::utf8_decode($prepared);
  }
5966
5967
  /**
   * Encodes an ISO-8859-1 string to UTF-8.
   *
   * After the native encoding, only results which contain the byte "\xC2" are passed through the
   * "$cp1252ToUtf8" replacement table.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_encode($str)
  {
    // search / replace lists, built once from "self::$cp1252ToUtf8"
    static $search = null;
    static $replace = null;

    $encoded = \utf8_encode($str);

    // no "\xC2" byte: nothing to replace
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if ($search === null) {
      $search = array_keys(self::$cp1252ToUtf8);
      $replace = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($search, $replace, $encoded);
  }
5993 1
5994 1
  /**
   * Alias of "UTF8::fix_simple_utf8()": the argument is forwarded unchanged.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   */
  public static function utf8_fix_win1252_chars($str)
  {
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
6007
6008
  /**
   * Returns an array with all utf8 whitespace characters.
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E.
   *
   * @return array <p>
   *               An array with all known whitespace characters as values and the type of whitespace as keys
   *               as defined in above URL.
   *               </p>
   */
  public static function whitespace_table()
  {
    $table = self::$whitespaceTable;

    return $table;
  }
6024 2
6025
  /**
   * Limit the number of words in a string.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $words    <p>The limit of words as integer; values below 1 result in ''.</p>
   * @param string $strAddOn <p>String which is appended, if the input was shortened.</p>
   *
   * @return string
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $limit = (int)$words;

    if ($limit < 1) {
      return '';
    }

    // optional leading whitespace, followed by at most "$limit" words incl. their trailing whitespace
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $limit . '}/u';
    preg_match($pattern, $str, $matches);

    // only shorten, if the match does not already cover the whole string
    if (isset($matches[0]) && self::strlen($str) !== self::strlen($matches[0])) {
      return self::rtrim($matches[0]) . $strAddOn;
    }

    return $str;
  }
6060 6
6061 6
  /**
   * Wraps a string to a given number of characters
   *
   * The wrapping itself is done by the native "wordwrap()" on a single-byte mask of the string
   * ("#" for an existing break, " " for a space, "?" for every other character); the resulting
   * break positions are then applied to the list of the real (multi-byte) characters.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>The input string.</p>
   * @param int    $width [optional] <p>The column width.</p>
   * @param string $break [optional] <p>The line is broken using the optional break parameter.</p>
   * @param bool   $cut   [optional] <p>
   *                      If the cut is set to true, the string is
   *                      always wrapped at or before the specified width. So if you have
   *                      a word that is larger than the given width, it is broken apart.
   *                      </p>
   *
   * @return string <p>The given string wrapped at the specified column.</p>
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if (!isset($str[0], $break[0])) {
      return '';
    }

    $segments = explode($break, $str);

    if (1 === count($segments) && '' === $segments[0]) {
      return '';
    }

    // build the list of real characters and the single-byte mask in parallel
    $mask = '';
    $chars = array();
    foreach ($segments as $index => $segment) {

      // every segment but the first one was preceded by a break in the input
      if ($index > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        if (' ' === $char) {
          $mask .= ' ';
        } else {
          $mask .= '?';
        }
      }
    }

    $mask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $charIndex = 0; // next unconsumed entry of "$chars"
    $cursor = -1;   // current position within the mask
    $breakPos = -1; // position of the current "#" within the mask

    while (false !== $breakPos = self::strpos($mask, '#', $breakPos + 1)) {

      // copy everything up to the break position
      ++$cursor;
      while ($cursor < $breakPos) {
        $wrapped .= $chars[$charIndex];
        unset($chars[$charIndex]);
        ++$charIndex;
        ++$cursor;
      }

      // the break replaces an original break or a space
      if ($break === $chars[$charIndex] || ' ' === $chars[$charIndex]) {
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      $wrapped .= $break;
    }

    // append whatever is left after the last break
    return $wrapped . implode('', $chars);
  }
6132
6133
  /**
   * Returns an array of Unicode White Space characters.
   *
   * @return array <p>An array with numeric code point as key and White Space Character as value.</p>
   */
  public static function ws()
  {
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6142
6143
}
6144