Completed
Push — master ( 92b2e4...40d497 )
by Lars
06:02
created

UTF8::strpbrk()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 15
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 3.0175

Importance

Changes 5
Bugs 1 Features 2
Metric Value
c 5
b 1
f 2
dl 0
loc 15
ccs 7
cts 8
cp 0.875
rs 9.4285
cc 3
eloc 9
nc 3
nop 2
crap 3.0175
1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
82
   * Bom => Byte-Length
83
   *
84
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
85
   *
86
   * @var array
87
   */
88
  private static $bom = array(
89
      "\xef\xbb\xbf"     => 3, // UTF-8 BOM
90
      ''              => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...)
91
      "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM
92
      "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM
93
      "\xfe\xff"         => 2, // UTF-16 (BE) BOM
94
      'þÿ'               => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
95
      "\xff\xfe"         => 2, // UTF-16 (LE) BOM
96
      'ÿþ'               => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
97
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    //HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
805
   * __construct()
806
   */
807 1
  public function __construct()
  {
    // Creating an instance only triggers the environment probe;
    // checkForSupport() skips the work once its results are cached.
    self::checkForSupport();
  }
811
812
  /**
813
   * Return the character at the specified position: $str[1] like functionality.
814
   *
815
   * @param string $str <p>A UTF-8 string.</p>
816
   * @param int    $pos <p>The position of character to return.</p>
817
   *
818
   * @return string <p>Single Multi-Byte character.</p>
819
   */
820 2
  public static function access($str, $pos)
  {
    // Delegate to self::substr() with a length of one to pick out
    // the character located at $pos.
    $character = self::substr($str, $pos, 1);

    return $character;
  }
824
825
  /**
826
   * Prepends UTF-8 BOM character to the string and returns the whole string.
827
   *
828
   * INFO: If the string already starts with a BOM, the input string is returned unchanged.
829
   *
830
   * @param string $str <p>The input string.</p>
831
   *
832
   * @return string <p>The output string that contains BOM.</p>
833
   */
834 1
  public static function add_bom_to_string($str)
  {
    // Anything other than an explicit "false" from string_has_bom()
    // means no BOM has to be added: return the input untouched.
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    // No BOM detected: prefix the UTF-8 BOM.
    return self::bom() . $str;
  }
842
843
  /**
844
   * Convert binary into a string.
845
   *
846
   * @param mixed $bin 1|0
847
   *
848
   * @return string
849
   */
850 1
  public static function binary_to_str($bin)
  {
    // The previous implementation used base_convert() + pack('H*'), which
    //  - loses precision on long inputs (base_convert() works on floats), and
    //  - mis-aligns the result when the hex form has an odd number of digits
    //    (e.g. '1000' became "\x80" instead of "\x08").
    // Converting byte-by-byte avoids both problems.

    // Keep only binary digits and drop leading zeros, so the input is still
    // interpreted as one big number exactly like base_convert() did.
    $bits = ltrim(preg_replace('/[^01]/', '', (string)$bin), '0');

    // Left-pad to a whole number of bytes. At least one byte is always
    // produced, so empty / all-zero input still yields "\x00" as before.
    $length = max(8, (int)(ceil(strlen($bits) / 8) * 8));
    $bits = str_pad($bits, $length, '0', STR_PAD_LEFT);

    $str = '';
    foreach (str_split($bits, 8) as $byte) {
      // Global chr(): turns the 0-255 value of this octet into a raw byte.
      $str .= \chr(bindec($byte));
    }

    return $str;
  }
854
855
  /**
856
   * Returns the UTF-8 Byte Order Mark Character.
857
   *
858
   * @return string UTF-8 Byte Order Mark
859
   */
860 2
  public static function bom()
  {
    // The three bytes EF BB BF form the UTF-8 encoded byte order mark.
    $utf8Bom = "\xEF\xBB\xBF";

    return $utf8Bom;
  }
864
865
  /**
866
   * @alias of UTF8::chr_map()
867
   * @see   UTF8::chr_map()
868
   *
869
   * @param string|array $callback
870
   * @param string       $str
871
   *
872
   * @return array
873
   */
874 1
  public static function callback($callback, $str)
  {
    // Pure alias: forward both arguments unchanged to chr_map().
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
880
   * This method will auto-detect your server environment for UTF-8 support.
881
   *
882
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
883
   */
884 2
  public static function checkForSupport()
  {
    // The probe results are cached in self::$support; nothing to do
    // when an earlier call has already filled the cache.
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      return;
    }

    // Set the marker before running the probes (same order as before).
    self::$support['already_checked_via_portable_utf8'] = true;

    // Record the result of each probe under its own key.
    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
899
   * Generates a UTF-8 encoded character from the given code point.
900
   *
901
   * INFO: opposite to UTF8::ord()
902
   *
903
   * @param int $code_point <p>The code point for which to generate a character.</p>
904
   *
905
   * @return string|null <p>Multi-Byte character, returns null on failure to encode.</p>
906
   */
907 9
  public static function chr($code_point)
  {
    // Integer view of the input; compared against the raw value further down.
    $codePoint = (int)$code_point;

    // Make sure the environment probe has filled self::$support.
    if (isset(self::$support['already_checked_via_portable_utf8']) === false) {
      self::checkForSupport();
    }

    // When IntlChar is available, the raw input is handed to it directly.
    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // Input that is not already a plain integer goes through hex_to_int().
    if ($codePoint !== $code_point) {
      $codePoint = self::hex_to_int($code_point);
    }

    // Loose falsiness test: code point 0 is rejected here, as is any
    // falsy result coming back from hex_to_int().
    if (!$codePoint) {
      return null;
    }

    // Build the numeric character reference and let the decoder turn it
    // into the actual character.
    return self::html_entity_decode('&#' . $codePoint . ';', ENT_QUOTES);
  }
930
931
  /**
   * Runs a callback on every character of a string.
   *
   * @param string|array $callback <p>The callback, invoked once per character.</p>
   * @param string       $str      <p>UTF-8 string whose characters are passed to the callback.</p>
   *
   * @return array <p>The callback results, one entry per character.</p>
   */
  public static function chr_map($callback, $str)
  {
    // UTF8::split() yields the characters; array_map() applies the callback to each.
    return array_map($callback, self::split($str));
  }
945
946
  /**
   * Generates an array of byte length of each character of a Unicode string.
   *
   * 1 byte => U+0000  - U+007F
   * 2 byte => U+0080  - U+07FF
   * 3 byte => U+0800  - U+FFFF
   * 4 byte => U+10000 - U+10FFFF
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return array <p>An array of byte lengths of each character; an empty array for an
   *               empty string.</p>
   */
  public static function chr_size_list($str)
  {
    $str = (string)$str;

    // Check for emptiness via the first offset: a plain truthiness test would
    // also reject the one-character string "0", which has a size list of its own.
    if (!isset($str[0])) {
      return array();
    }

    // strlen() counts bytes, so it reports the encoded size of each character.
    return array_map('strlen', self::split($str));
  }
966
967
  /**
   * Decodes the first UTF-8 sequence of a string into its decimal code point.
   *
   * @param string $char <p>The input character.</p>
   *
   * @return int
   */
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;
    $code = self::ord($char[0]);

    // 0xxxxxxx: the high bit is clear, the byte is the code point.
    if (($code & 0x80) === 0) {
      return $code;
    }

    // lead-byte mask => array(expected marker bits, length of the sequence)
    $leadBytes = array(
        0xe0 => array(0xc0, 2), // 110xxxxx
        0xf0 => array(0xe0, 3), // 1110xxxx
        0xf8 => array(0xf0, 4), // 11110xxx
    );

    $bytes = 1;
    foreach ($leadBytes as $mask => $info) {
      if (($code & $mask) === $info[0]) {
        $bytes = $info[1];
        // Strip the marker bits, keeping only the payload of the lead byte.
        $code &= ~$info[0];
        break;
      }
    }

    // 10xxxxxx: shift in the payload of every following byte.
    for ($offset = 1; $offset < $bytes; ++$offset) {
      $code = ($code << 6) + (self::ord($char[$offset]) & ~0x80);
    }

    return $code;
  }
1006
1007
  /**
   * Returns the hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * @param string $char <p>The input character.</p>
   * @param string $pfix [optional] <p>Prefix handed to UTF8::int_to_hex().</p>
   *
   * @return string <p>The code point encoded as U+xxxx.</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    // character => integer code point => prefixed hex notation
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1019
1020
  /**
   * Cuts a string into chunks and glues them back together with a line ending.
   *
   * The separator is only placed between chunks, not after the last one.
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) inserted between two chunks.</p>
   *
   * @return string <p>The chunked string.</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    return implode($end, $chunks);
  }
1033
1034
  /**
   * Strips every byte that is not part of a UTF-8 sequence from a string, plus optional extras.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 collects runs of well-formed sequences; groups 2 and 3 match stray bytes.
    $utf8Pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // Replacing each match with group 1 keeps the valid runs and drops the stray bytes.
    $cleaned = preg_replace($utf8Pattern, '$1', $str);

    $cleaned = self::replace_diamond_question_mark($cleaned, '');
    $cleaned = self::remove_invisible_characters($cleaned);

    // Optional steps; each one only runs for a strict boolean true.
    if ($normalize_whitespace === true) {
      $cleaned = self::normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
      $cleaned = self::normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
      $cleaned = self::removeBOM($cleaned);
    }

    return $cleaned;
  }
1082
1083
  /**
   * Repairs simple encoding errors in a string and then cleans it via UTF8::clean().
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // fixed ISO <-> UTF-8 Errors
    $fixed = self::fix_simple_utf8($str);

    // clean() is called with: remove BOM = true, normalize whitespace = true,
    // normalize MS Word chars = false, keep non-breaking spaces = true.
    // It also removes non UTF-8 bytes, the diamond question mark (�) and
    // invisible characters (e.g. "\0").
    return (string)self::clean($fixed, true, true, false, true);
  }
1110
1111
  /**
   * Converts a string, or an array of strings, into an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If True, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    $ownClass = '\\voku\\helper\\UTF8';

    // A string is split into characters first; anything else is mapped as given.
    $chars = is_string($arg) ? self::split($arg) : $arg;

    $points = array_map(array($ownClass, 'ord'), $chars);

    if (!$u_style) {
      return $points;
    }

    // int_to_hex() is called with its default prefix here.
    return array_map(array($ownClass, 'int_to_hex'), $points);
  }
1148
1149
  /**
   * Counts how often each character occurs in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array of Character as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // Split into chunks of one character, then tally identical entries.
    $chars = self::split($str, 1, $cleanUtf8);

    return array_count_values($chars);
  }
1162
1163
  /**
   * Returns the UTF-8 character for a decimal code.
   *
   * @param int $code
   *
   * @return string
   */
  public static function decimal_to_chr($code)
  {
    // Write the code as a hexadecimal HTML entity ...
    $entity = '&#x' . dechex($code) . ';';

    // ... and let mbstring decode that entity into UTF-8.
    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1178
1179
  /**
   * Converts a string into the requested charset encoding.
   *
   * The current encoding is detected via UTF8::str_detect_encoding(). Conversions to
   * 'UTF-8' and 'ISO-8859-1' are delegated to UTF8::to_utf8() / UTF8::to_iso8859(),
   * everything else goes through mb_convert_encoding(). The input is returned unchanged
   * when nothing could be detected or when the conversion yields an empty result.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>If true, convert even when the detected encoding already
   *                         equals the requested one; otherwise the string is only converted
   *                         when both differ.</p>
   *
   * @return string
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    // An empty string or an empty target encoding leaves nothing to do.
    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $sourceEncoding = self::str_detect_encoding($str);

    // Loose check on purpose: false and an empty result both count as "not detected".
    if (!$sourceEncoding) {
      return $str;
    }

    // Without $force a string that already has the target encoding is left alone.
    if ($force !== true && $sourceEncoding === $encoding) {
      return $str;
    }

    if (
        $encoding === 'UTF-8'
        &&
        (
            $force === true
            ||
            in_array($sourceEncoding, array('UTF-8', 'WINDOWS-1252', 'ISO-8859-1'), true)
        )
    ) {
      return self::to_utf8($str);
    }

    if (
        $encoding === 'ISO-8859-1'
        &&
        (
            $force === true
            ||
            in_array($sourceEncoding, array('ISO-8859-1', 'UTF-8'), true)
        )
    ) {
      return self::to_iso8859($str);
    }

    $converted = \mb_convert_encoding($str, $encoding, $sourceEncoding);

    // Fall back to the input when the conversion produced a falsy result.
    return $converted ? $converted : $str;
  }
1255
1256
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>Name of the file to read.</p>
   * @param int|null      $flags         [optional] <p>Forwarded as the second argument of the native
   *                                     file_get_contents() (use_include_path, e.g.
   *                                     FILE_USE_INCLUDE_PATH); null is treated as false.</p>
   * @param resource|null $context       [optional] <p>A valid context resource created with
   *                                     stream_context_create. If null and $timeout is non-zero,
   *                                     a context carrying the "http" timeout is created.</p>
   * @param int|null      $offset        [optional] <p>The offset where the reading starts; null is
   *                                     treated as 0.</p>
   * @param int|null      $maxlen        [optional] <p>Maximum length of data read. It is only
   *                                     forwarded when it is an integer; otherwise the file is
   *                                     read until its end.</p>
   * @param int           $timeout       <p>The time in seconds for the timeout.</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The read data, or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;

    // NOTE(review): FILTER_SANITIZE_STRING strips tags and encodes quotes, so file names
    // containing such characters are altered; the filter is deprecated as of PHP 8.1.
    // Kept as-is because callers may rely on it - confirm before removing.
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    // Only build a timeout context when the caller did not supply one.
    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // The native function takes a boolean and an integer here. Normalize the null
    // defaults to the values they were coerced to before (false / 0) instead of
    // passing null to non-nullable parameters (deprecated since PHP 8.1).
    $useIncludePath = (bool)$flags;
    $offset = ($offset === null) ? 0 : $offset;

    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1373
1374
  /**
   * Tells whether the content of a file starts with a BOM (Byte Order Mark).
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function file_has_bom($file_path)
  {
    // Read the file with the native function and hand the content to string_has_bom().
    $contents = file_get_contents($file_path);

    return self::string_has_bom($contents);
  }
1385
1386
  /**
   * Normalizes a value to the requested Unicode normalization form (NFC by default).
   *
   * Arrays and objects are processed recursively, member by member. For strings, "\r\n"
   * and "\r" become "\n"; strings containing bytes >= 0x80 that are not yet normalized
   * are normalized, falling back to UTF8::encode('UTF-8', ...) when that yields no usable
   * result. Values of any other type are returned untouched.
   *
   * @param mixed  $var
   * @param int    $normalization_form
   * @param string $leading_combining <p>Prefix added when the result starts with a
   *                                  combining mark.</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    if (is_array($var)) {
      foreach ($var as $key => $value) {
        $var[$key] = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if (is_object($var)) {
      foreach ($var as $key => $value) {
        $var->{$key} = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if (!is_string($var)) {
      return $var;
    }

    if (false !== strpos($var, "\r")) {
      // Workaround https://bugs.php.net/65732
      $var = str_replace(array("\r\n", "\r"), "\n", $var);
    }

    // Strings without any byte >= 0x80 need no normalization.
    if (!preg_match('/[\x80-\xFF]/', $var)) {
      return $var;
    }

    if (\Normalizer::isNormalized($var, $normalization_form)) {
      // Non-empty placeholder so that the isset() check below succeeds.
      $normalized = '-';
    } else {
      $normalized = \Normalizer::normalize($var, $normalization_form);

      if (isset($normalized[0])) {
        $var = $normalized;
      } else {
        $var = self::encode('UTF-8', $var);
      }
    }

    if ($var[0] >= "\x80" && isset($normalized[0], $leading_combining[0]) && preg_match('/^\p{Mn}/u', $var)) {
      // Prevent leading combining chars
      // for NFC-safe concatenations.
      $var = $leading_combining . $var;
    }

    return $var;
  }
1438
1439
  /**
   * "filter_input()"-wrapper that passes the result through UTF8::filter().
   *
   * @param int    $type
   * @param string $var
   * @param int    $filter
   * @param mixed  $option
   *
   * @return mixed
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    // $option is only forwarded when the caller actually supplied a fourth argument.
    $value = func_num_args() < 4
        ? filter_input($type, $var, $filter)
        : filter_input($type, $var, $filter, $option);

    return self::filter($value);
  }
1459
1460
  /**
   * "filter_input_array()"-wrapper that passes the result through UTF8::filter().
   *
   * @param int   $type
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    // With a single argument the native function is called without a definition.
    $values = func_num_args() < 2
        ? filter_input_array($type)
        : filter_input_array($type, $definition, $add_empty);

    return self::filter($values);
  }
1479
1480
  /**
   * "filter_var()"-wrapper that passes the result through UTF8::filter().
   *
   * @param mixed $var
   * @param int   $filter
   * @param mixed $option
   *
   * @return mixed
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    // $option is only forwarded when the caller actually supplied a third argument.
    $value = func_num_args() < 3
        ? filter_var($var, $filter)
        : filter_var($var, $filter, $option);

    return self::filter($value);
  }
1499
1500
  /**
   * "filter_var_array()"-wrapper that passes the result through UTF8::filter().
   *
   * @param array $data
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    // With a single argument the native function is called without a definition.
    $values = func_num_args() < 2
        ? filter_var_array($data)
        : filter_var_array($data, $definition, $add_empty);

    return self::filter($values);
  }
1519
1520
  /**
   * Checks that a string does not have more characters than a given limit.
   *
   * @param string $str      The original string to be checked.
   * @param int    $box_size The size in number of chars to be checked against string.
   *
   * @return bool true if string is less than or equal to $box_size, false otherwise.
   */
  public static function fits_inside($str, $box_size)
  {
    // Length as reported by UTF8::strlen().
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1532
1533
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * Every key of the "$brokenUtf8ToUtf8" table found in the string is replaced
   * by the value stored for it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string
   */
  public static function fix_simple_utf8($str)
  {
    // Search / replace lists, derived from the table on the first non-empty call.
    static $replacementLists = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($replacementLists === null) {
      $replacementLists = array(
          'search'  => array_keys(self::$brokenUtf8ToUtf8),
          'replace' => array_values(self::$brokenUtf8ToUtf8),
      );
    }

    return str_replace($replacementLists['search'], $replacementLists['replace'], $str);
  }
1564
1565
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed <p>The fixed string, or an array with every entry fixed (keys are kept).</p>
   */
  public static function fix_utf8($str)
  {
    // Arrays are handled entry by entry; array_map() keeps the keys of a single array.
    if (is_array($str)) {
      return array_map(array(__CLASS__, 'fix_utf8'), $str);
    }

    // Decode and re-encode until a round no longer changes the string.
    $previous = '';
    while ($previous !== $str) {
      $previous = $str;
      $str = self::to_utf8(self::utf8_decode($str));
    }

    return $str;
  }
1593
1594
  /**
1595
   * Get the text direction ('RTL' or 'LTR') of a specific character.
1596
   *
1597
   * @param string $char
1598
   *
1599
   * @return string <p>'RTL' or 'LTR'</p>
1600
   */
1601 1
  public static function getCharDirection($char)
  {
    // Make sure the feature-detection table has been filled.
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // Preferred path: ask IntlChar. If its answer is in neither list
    // below, fall through to the manual code point table.
    if (self::$support['intlChar'] === true) {
      $tmpReturn = \IntlChar::charDirection($char);

      // from "IntlChar"-Class
      $charDirection = array(
          'RTL' => array(1, 13, 14, 15, 21),
          'LTR' => array(0, 11, 12, 20),
      );

      if (in_array($tmpReturn, $charDirection['LTR'], true)) {
        return 'LTR';
      } elseif (in_array($tmpReturn, $charDirection['RTL'], true)) {
        return 'RTL';
      }
    }

    // The class is final, so "self::" is used here like everywhere else
    // in this file (late static binding has no effect).
    $c = self::chr_to_decimal($char);

    // Quick reject: every RTL code point checked below lies in this range.
    if (!(0x5be <= $c && 0x10b7f >= $c)) {
      return 'LTR';
    }

    if (0x85e >= $c) {

      // RTL code points / ranges between U+05BE and U+085E.
      if (0x5be === $c ||
          0x5c0 === $c ||
          0x5c3 === $c ||
          0x5c6 === $c ||
          (0x5d0 <= $c && 0x5ea >= $c) ||
          (0x5f0 <= $c && 0x5f4 >= $c) ||
          0x608 === $c ||
          0x60b === $c ||
          0x60d === $c ||
          0x61b === $c ||
          (0x61e <= $c && 0x64a >= $c) ||
          (0x66d <= $c && 0x66f >= $c) ||
          (0x671 <= $c && 0x6d5 >= $c) ||
          (0x6e5 <= $c && 0x6e6 >= $c) ||
          (0x6ee <= $c && 0x6ef >= $c) ||
          (0x6fa <= $c && 0x70d >= $c) ||
          0x710 === $c ||
          (0x712 <= $c && 0x72f >= $c) ||
          (0x74d <= $c && 0x7a5 >= $c) ||
          0x7b1 === $c ||
          (0x7c0 <= $c && 0x7ea >= $c) ||
          (0x7f4 <= $c && 0x7f5 >= $c) ||
          0x7fa === $c ||
          (0x800 <= $c && 0x815 >= $c) ||
          0x81a === $c ||
          0x824 === $c ||
          0x828 === $c ||
          (0x830 <= $c && 0x83e >= $c) ||
          (0x840 <= $c && 0x858 >= $c) ||
          0x85e === $c
      ) {
        return 'RTL';
      }

    } elseif (0x200f === $c) {

      return 'RTL';

    } elseif (0xfb1d <= $c) {

      // RTL code points / ranges between U+FB1D and U+10B7F.
      if (0xfb1d === $c ||
          (0xfb1f <= $c && 0xfb28 >= $c) ||
          (0xfb2a <= $c && 0xfb36 >= $c) ||
          (0xfb38 <= $c && 0xfb3c >= $c) ||
          0xfb3e === $c ||
          (0xfb40 <= $c && 0xfb41 >= $c) ||
          (0xfb43 <= $c && 0xfb44 >= $c) ||
          (0xfb46 <= $c && 0xfbc1 >= $c) ||
          (0xfbd3 <= $c && 0xfd3d >= $c) ||
          (0xfd50 <= $c && 0xfd8f >= $c) ||
          (0xfd92 <= $c && 0xfdc7 >= $c) ||
          (0xfdf0 <= $c && 0xfdfc >= $c) ||
          (0xfe70 <= $c && 0xfe74 >= $c) ||
          (0xfe76 <= $c && 0xfefc >= $c) ||
          (0x10800 <= $c && 0x10805 >= $c) ||
          0x10808 === $c ||
          (0x1080a <= $c && 0x10835 >= $c) ||
          (0x10837 <= $c && 0x10838 >= $c) ||
          0x1083c === $c ||
          (0x1083f <= $c && 0x10855 >= $c) ||
          (0x10857 <= $c && 0x1085f >= $c) ||
          (0x10900 <= $c && 0x1091b >= $c) ||
          (0x10920 <= $c && 0x10939 >= $c) ||
          0x1093f === $c ||
          0x10a00 === $c ||
          (0x10a10 <= $c && 0x10a13 >= $c) ||
          (0x10a15 <= $c && 0x10a17 >= $c) ||
          (0x10a19 <= $c && 0x10a33 >= $c) ||
          (0x10a40 <= $c && 0x10a47 >= $c) ||
          (0x10a50 <= $c && 0x10a58 >= $c) ||
          (0x10a60 <= $c && 0x10a7f >= $c) ||
          (0x10b00 <= $c && 0x10b35 >= $c) ||
          (0x10b40 <= $c && 0x10b55 >= $c) ||
          (0x10b58 <= $c && 0x10b72 >= $c) ||
          (0x10b78 <= $c && 0x10b7f >= $c)
      ) {
        return 'RTL';
      }
    }

    // Everything not matched above is treated as left-to-right.
    return 'LTR';
  }
1713
1714
  /**
1715
   * Get data from "/data/*.php".
1716
   *
1717
   * @param string $file
1718
   *
1719
   * @return bool|string|array|int <p>Will return false on error.</p>
1720
   */
1721 1
  private static function getData($file)
  {
    $path = __DIR__ . '/data/' . $file . '.php';

    // Guard clause: a missing data file is reported as false.
    if (!file_exists($path)) {
      return false;
    }

    // The data file's own return value is handed back to the caller.
    /** @noinspection PhpIncludeInspection */
    return require $path;
  }
1731
1732
  /**
1733
   * Converts hexadecimal U+xxxx code point representation to integer.
1734
   *
1735
   * INFO: opposite to UTF8::int_to_hex()
1736
   *
1737
   * @param string $str <p>The hexadecimal code point representation.</p>
1738
   *
1739
   * @return int|false <p>The code point, or false on failure.</p>
1740
   */
1741 2
  public static function hex_to_int($str)
  {
    // Accepts an optional "\u" or "U+" prefix followed by 4 to 6
    // hexadecimal digits (case-insensitive).
    //
    // The digit class is limited to [0-9a-f]: with a wider class, input
    // such as "zzzz" matched and intval() turned it into 0 instead of
    // the documented false.
    if (preg_match('/^(?:\\\u|U\+|)([0-9a-f]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
1749
1750
  /**
1751
   * alias for "UTF8::html_entity_decode()"
1752
   *
1753
   * @see UTF8::html_entity_decode()
1754
   *
1755
   * @param string $str
1756
   * @param int    $flags
1757
   * @param string $encoding
1758
   *
1759
   * @return string
1760
   */
1761 1
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    // Pure alias: all arguments are forwarded unchanged.
    $decoded = self::html_entity_decode($str, $flags, $encoding);

    return $decoded;
  }
1765
1766
  /**
1767
   * Converts a UTF-8 string to a series of HTML numbered entities.
1768
   *
1769
   * INFO: opposite to UTF8::html_decode()
1770
   *
1771
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
1772
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
1773
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
1774
   *
1775
   * @return string <p>HTML numbered entities.</p>
1776
   */
1777 2
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // First code point to convert: everything (0x00), or only
      // non-ASCII characters (0x80) when ASCII should be kept.
      $startCode = ($keepAsciiChars === true) ? 0x80 : 0x00;

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = (start, end, offset, mask).
      // The range ends at U+10FFFF with a 21-bit mask so that characters
      // outside the Basic Multilingual Plane (4-byte UTF-8 sequences)
      // are encoded too; a 0xffff limit left them untouched.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // Fallback without mbstring: encode character by character.
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1807
1808
  /**
1809
   * UTF-8 version of html_entity_decode()
1810
   *
1811
   * The reason we are not using html_entity_decode() by itself is because
1812
   * while it is not technically correct to leave out the semicolon
1813
   * at the end of an entity most browsers will still interpret the entity
1814
   * correctly. html_entity_decode() does not convert entities without
1815
   * semicolons, so we are left with our own little solution here. Bummer.
1816
   *
1817
   * Convert all HTML entities to their applicable characters
1818
   *
1819
   * INFO: opposite to UTF8::html_encode()
1820
   *
1821
   * @link http://php.net/manual/en/function.html-entity-decode.php
1822
   *
1823
   * @param string $str      <p>
1824
   *                         The input string.
1825
   *                         </p>
1826
   * @param int    $flags    [optional] <p>
1827
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1828
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1829
   *                         <table>
1830
   *                         Available <i>flags</i> constants
1831
   *                         <tr valign="top">
1832
   *                         <td>Constant Name</td>
1833
   *                         <td>Description</td>
1834
   *                         </tr>
1835
   *                         <tr valign="top">
1836
   *                         <td><b>ENT_COMPAT</b></td>
1837
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1838
   *                         </tr>
1839
   *                         <tr valign="top">
1840
   *                         <td><b>ENT_QUOTES</b></td>
1841
   *                         <td>Will convert both double and single quotes.</td>
1842
   *                         </tr>
1843
   *                         <tr valign="top">
1844
   *                         <td><b>ENT_NOQUOTES</b></td>
1845
   *                         <td>Will leave both double and single quotes unconverted.</td>
1846
   *                         </tr>
1847
   *                         <tr valign="top">
1848
   *                         <td><b>ENT_HTML401</b></td>
1849
   *                         <td>
1850
   *                         Handle code as HTML 4.01.
1851
   *                         </td>
1852
   *                         </tr>
1853
   *                         <tr valign="top">
1854
   *                         <td><b>ENT_XML1</b></td>
1855
   *                         <td>
1856
   *                         Handle code as XML 1.
1857
   *                         </td>
1858
   *                         </tr>
1859
   *                         <tr valign="top">
1860
   *                         <td><b>ENT_XHTML</b></td>
1861
   *                         <td>
1862
   *                         Handle code as XHTML.
1863
   *                         </td>
1864
   *                         </tr>
1865
   *                         <tr valign="top">
1866
   *                         <td><b>ENT_HTML5</b></td>
1867
   *                         <td>
1868
   *                         Handle code as HTML 5.
1869
   *                         </td>
1870
   *                         </tr>
1871
   *                         </table>
1872
   *                         </p>
1873
   * @param string $encoding [optional] <p>Encoding to use.</p>
1874
   *
1875
   * @return string <p>The decoded string.</p>
1876
   */
1877 18
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // Nothing to do for empty input or input without any ampersand.
    if ($str === '') {
      return '';
    }
    if (strpos($str, '&') === false) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // Default flags: ENT_HTML5 is only used when the PHP version check passes.
    if ($flags === null) {
      $flags = (Bootup::is_php('5.4') === true) ? (ENT_COMPAT | ENT_HTML5) : ENT_COMPAT;
    }

    // Decodes a single decimal entity, but leaves it encoded when the
    // result would be a double or single quote.
    $decodeDecimalEntity = function ($matches) {
      $decoded = \mb_convert_encoding($matches[0], 'UTF-8', 'HTML-ENTITIES');

      return ($decoded === '"' || $decoded === "'") ? $matches[0] : $decoded;
    };

    // Repeat until a full pass leaves the string unchanged, so that
    // multiply-encoded entities are resolved as well.
    do {
      $before = $str;

      $str = preg_replace_callback("/&#\d{2,5};/", $decodeDecimalEntity, $str);

      // decode numeric & UTF16 two byte entities
      // (numeric entities lacking the trailing ";" get one appended first)
      $withSemicolons = preg_replace(
          '/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS',
          '$1;',
          $str
      );
      $str = html_entity_decode($withSemicolons, $flags, $encoding);

    } while ($before !== $str);

    return $str;
  }
1929
1930
  /**
1931
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1932
   *
1933
   * @link http://php.net/manual/en/function.htmlentities.php
1934
   *
1935
   * @param string $str           <p>
1936
   *                              The input string.
1937
   *                              </p>
1938
   * @param int    $flags         [optional] <p>
1939
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
1940
   *                              invalid code unit sequences and the used document type. The default is
1941
   *                              ENT_COMPAT | ENT_HTML401.
1942
   *                              <table>
1943
   *                              Available <i>flags</i> constants
1944
   *                              <tr valign="top">
1945
   *                              <td>Constant Name</td>
1946
   *                              <td>Description</td>
1947
   *                              </tr>
1948
   *                              <tr valign="top">
1949
   *                              <td><b>ENT_COMPAT</b></td>
1950
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
1951
   *                              </tr>
1952
   *                              <tr valign="top">
1953
   *                              <td><b>ENT_QUOTES</b></td>
1954
   *                              <td>Will convert both double and single quotes.</td>
1955
   *                              </tr>
1956
   *                              <tr valign="top">
1957
   *                              <td><b>ENT_NOQUOTES</b></td>
1958
   *                              <td>Will leave both double and single quotes unconverted.</td>
1959
   *                              </tr>
1960
   *                              <tr valign="top">
1961
   *                              <td><b>ENT_IGNORE</b></td>
1962
   *                              <td>
1963
   *                              Silently discard invalid code unit sequences instead of returning
1964
   *                              an empty string. Using this flag is discouraged as it
1965
   *                              may have security implications.
1966
   *                              </td>
1967
   *                              </tr>
1968
   *                              <tr valign="top">
1969
   *                              <td><b>ENT_SUBSTITUTE</b></td>
1970
   *                              <td>
1971
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
1972
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
1973
   *                              </td>
1974
   *                              </tr>
1975
   *                              <tr valign="top">
1976
   *                              <td><b>ENT_DISALLOWED</b></td>
1977
   *                              <td>
1978
   *                              Replace invalid code points for the given document type with a
1979
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
1980
   *                              (otherwise) instead of leaving them as is. This may be useful, for
1981
   *                              instance, to ensure the well-formedness of XML documents with
1982
   *                              embedded external content.
1983
   *                              </td>
1984
   *                              </tr>
1985
   *                              <tr valign="top">
1986
   *                              <td><b>ENT_HTML401</b></td>
1987
   *                              <td>
1988
   *                              Handle code as HTML 4.01.
1989
   *                              </td>
1990
   *                              </tr>
1991
   *                              <tr valign="top">
1992
   *                              <td><b>ENT_XML1</b></td>
1993
   *                              <td>
1994
   *                              Handle code as XML 1.
1995
   *                              </td>
1996
   *                              </tr>
1997
   *                              <tr valign="top">
1998
   *                              <td><b>ENT_XHTML</b></td>
1999
   *                              <td>
2000
   *                              Handle code as XHTML.
2001
   *                              </td>
2002
   *                              </tr>
2003
   *                              <tr valign="top">
2004
   *                              <td><b>ENT_HTML5</b></td>
2005
   *                              <td>
2006
   *                              Handle code as HTML 5.
2007
   *                              </td>
2008
   *                              </tr>
2009
   *                              </table>
2010
   *                              </p>
2011
   * @param string $encoding      [optional] <p>
2012
   *                              Like <b>htmlspecialchars</b>,
2013
   *                              <b>htmlentities</b> takes an optional third argument
2014
   *                              <i>encoding</i> which defines encoding used in
2015
   *                              conversion.
2016
   *                              Although this argument is technically optional, you are highly
2017
   *                              encouraged to specify the correct value for your code.
2018
   *                              </p>
2019
   * @param bool   $double_encode [optional] <p>
2020
   *                              When <i>double_encode</i> is turned off PHP will not
2021
   *                              encode existing html entities. The default is to convert everything.
2022
   *                              </p>
2023
   *
2024
   *
2025
   * @return string the encoded string.
2026
   * </p>
2027
   * <p>
2028
   * If the input <i>string</i> contains an invalid code unit
2029
   * sequence within the given <i>encoding</i> an empty string
2030
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2031
   * <b>ENT_SUBSTITUTE</b> flags are set.
2032
   */
2033 2
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // The numeric post-processing below is applied to UTF-8 output only.
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // Collect every distinct character of three or more bytes and map it
    // to its numeric entity.
    $search = array();
    $replacements = array();
    foreach (self::chr_size_list($str) as $index => $size) {
      if ($size < 3) {
        continue;
      }

      $char = self::access($str, $index);
      if (isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $str);
  }
2061
2062
  /**
2063
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2064
   *
2065
   * INFO: Take a look at "UTF8::htmlentities()"
2066
   *
2067
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2068
   *
2069
   * @param string $str           <p>
2070
   *                              The string being converted.
2071
   *                              </p>
2072
   * @param int    $flags         [optional] <p>
2073
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2074
   *                              invalid code unit sequences and the used document type. The default is
2075
   *                              ENT_COMPAT | ENT_HTML401.
2076
   *                              <table>
2077
   *                              Available <i>flags</i> constants
2078
   *                              <tr valign="top">
2079
   *                              <td>Constant Name</td>
2080
   *                              <td>Description</td>
2081
   *                              </tr>
2082
   *                              <tr valign="top">
2083
   *                              <td><b>ENT_COMPAT</b></td>
2084
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2085
   *                              </tr>
2086
   *                              <tr valign="top">
2087
   *                              <td><b>ENT_QUOTES</b></td>
2088
   *                              <td>Will convert both double and single quotes.</td>
2089
   *                              </tr>
2090
   *                              <tr valign="top">
2091
   *                              <td><b>ENT_NOQUOTES</b></td>
2092
   *                              <td>Will leave both double and single quotes unconverted.</td>
2093
   *                              </tr>
2094
   *                              <tr valign="top">
2095
   *                              <td><b>ENT_IGNORE</b></td>
2096
   *                              <td>
2097
   *                              Silently discard invalid code unit sequences instead of returning
2098
   *                              an empty string. Using this flag is discouraged as it
2099
   *                              may have security implications.
2100
   *                              </td>
2101
   *                              </tr>
2102
   *                              <tr valign="top">
2103
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2104
   *                              <td>
2105
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2106
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2107
   *                              </td>
2108
   *                              </tr>
2109
   *                              <tr valign="top">
2110
   *                              <td><b>ENT_DISALLOWED</b></td>
2111
   *                              <td>
2112
   *                              Replace invalid code points for the given document type with a
2113
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2114
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2115
   *                              instance, to ensure the well-formedness of XML documents with
2116
   *                              embedded external content.
2117
   *                              </td>
2118
   *                              </tr>
2119
   *                              <tr valign="top">
2120
   *                              <td><b>ENT_HTML401</b></td>
2121
   *                              <td>
2122
   *                              Handle code as HTML 4.01.
2123
   *                              </td>
2124
   *                              </tr>
2125
   *                              <tr valign="top">
2126
   *                              <td><b>ENT_XML1</b></td>
2127
   *                              <td>
2128
   *                              Handle code as XML 1.
2129
   *                              </td>
2130
   *                              </tr>
2131
   *                              <tr valign="top">
2132
   *                              <td><b>ENT_XHTML</b></td>
2133
   *                              <td>
2134
   *                              Handle code as XHTML.
2135
   *                              </td>
2136
   *                              </tr>
2137
   *                              <tr valign="top">
2138
   *                              <td><b>ENT_HTML5</b></td>
2139
   *                              <td>
2140
   *                              Handle code as HTML 5.
2141
   *                              </td>
2142
   *                              </tr>
2143
   *                              </table>
2144
   *                              </p>
2145
   * @param string $encoding      [optional] <p>
2146
   *                              Defines encoding used in conversion.
2147
   *                              </p>
2148
   *                              <p>
2149
   *                              For the purposes of this function, the encodings
2150
   *                              ISO-8859-1, ISO-8859-15,
2151
   *                              UTF-8, cp866,
2152
   *                              cp1251, cp1252, and
2153
   *                              KOI8-R are effectively equivalent, provided the
2154
   *                              <i>string</i> itself is valid for the encoding, as
2155
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2156
   *                              the same positions in all of these encodings.
2157
   *                              </p>
2158
   * @param bool   $double_encode [optional] <p>
2159
   *                              When <i>double_encode</i> is turned off PHP will not
2160
   *                              encode existing html entities, the default is to convert everything.
2161
   *                              </p>
2162
   *
2163
   * @return string The converted string.
2164
   * </p>
2165
   * <p>
2166
   * If the input <i>string</i> contains an invalid code unit
2167
   * sequence within the given <i>encoding</i> an empty string
2168
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2169
   * <b>ENT_SUBSTITUTE</b> flags are set.
2170
   */
2171 1
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // Only a non-default encoding name needs to be normalised.
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return htmlspecialchars($str, $flags, $charset, $double_encode);
  }
2179
2180
  /**
2181
   * Checks whether iconv is available on the server.
2182
   *
2183
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2184
   */
2185 1
  public static function iconv_loaded()
  {
    // Reports whether the "iconv" extension is loaded, as a strict boolean.
    return (bool)extension_loaded('iconv');
  }
2189
2190
  /**
2191
   * Converts Integer to hexadecimal U+xxxx code point representation.
2192
   *
2193
   * INFO: opposite to UTF8::hex_to_int()
2194
   *
2195
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
2196
   * @param string $pfix [optional]
2197
   *
2198
   * @return string <p>The code point, or empty string on failure.</p>
2199
   */
2200 3
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // Only values whose string form consists solely of decimal digits
    // are accepted; everything else yields an empty string.
    if (!ctype_digit((string)$int)) {
      return '';
    }

    // Left-pad the hex representation with zeros to at least four digits.
    return $pfix . str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);
  }
2212
2213
  /**
2214
   * Checks whether intl-char is available on the server.
2215
   *
2216
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2217
   */
2218 1
  public static function intlChar_loaded()
  {
    // The PHP version check must pass before the class lookup is made.
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar') === true;
  }
2222
2223
  /**
2224
   * Checks whether intl is available on the server.
2225
   *
2226
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2227
   */
2228 3
  public static function intl_loaded()
  {
    // Reports whether the "intl" extension is loaded, as a strict boolean.
    return (bool)extension_loaded('intl');
  }
2232
2233
  /**
2234
   * alias for "UTF8::is_ascii()"
2235
   *
2236
   * @see UTF8::is_ascii()
2237
   *
2238
   * @param string $str
2239
   *
2240
   * @return boolean
2241
   */
2242 2
  public static function isAscii($str)
  {
    // camelCase alias: delegates to is_ascii() unchanged.
    $result = self::is_ascii($str);

    return $result;
  }
2246
2247
  /**
2248
   * alias for "UTF8::is_base64()"
2249
   *
2250
   * @see UTF8::is_base64()
2251
   *
2252
   * @param string $str
2253
   *
2254
   * @return bool
2255
   */
2256 1
  public static function isBase64($str)
  {
    // camelCase alias: delegates to is_base64() unchanged.
    $result = self::is_base64($str);

    return $result;
  }
2260
2261
  /**
2262
   * alias for "UTF8::is_binary()"
2263
   *
2264
   * @see UTF8::is_binary()
2265
   *
2266
   * @param string $str
2267
   *
2268
   * @return bool
2269
   */
2270
  public static function isBinary($str)
  {
    // camelCase alias: delegates to is_binary() unchanged.
    $result = self::is_binary($str);

    return $result;
  }
2274
2275
  /**
2276
   * alias for "UTF8::is_bom()"
2277
   *
2278
   * @see UTF8::is_bom()
2279
   *
2280
   * @param string $utf8_chr
2281
   *
2282
   * @return boolean
2283
   */
2284
  public static function isBom($utf8_chr)
  {
    // camelCase alias: delegates to is_bom() unchanged.
    $result = self::is_bom($utf8_chr);

    return $result;
  }
2288
2289
  /**
2290
   * alias for "UTF8::is_html()"
2291
   *
2292
   * @see UTF8::is_html()
2293
   *
2294
   * @param string $str
2295
   *
2296
   * @return boolean
2297
   */
2298 1
  public static function isHtml($str)
  {
    // camelCase alias: delegates to is_html() unchanged.
    $result = self::is_html($str);

    return $result;
  }
2302
2303
  /**
2304
   * alias for "UTF8::is_json()"
2305
   *
2306
   * @see UTF8::is_json()
2307
   *
2308
   * @param string $str
2309
   *
2310
   * @return bool
2311
   */
2312
  public static function isJson($str)
  {
    // camelCase alias: delegates to is_json() unchanged.
    $result = self::is_json($str);

    return $result;
  }
2316
2317
  /**
2318
   * alias for "UTF8::is_utf16()"
2319
   *
2320
   * @see UTF8::is_utf16()
2321
   *
2322
   * @param string $str
2323
   *
2324
   * @return int|false <p>false if it is not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.</p>
2325
   */
2326 1
  public static function isUtf16($str)
  {
    // camelCase alias: delegates to is_utf16() unchanged.
    $result = self::is_utf16($str);

    return $result;
  }
2330
2331
  /**
2332
   * alias for "UTF8::is_utf32()"
2333
   *
2334
   * @see UTF8::is_utf32()
2335
   *
2336
   * @param string $str
2337
   *
2338
   * @return int|false <p>false if it is not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.</p>
2339
   */
2340 1
  public static function isUtf32($str)
  {
    // camelCase alias: delegates to is_utf32() unchanged.
    $result = self::is_utf32($str);

    return $result;
  }
2344
2345
  /**
2346
   * alias for "UTF8::is_utf8()"
2347
   *
2348
   * @see UTF8::is_utf8()
2349
   *
2350
   * @param string $str
2351
   * @param bool   $strict
2352
   *
2353
   * @return bool
2354
   */
2355 16
  public static function isUtf8($str, $strict = false)
  {
    // camelCase alias: delegates to is_utf8() with both arguments unchanged.
    $result = self::is_utf8($str, $strict);

    return $result;
  }
2359
2360
  /**
   * Tells whether a string consists of 7 bit ASCII bytes only.
   *
   * @param string $str <p>The string to inspect.</p>
   *
   * @return bool <p>
   *              <strong>true</strong> if no byte in the range 0x80-0xFF was found<br />
   *              <strong>false</strong> otherwise
   *              </p>
   */
  public static function is_ascii($str)
  {
    // A single byte with the high bit set is enough to rule out pure ASCII.
    $hasHighByte = preg_match('/[\x80-\xFF]/', $str);

    return (bool)!$hasHighByte;
  }
2374
2375
  /**
   * Tells whether a string is base64 encoded.
   *
   * The input counts as base64 when strictly decoding it and encoding the
   * result again reproduces the input byte for byte.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is base64 encoded, <strong>false</strong> otherwise
   *              (an empty string is never considered base64).</p>
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $roundTrip = base64_encode(base64_decode($str, true));

    return $roundTrip === $str;
  }
2396
2397
  /**
   * Check if the input is binary... (this is a heuristic, not an exact test).
   *
   * The input is reported as binary when
   * - it consists of the characters "0" and "1" only, or
   * - it contains at least one NUL byte.
   *
   * Note: a third condition, "substr_count($input, '^ -~') / strlen($input) > 0.3",
   * used to be part of this check. '^ -~' is matched as a literal four byte
   * substring (not as a character class) and substr_count() does not count
   * overlapping matches, so the ratio could never exceed 0.25. The condition
   * was always false and has been removed without changing any result.
   *
   * @param mixed $input <p>The value to inspect, it is used as a string.</p>
   *
   * @return bool <p><strong>true</strong> if the input looks binary, <strong>false</strong> otherwise.</p>
   */
  public static function is_binary($input)
  {
    // A string made up of "0" / "1" only is treated as a textual bit-string.
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    // Any NUL byte marks the input as binary.
    return substr_count($input, "\x00") > 0;
  }
2421
2422
  /**
   * Check if the file is binary.
   *
   * Reads up to the first 512 bytes of the file and passes them to
   * "UTF8::is_binary()". If the file cannot be opened or read, an empty
   * string is checked instead.
   *
   * @param string $file <p>Path of the file to inspect.</p>
   *
   * @return boolean <p>The result of "UTF8::is_binary()" for the bytes that were read.</p>
   */
  public static function is_binary_file($file)
  {
    $block = '';

    try {
      // fopen() signals failure by returning false, it does not throw,
      // so the handle has to be checked before it is used.
      $fp = fopen($file, 'rb');
      if ($fp !== false) {
        $chunk = fread($fp, 512);
        fclose($fp);

        if ($chunk !== false) {
          $block = $chunk;
        }
      }
    } catch (\Exception $e) {
      // Kept for setups where an error handler turns PHP warnings into exceptions.
      $block = '';
    }

    return self::is_binary($block);
  }
2441
2442
  /**
   * Checks if the given string is identical to one of the "Byte Order Mark" entries
   * that are used as keys of "self::$bom".
   *
   * WARNING: Use "UTF8::string_has_bom()" if you want to check for a BOM inside a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is a Byte Order Mark, <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    // strict search, so the input has to match a key in type and value
    return in_array($str, array_keys(self::$bom), true);
  }
2461
2462
  /**
   * Check if the string contains any html-tags, e.g. "<lall>".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return boolean <p><strong>true</strong> if at least one tag was found, <strong>false</strong> otherwise
   *                 (also for an empty string).</p>
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    // opening, closing or self-closing tag, optionally with attributes
    $found = preg_match("/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/", $str);

    return $found === 1;
  }
2488
2489
  /**
   * Try to check if "$str" is a json-string.
   *
   * The string is decoded via "UTF8::json_decode()". It is accepted when the
   * decoder reports no error and the decoded value is an object or an array.
   * (Previously only objects were accepted, so valid JSON arrays like "[1,2]"
   * were rejected.) Scalars such as "123" or "\"abc\"" are still rejected.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str decodes to a JSON object or array, <strong>false</strong> otherwise
   *              (also for an empty string).</p>
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
2514
2515
  /**
   * Check if the string is UTF-16.
   *
   * After removing a BOM, the string is only examined further when
   * "UTF8::is_binary()" reports it as binary. For each byte order the string
   * is converted to UTF-8 and round-tripped once. If the round trip is
   * loss-free, the characters of the result are scored against
   * "UTF8::count_chars()" of the input. The byte order with the higher score
   * wins, a tie means "not UTF-16".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-16,<br />
   *                   <strong>1</strong> for UTF-16LE,<br />
   *                   <strong>2</strong> for UTF-16BE.
   *                   </p>
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // score per candidate byte order, LE is evaluated before BE
    $hits = array('UTF-16LE' => 0, 'UTF-16BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // UTF-8 -> candidate encoding -> UTF-8 has to reproduce the first conversion
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      // NOTE(review): in_array() searches the *values* returned by count_chars() - confirm this is intended.
      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    if ($hits['UTF-16LE'] === $hits['UTF-16BE']) {
      return false;
    }

    return $hits['UTF-16LE'] > $hits['UTF-16BE'] ? 1 : 2;
  }
2574
2575
  /**
   * Check if the string is UTF-32.
   *
   * After removing a BOM, the string is only examined further when
   * "UTF8::is_binary()" reports it as binary. For each byte order the string
   * is converted to UTF-8 and round-tripped once. If the round trip is
   * loss-free, the characters of the result are scored against
   * "UTF8::count_chars()" of the input. The byte order with the higher score
   * wins, a tie means "not UTF-32".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // score per candidate byte order, LE is evaluated before BE
    $hits = array('UTF-32LE' => 0, 'UTF-32BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // UTF-8 -> candidate encoding -> UTF-8 has to reproduce the first conversion
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      // NOTE(review): in_array() searches the *values* returned by count_chars() - confirm this is intended.
      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    if ($hits['UTF-32LE'] === $hits['UTF-32BE']) {
      return false;
    }

    return $hits['UTF-32LE'] > $hits['UTF-32BE'] ? 1 : 2;
  }
2634
2635
  /**
   * Checks whether the passed string contains only byte sequences that appear valid UTF-8 characters.
   *
   * An empty string is considered valid. A string that ends in the middle of
   * a multi-octet sequence (e.g. a lone "\xC3") is rejected. Overlong forms,
   * surrogates, 5 / 6 octet sequences and code points above 0x10FFFF are
   * rejected as well.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
   *
   * @return bool <p><strong>true</strong> if $str is valid UTF-8, <strong>false</strong> otherwise.</p>
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    // NOTE(review): the "/u" based check below is used when pcre_utf8_support()
    // does NOT report true, which looks inverted - confirm against pcre_utf8_support().
    if (self::pcre_utf8_support() !== true) {

      // If even just the first character can be matched, when the /u
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
      // invalid, nothing at all will match, even if the string contains
      // some valid sequences
      return (preg_match('/^.{1}/us', $str) === 1);
    }

    $mState = 0; // number of continuation octets that are still expected
    $mUcs4 = 0;  // Unicode code point assembled so far
    $mBytes = 1; // expected number of octets in the current sequence
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);

      if ($mState === 0) {
        // Start of a character: either US-ASCII or the first octet of a
        // multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          /* First octet of 5 octet sequence.
           *
           * This is illegal because the encoded codepoint must be either
           * (a) not the shortest form or
           * (b) outside the Unicode range of 0-0x10FFFF.
           * Rather than trying to resynchronize, we will carry on until the end
           * of the sequence and let the later error handling code catch it.
           */
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
          return false;
        }

        continue;
      }

      // Inside a multi-octet sequence only a continuation octet (10xxxxxx) is legal.
      if (0x80 !== (0xC0 & $in)) {
        return false;
      }

      $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);

      if (0 === --$mState) {
        // End of the multi-octet sequence: mUcs4 holds the final code point,
        // check for illegal sequences and code points.
        if (
            // From Unicode 3.1, non-shortest form is illegal
            (2 === $mBytes && $mUcs4 < 0x0080) ||
            (3 === $mBytes && $mUcs4 < 0x0800) ||
            (4 === $mBytes && $mUcs4 < 0x10000) ||
            (4 < $mBytes) ||
            // From Unicode 3.2, surrogate characters are illegal.
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)
        ) {
          return false;
        }

        // reset the cache for the next character
        $mUcs4 = 0;
        $mBytes = 1;
      }
    }

    // A non-zero state means the input ended inside a multi-octet sequence.
    return $mState === 0;
  }
2779
2780
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Decodes a JSON string, after it was passed through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>
   *                        The <i>json</i> string being decoded.
   *                        </p>
   *                        <p>
   *                        This function only works with UTF-8 encoded strings.
   *                        </p>
   *                        <p>PHP implements a superset of
   *                        JSON - it will also encode and decode scalar types and <b>NULL</b>. The JSON standard
   *                        only supports these values when they are nested inside an array or an object.
   *                        </p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into
   *                        associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>
   *                        User specified recursion depth.
   *                        </p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options. Currently only
   *                        <b>JSON_BIGINT_AS_STRING</b>
   *                        is supported (default is to cast large integers as floats).
   *                        It is only forwarded when "Bootup::is_php('5.4')" reports <b>TRUE</b>.
   *                        </p>
   *
   * @return mixed the value encoded in <i>json</i> in appropriate
   * PHP type. Values true, false and
   * null (case-insensitive) are returned as <b>TRUE</b>, <b>FALSE</b>
   * and <b>NULL</b> respectively. <b>NULL</b> is returned if the
   * <i>json</i> cannot be decoded or if the encoded
   * data is deeper than the recursion limit.
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    if (Bootup::is_php('5.4') === true) {
      return json_decode($filtered, $assoc, $depth, $options);
    }

    // fall back to the three argument call, "$options" is ignored here
    return json_decode($filtered, $assoc, $depth);
  }
2828
2829
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Returns the JSON representation of a value, after it was passed through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The <i>value</i> being encoded. Can be any type except
   *                       a resource.
   *                       </p>
   *                       <p>
   *                       All string data must be UTF-8 encoded.
   *                       </p>
   *                       <p>PHP implements a superset of
   *                       JSON - it will also encode and decode scalar types and <b>NULL</b>. The JSON standard
   *                       only supports these values when they are nested inside an array or an object.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask consisting of <b>JSON_HEX_QUOT</b>,
   *                       <b>JSON_HEX_TAG</b>,
   *                       <b>JSON_HEX_AMP</b>,
   *                       <b>JSON_HEX_APOS</b>,
   *                       <b>JSON_NUMERIC_CHECK</b>,
   *                       <b>JSON_PRETTY_PRINT</b>,
   *                       <b>JSON_UNESCAPED_SLASHES</b>,
   *                       <b>JSON_FORCE_OBJECT</b>,
   *                       <b>JSON_UNESCAPED_UNICODE</b>. The behaviour of these
   *                       constants is described on
   *                       the JSON constants page.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Set the maximum depth. Must be greater than zero.
   *                       It is only forwarded when "Bootup::is_php('5.5')" reports a truthy value.
   *                       </p>
   *
   * @return string a JSON encoded string on success or <b>FALSE</b> on failure.
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    if (Bootup::is_php('5.5')) {
      return json_encode($filtered, $options, $depth);
    }

    // fall back to the two argument call, "$depth" is ignored here
    return json_encode($filtered, $options);
  }
2877
2878
  /**
   * Makes string's first char lowercase.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function lcfirst($str)
  {
    // Static analysis reports that UTF8::substr() can also return false,
    // so both parts are normalised to a string before they are used.
    $firstChar = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtolower($firstChar) . $rest;
  }
2889
2890
  /**
   * Strip whitespace or other characters from beginning of a UTF-8 string.
   *
   * Without usable "$chars" (INF or any falsy value), all leading characters
   * of the Unicode categories "Z" (separators) and "C" (control / other) are
   * removed. Otherwise "$chars" is turned into a regex character class via
   * "self::rxClass()" and leading characters of that class are removed.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars <p>Optional characters to be stripped</p>
   *
   * @return string <p>The string with unwanted characters stripped from the left.</p>
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    // NOTE(review): "!$chars" is also true for the string '0', so a lone "0"
    // cannot be used as the character to strip - confirm whether that is intended.
    if ($chars === INF || !$chars) {
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
    }

    // "$chars" can no longer be INF here, so it always goes through rxClass().
    $chars = self::rxClass($chars);

    return preg_replace("/^{$chars}+/u", '', $str);
  }
2915
2916
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings
   *                   (an array is concatenated before it is examined).</p>
   *
   * @return string <p>The character with the highest code point, or an empty string
   *                if no code points were found (e.g. for empty input).</p>
   */
  public static function max($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // max() must not be called with an empty array (warning / ValueError in PHP 8)
    if (empty($codepoints)) {
      return '';
    }

    return self::chr(max($codepoints));
  }
2931
2932
  /**
   * Determines the largest value in the list returned by "UTF8::chr_size_list()"
   * for the given string, i.e. the maximum number of bytes taken by any of its
   * UTF-8 encoded characters.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>Max byte length of the given chars, 0 if the size list is empty.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    if (count($sizes) <= 0) {
      return 0;
    }

    return (int)max($sizes);
  }
2949
2950
  /**
   * Checks whether mbstring is available on the server.
   *
   * Side effect: when the extension is loaded, the internal mbstring encoding
   * is set to "UTF-8".
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function mbstring_loaded()
  {
    if (extension_loaded('mbstring') !== true) {
      return false;
    }

    \mb_internal_encoding('UTF-8');

    return true;
  }
2965
2966
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings
   *                   (an array is concatenated before it is examined).</p>
   *
   * @return string <p>The character with the lowest code point, or an empty string
   *                if no code points were found (e.g. for empty input).</p>
   */
  public static function min($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // min() must not be called with an empty array (warning / ValueError in PHP 8)
    if (empty($codepoints)) {
      return '';
    }

    return self::chr(min($codepoints));
  }
2981
2982
  /**
   * CamelCase alias of "UTF8::normalize_encoding()".
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>The value that is handed to "UTF8::normalize_encoding()" unchanged.</p>
   *
   * @return string <p>The result of "UTF8::normalize_encoding()" for the given input.</p>
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
2995
2996
  /**
2997
   * Normalize the encoding-"name" input.
2998
   *
2999
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
3000
   *
3001
   * @return string <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.</p>
3002
   */
3003 4
  public static function normalize_encoding($encoding)
3004
  {
3005 4
    static $staticNormalizeEncodingCache = array();
3006
3007 4
    if (!$encoding) {
3008 1
      return false;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return false; (false) is incompatible with the return type documented by voku\helper\UTF8::normalize_encoding of type string.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
3009
    }
3010
3011 4
    if ('UTF-8' === $encoding) {
3012 1
      return $encoding;
3013
    }
3014
3015 4
    if (in_array($encoding, self::$iconvEncoding, true)) {
3016 3
      return $encoding;
3017
    }
3018
3019 3
    if (isset($staticNormalizeEncodingCache[$encoding])) {
3020 3
      return $staticNormalizeEncodingCache[$encoding];
3021
    }
3022
3023 2
    $encodingOrig = $encoding;
3024 2
    $encoding = strtoupper($encoding);
3025 2
    $encodingUpperHelper = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
3026
3027
    $equivalences = array(
3028 2
        'ISO88591'    => 'ISO-8859-1',
3029 2
        'ISO8859'     => 'ISO-8859-1',
3030 2
        'ISO'         => 'ISO-8859-1',
3031 2
        'LATIN1'      => 'ISO-8859-1',
3032 2
        'LATIN'       => 'ISO-8859-1',
3033 2
        'WIN1252'     => 'ISO-8859-1',
3034 2
        'WINDOWS1252' => 'ISO-8859-1',
3035 2
        'UTF16'       => 'UTF-16',
3036 2
        'UTF32'       => 'UTF-32',
3037 2
        'UTF8'        => 'UTF-8',
3038 2
        'UTF'         => 'UTF-8',
3039 2
        'UTF7'        => 'UTF-7',
3040 2
        '8BIT'        => 'CP850',
3041 2
        'BINARY'      => 'CP850',
3042 2
    );
3043
3044 2
    if (!empty($equivalences[$encodingUpperHelper])) {
3045 2
      $encoding = $equivalences[$encodingUpperHelper];
3046 2
    }
3047
3048 2
    $staticNormalizeEncodingCache[$encodingOrig] = $encoding;
3049
3050 2
    return $encoding;
3051
  }
3052
3053
  /**
3054
   * Normalize some MS Word special characters.
3055
   *
3056
   * @param string $str <p>The string to be normalized.</p>
3057
   *
3058
   * @return string
3059
   */
3060 2
  public static function normalize_msword($str)
  {
    // search / replacement lists are derived from self::$utf8MSWord on the first call only
    static $search = null;
    static $replace = null;

    if (null === $search) {
      $search = array_keys(self::$utf8MSWord);
      $replace = array_values(self::$utf8MSWord);
    }

    $normalized = str_replace($search, $replace, $str);

    return $normalized;
  }
3072
3073
  /**
3074
   * Normalize the whitespace.
3075
   *
3076
   * @param string $str                     <p>The string to be normalized.</p>
3077
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
3078
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
3079
   *                                        bidirectional text chars.</p>
3080
   *
3081
   * @return string
3082
   */
3083 7
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    // per-variant list of whitespace strings, built lazily: slot 0 = replace NBSP, slot 1 = keep NBSP
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // Only a strict `true` keeps the non-breaking space. The cache slot is derived from that very
    // same test, so a truthy non-boolean (e.g. 1) can no longer fill the "keep" slot with the
    // full table and break later calls that pass a real `true`.
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = (int)$keepNbsp;

    if (!isset($whitespaces[$cacheKey])) {
      $table = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    // bidirectional control characters are removed (not replaced by a space) unless asked to keep them
    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    // every remaining whitespace variant becomes a plain space
    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3112
3113
  /**
   * Format a number with grouped thousands.
   *
   * @param float  $number        <p>The number being formatted.</p>
   * @param int    $decimals      [optional] <p>Number of decimal digits.</p>
   * @param string $dec_point     [optional] <p>Separator for the decimal point.</p>
   * @param string $thousands_sep [optional] <p>Separator for the thousands groups.</p>
   *
   * @return string
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    // Both separators are longer than one byte: format with the plain ASCII separators first
    // and swap them afterwards.
    // NOTE(review): the version gate is taken over unchanged; confirm against Bootup::is_php()
    // that ">= 5.4" is really the intended condition for this workaround.
    if (
        isset($thousands_sep[1], $dec_point[1])
        &&
        Bootup::is_php('5.4') === true
    ) {
      // strtr() with an array replaces both separators in a single pass, so a "," that is part
      // of $dec_point is not replaced a second time (sequential str_replace() did that).
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3150
3151
  /**
3152
   * Calculates Unicode code point of the given UTF-8 encoded character.
3153
   *
3154
   * INFO: opposite to UTF8::chr()
3155
   *
3156
   * @param string $chr <p>The character of which to calculate code point.</p>
3157
   *
3158
   * @return int <p>
3159
   *             Unicode code point of the given character,<br />
3160
   *             0 on invalid UTF-8 byte sequence.
3161
   *             </p>
3162
   */
3163 17
  public static function ord($chr)
  {
    // every falsy input except the literal "0" yields code point 0
    if (!$chr && $chr !== '0') {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // use IntlChar when flagged as available; a falsy answer falls through to the manual decoder
    if (self::$support['intlChar'] === true) {
      $codePoint = \IntlChar::ord($chr);
      if ($codePoint) {
        return $codePoint;
      }
    }

    // 1-based list with the values of (at most) the first four bytes
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    // four-byte sequence
    if ($lead >= 0xF0 && isset($bytes[4])) {
      return (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    }

    // three-byte sequence
    if ($lead >= 0xE0 && isset($bytes[3])) {
      return (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    }

    // two-byte sequence
    if ($lead >= 0xC0 && isset($bytes[2])) {
      return (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    }

    // single byte (or a lead byte without enough following bytes): the byte value itself
    return $lead;
  }
3197
3198
  /**
3199
   * Parses the string into an array (into the second parameter).
3200
   *
3201
   * WARNING: Unlike "parse_str()", this method does not (re-)place variables in the current scope,
3202
   *          if the second parameter is not set!
3203
   *
3204
   * @link http://php.net/manual/en/function.parse-str.php
3205
   *
3206
   * @param string $str    <p>The input string.</p>
3207
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
3208
   *
3209
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string and we haven't any $result.</p>
3210
   */
3211 1
  public static function parse_str($str, &$result)
  {
    // run the input through self::clean() before parsing
    $cleaned = self::clean($str);

    $parsed = \mb_parse_str($cleaned, $result);

    // success means: parsing did not fail AND it produced at least one value
    return $parsed !== false && !empty($result);
  }
3223
3224
  /**
3225
   * Checks if the /u modifier is available that enables Unicode support in PCRE.
3226
   *
3227
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
3228
   */
3229 41
  public static function pcre_utf8_support()
  {
    // probe: run an empty pattern with the "u" modifier; warnings are silenced and any
    // falsy result (0 or false) is reported as "not supported"
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $probe = @preg_match('//u', '');

    return (bool)$probe;
  }
3234
3235
  /**
3236
   * Create an array containing a range of UTF-8 characters.
3237
   *
3238
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
3239
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
3240
   *
3241
   * @return array
3242
   */
3243 1
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    // Both bounds are resolved by the same rules, start first:
    //   decimal digits -> taken as the code point itself
    //   hex digits     -> converted via self::hex_to_int()
    //   anything else  -> treated as a character, resolved via self::ord()
    // The first bound that does not yield a non-zero code point ends the call with an empty array.
    $codePoints = array();
    foreach (array($var1, $var2) as $bound) {
      if (ctype_digit((string)$bound)) {
        $codePoint = (int)$bound;
      } elseif (ctype_xdigit($bound)) {
        $codePoint = (int)self::hex_to_int($bound);
      } else {
        $codePoint = self::ord($bound);
      }

      if (!$codePoint) {
        return array();
      }

      $codePoints[] = $codePoint;
    }

    // native range() builds the list of code points, UTF8::chr() turns each into a character
    return array_map(
        array(
            '\\voku\\helper\\UTF8',
            'chr',
        ),
        range($codePoints[0], $codePoints[1])
    );
  }
3281
3282
  /**
3283
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
3284
   *
3285
   * @param string $str <p>The input string.</p>
3286
   *
3287
   * @return string <p>String without UTF-BOM</p>
3288
   */
3289 10
  public static function remove_bom($str)
  {
    // self::$bom maps each BOM byte string to its length in bytes
    foreach (self::$bom as $marker => $markerByteLength) {
      // only a marker at byte offset 0 is stripped
      if (strpos($str, $marker) !== 0) {
        continue;
      }

      $str = substr($str, $markerByteLength);
    }

    return $str;
  }
3299
3300
  /**
3301
   * alias for "UTF8::remove_bom()"
3302
   *
3303
   * @see UTF8::remove_bom()
3304
   *
3305
   * @param string $str
3306
   *
3307
   * @return string
3308
   */
3309 5
  public static function removeBOM($str)
  {
    // camelCase alias: all of the work happens in remove_bom()
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
3313
3314
  /**
3315
   * Removes duplicate occurrences of a string in another string.
3316
   *
3317
   * @param string          $str  <p>The base string.</p>
3318
   * @param string|string[] $what <p>String to search for in the base string.</p>
3319
   *
3320
   * @return string <p>The result string with removed duplicates.</p>
3321
   */
3322 1
  public static function remove_duplicates($str, $what = ' ')
  {
    // a single needle is handled like a one-element list
    $needles = is_string($what) ? array($what) : $what;

    // anything that is neither a string nor an array leaves the input untouched
    if (!is_array($needles)) {
      return $str;
    }

    foreach ($needles as $needle) {
      // collapse every run of consecutive occurrences into a single occurrence
      $pattern = '/(' . preg_quote($needle, '/') . ')+/';

      // The needle is inserted literally: "$" and "\" are escaped so that a needle such as
      // "$0" or "\1" is not interpreted as a back-reference by preg_replace().
      $replacement = addcslashes($needle, '\\$');

      $str = preg_replace($pattern, $replacement, $str);
    }

    return $str;
  }
3336
3337
  /**
3338
   * Remove invisible characters from a string.
3339
   *
3340
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
3341
   *
3342
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
3343
   *
3344
   * @param string $str
3345
   * @param bool   $url_encoded
3346
   * @param string $replacement
3347
   *
3348
   * @return string
3349
   */
3350 43
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // every control character except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09)
    if ($url_encoded) {
      // The hex digits of a percent-escape may be written in either case ("%0b" == "%0B"),
      // therefore these patterns have to be case-insensitive.
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
      $non_displayables[] = '/%7f/i'; // url encoded 127
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // repeat until a pass changes nothing: removing one sequence can join the text around it
    // into a new removable sequence
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3370
3371
  /**
3372
   * Replace the diamond question mark (�) with the replacement.
3373
   *
3374
   * @param string $str
3375
   * @param string $unknown
3376
   *
3377
   * @return string
3378
   */
3379 43
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // the replacement character, once written as escaped bytes and once as a literal
    $needles = array(
        "\xEF\xBF\xBD",
        '�',
    );

    // both needles are swapped for the same substitute
    $substitutes = array(
        $unknown,
        $unknown,
    );

    return str_replace($needles, $substitutes, $str);
  }
3393
3394
  /**
3395
   * Strip whitespace or other characters from end of a UTF-8 string.
3396
   *
3397
   * @param string $str   <p>The string to be trimmed.</p>
3398
   * @param string $chars <p>Optional characters to be stripped.</p>
3399
   *
3400
   * @return string <p>The string with unwanted characters stripped from the right.</p>
3401
   */
3402 23 View Code Duplication
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // No character list given: strip trailing characters of the Unicode categories
    // "separator" (\pZ) and "other" (\pC).
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    //
    // The string "0" is falsy in PHP but is a legitimate character list, so it is excluded
    // from the "nothing given" test and handled like any other list below.
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    // turn the character list into a regex fragment and strip matches anchored at the end
    $charClass = self::rxClass($chars);

    return preg_replace("/{$charClass}+$/u", '', $str);
  }
3419
3420
  /**
3421
   * rxClass
3422
   *
3423
   * @param string $s
3424
   * @param string $class
3425
   *
3426
   * @return string
3427
   */
3428 45
  private static function rxClass($s, $class = '')
  {
    // results are memoized per ($s . $class)
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // slot 0 collects the content of the bracket expression,
    // further slots hold pieces that are added as separate alternatives
    $parts = array($class);

    foreach (self::str_split($s) as $piece) {
      if ('-' === $piece) {
        // a dash goes to the very front of the bracket content
        $parts[0] = '-' . $parts[0];
      } elseif (!isset($piece[2])) {
        // at most two bytes long: quoted for use inside a "/"-delimited pattern
        $parts[0] .= preg_quote($piece, '/');
      } elseif (1 === self::strlen($piece)) {
        // longer in bytes, but a single character according to self::strlen()
        $parts[0] .= $piece;
      } else {
        $parts[] = $piece;
      }
    }

    if ($parts[0]) {
      $parts[0] = '[' . $parts[0] . ']';
    }

    $rx = (1 === count($parts))
        ? $parts[0]
        : '(?:' . implode('|', $parts) . ')';

    $rxClassCache[$cacheKey] = $rx;

    return $rx;
  }
3467
3468
  /**
3469
   * WARNING: Echo native UTF8-Support libs, e.g. for debugging.
3470
   */
3471
  public static function showSupport()
  {
    // print every entry of self::$support, each followed by a newline and a <br>
    foreach (self::$support as $supportEntry) {
      echo $supportEntry, "\n<br>";
    }
  }
3477
3478
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars [optional] <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, or an empty string for empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // Empty input gives an empty result. The character "0" is falsy in PHP but is a real
    // character, so it must not be swallowed here.
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3502
3503
  /**
3504
   * Convert a string to an array of Unicode characters.
3505
   *
3506
   * @param string  $str       <p>The string to split into array.</p>
3507
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
3508
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
3509
   *
3510
   * @return string[] <p>An array containing chunks of the string.</p>
3511
   */
3512 35
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    // init
    $chars = array();

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      // "u": treat the subject as UTF-8, "s": let the dot match newlines as well
      preg_match_all('/./us', $str, $matches);
      if (isset($matches[0])) {
        $chars = $matches[0];
      }
      unset($matches);

    } else {

      // fallback: walk the bytes and group them by the bit pattern of the lead byte;
      // a lead byte whose continuation bytes do not look like 10xxxxxx is skipped

      $byteCount = strlen($str);

      /** @noinspection ForeachInvariantsInspection */
      for ($pos = 0; $pos < $byteCount; $pos++) {
        if (($str[$pos] & "\x80") === "\x00") {
          // 0xxxxxxx: single byte
          $chars[] = $str[$pos];
        } elseif ((($str[$pos] & "\xE0") === "\xC0") && isset($str[$pos + 1])) {
          // 110xxxxx: two bytes
          if (($str[$pos + 1] & "\xC0") === "\x80") {
            $chars[] = $str[$pos] . $str[$pos + 1];

            $pos++;
          }
        } elseif ((($str[$pos] & "\xF0") === "\xE0") && isset($str[$pos + 2])) {
          // 1110xxxx: three bytes
          if ((($str[$pos + 1] & "\xC0") === "\x80") && (($str[$pos + 2] & "\xC0") === "\x80")) {
            $chars[] = $str[$pos] . $str[$pos + 1] . $str[$pos + 2];

            $pos += 2;
          }
        } elseif ((($str[$pos] & "\xF8") === "\xF0") && isset($str[$pos + 3])) {
          // 11110xxx: four bytes
          if ((($str[$pos + 1] & "\xC0") === "\x80") && (($str[$pos + 2] & "\xC0") === "\x80") && (($str[$pos + 3] & "\xC0") === "\x80")) {
            $chars[] = $str[$pos] . $str[$pos + 1] . $str[$pos + 2] . $str[$pos + 3];

            $pos += 3;
          }
        }
      }
    }

    // group the single characters into chunks of $length characters
    if ($length > 1) {
      $chars = array_map('implode', array_chunk($chars, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($chars[0]) && $chars[0] === '') {
      return array();
    }

    return $chars;
  }
3585
3586
  /**
3587
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
3588
   *
3589
   * @param string $str <p>The input string.</p>
3590
   *
3591
   * @return false|string <p>
3592
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
3593
   *                      otherwise it will return false.
3594
   *                      </p>
3595
   */
3596 12
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      // each detector is evaluated once; the UTF-32 check still only runs when the
      // UTF-16 check did not already decide (1 => little endian, 2 => big endian)
      $utf16 = self::is_utf16($str);
      if ($utf16 === 1) {
        return 'UTF-16LE';
      }
      if ($utf16 === 2) {
        return 'UTF-16BE';
      }

      $utf32 = self::is_utf32($str);
      if ($utf32 === 1) {
        return 'UTF-32LE';
      }
      if ($utf32 === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $encoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($encoding) {
      return $encoding;
    }

    //
    // 5.) check via "iconv()"
    //
    // An encoding is accepted when converting the string from that encoding to itself
    // gives back the very same bytes (compared via md5).

    $md5 = md5($str);
    foreach (self::$iconvEncoding as $encodingTmp) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      if (md5(@iconv($encodingTmp, $encodingTmp, $str)) === $md5) {
        return $encodingTmp;
      }
    }

    return false;
  }
3678
3679
  /**
3680
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
3681
   *
3682
   * @link  http://php.net/manual/en/function.str-ireplace.php
3683
   *
3684
   * @param mixed $search  <p>
3685
   *                       Every replacement with search array is
3686
   *                       performed on the result of previous replacement.
3687
   *                       </p>
3688
   * @param mixed $replace <p>
3689
   *                       </p>
3690
   * @param mixed $subject <p>
3691
   *                       If subject is an array, then the search and
3692
   *                       replace is performed with every entry of
3693
   *                       subject, and the return value is an array as
3694
   *                       well.
3695
   *                       </p>
3696
   * @param int   $count   [optional] <p>
3697
   *                       The number of matched and replaced needles will
3698
   *                       be returned in count which is passed by
3699
   *                       reference.
3700
   *                       </p>
3701
   *
3702
   * @return mixed <p>A string or an array of replacements.</p>
3703
   */
3704 13
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // Build one case-insensitive UTF-8 pattern per needle (no by-reference loop variable).
    $patterns = array();
    foreach ((array)$search as $key => $needle) {
      $needle = (string)$needle;

      if ('' === $needle) {
        // an empty needle must never match: this pattern can not match any subject
        $patterns[$key] = '/^(?<=.)$/';
      } else {
        $patterns[$key] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // preg_replace() treats "$n" and "\n" inside the replacement as back-references.
    // A str_replace()-style API has to insert the replacement literally, so "$" and "\"
    // are escaped first.
    if (is_array($replace)) {
      $replacement = array();
      foreach ($replace as $key => $value) {
        $replacement[$key] = addcslashes((string)$value, '\\$');
      }
    } else {
      $replacement = addcslashes((string)$replace, '\\$');
    }

    $subject = preg_replace($patterns, $replacement, $subject, -1, $replaced);
    $count = $replaced; // used as reference parameter

    return $subject;
  }
3722
3723
  /**
3724
   * Limit the number of characters in a string, but also after the next word.
3725
   *
3726
   * @param string $str
3727
   * @param int    $length
3728
   * @param string $strAddOn
3729
   *
3730
   * @return string
3731
   */
3732 1
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    // short enough already: returned as it is, without the add-on
    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit falls directly behind a word: cut right before that space
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    // otherwise cut at the limit and drop the last (possibly incomplete) word
    $truncated = self::substr($str, 0, $length);
    $words = explode(' ', $truncated);
    array_pop($words);
    $withoutLastWord = implode(' ', $words);

    if ($withoutLastWord !== '') {
      return $withoutLastWord . $strAddOn;
    }

    // there was no space at all within the limit: hard cut instead
    return self::substr($truncated, 0, $length - 1) . $strAddOn;
  }
3763
3764
  /**
3765
   * Pad a UTF-8 string to given length with another string.
3766
   *
3767
   * @param string $str        <p>The input string.</p>
3768
   * @param int    $pad_length <p>The length of return string.</p>
3769
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
3770
   * @param int    $pad_type   [optional] <p>
3771
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
3772
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
3773
   *                           </p>
3774
   *
3775
   * @return string <strong>Returns the padded string</strong>
3776
   */
3777 2
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    // nothing to do unless a positive integer length of at least the current length is requested
    if (!is_int($pad_length) || $pad_length <= 0 || $pad_length < $str_length) {
      return $str;
    }

    $ps_length = self::strlen($pad_string);

    // an empty pad string can not fill anything (and would lead to a division by zero below)
    if (!$ps_length) {
      return $str;
    }

    // number of characters that have to be added
    $diff = $pad_length - $str_length;

    switch ($pad_type) {
      case STR_PAD_LEFT:
        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $pre = self::substr($pre, 0, $diff);
        $post = '';
        break;

      case STR_PAD_BOTH:
        // For an odd difference the left side gets the smaller half. The halves are computed
        // as integers first: "(int)$diff / 2" casts before dividing and yields a float.
        $leftLength = (int)floor($diff / 2);
        $rightLength = $diff - $leftLength;

        $repeats = (int)ceil($diff / $ps_length / 2);

        $pre = self::substr(str_repeat($pad_string, $repeats), 0, $leftLength);
        $post = self::substr(str_repeat($pad_string, $repeats), 0, $rightLength);
        break;

      case STR_PAD_RIGHT:
      default:
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $post = self::substr($post, 0, $diff);
        $pre = '';
    }

    return $pre . $str . $post;
  }
3812
3813
  /**
   * Repeat a string.
   *
   * The input is first passed through UTF8::filter(), then handed to the native str_repeat().
   *
   * @param string $str        <p>The string to be repeated.</p>
   * @param int    $multiplier <p>
   *                           How many times the input string should be repeated.<br />
   *                           Has to be greater than or equal to 0; with 0 an empty string is returned.
   *                           </p>
   *
   * @return string <p>The repeated string.</p>
   */
  public static function str_repeat($str, $multiplier)
  {
    return str_repeat(self::filter($str), $multiplier);
  }
3837
3838
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * INFO: thin wrapper around the native "str_replace()", which is already UTF-8 safe.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>The needle; an array may be used to designate multiple needles.</p>
   * @param mixed $replace <p>The replacement; an array may be used to designate multiple replacements.</p>
   * @param mixed $subject <p>
   *                       The haystack (string or array). If it is an array, every entry is processed
   *                       and an array is returned.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed <p>A string or an array with the replaced values.</p>
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    $replaced = str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
3871
3872
  /**
   * Shuffles all the characters in the string.
   *
   * The string is broken up via UTF8::split(), the pieces are shuffled and glued back together.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The shuffled string.</p>
   */
  public static function str_shuffle($str)
  {
    $chars = self::split($str);
    shuffle($chars);

    return implode('', $chars);
  }
3887
3888
  /**
   * Sort all characters according to code points.
   *
   * @param string $str    <p>A UTF-8 string.</p>
   * @param bool   $unique <p>Sort unique. If <strong>true</strong>, repeated characters are ignored.</p>
   * @param bool   $desc   <p>If <strong>true</strong>, will sort characters in reverse code point order.</p>
   *
   * @return string <p>String of sorted characters.</p>
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    if ($unique) {
      // flipping twice collapses duplicate values into a single entry
      $codePoints = array_flip(array_flip($codePoints));
    }

    $desc ? arsort($codePoints) : asort($codePoints);

    // turn the ordered code points back into a string
    return self::string($codePoints);
  }
3913
3914
  /**
   * Split a string into an array of grapheme clusters (or groups of them).
   *
   * @param string $str <p>The input string.</p>
   * @param int    $len <p>Number of grapheme clusters per array element.</p>
   *
   * @return array <p>
   *               The pieces of the string. For a length below 1 the call is delegated
   *               to the native str_split(), which reports the invalid length itself.
   *               </p>
   */
  public static function str_split($str, $len = 1)
  {
    $len = (int)$len;

    if ($len < 1) {
      // let the native function deal with (and complain about) the invalid length
      return str_split($str, $len);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $graphemes = $matches[0];

    if ($len === 1) {
      return $graphemes;
    }

    // glue every run of $len consecutive grapheme clusters into one element
    $pieces = array();
    foreach (array_chunk($graphemes, $len) as $group) {
      $pieces[] = implode('', $group);
    }

    return $pieces;
  }
3952
3953
  /**
   * Get a binary representation of a specific string.
   *
   * Every byte of the input is rendered as 8 binary digits and the digits are concatenated.
   * Leading zeros of the overall result are dropped and an empty (or all-zero) input yields "0",
   * which matches what the former base_convert()-based implementation produced for short inputs.
   * Unlike base_convert(), this does not go through a float, so longer strings keep all their bits.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>A string consisting of "0" and "1" characters.</p>
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    $bits = '';
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; ++$i) {
      $bits .= sprintf('%08b', ord($str[$i]));
    }

    // keep the historical output format: no leading zeros, "0" for an empty result
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
3968
3969
  /**
   * alias for "UTF8::to_ascii()"
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown <p>Passed through unchanged as second argument of UTF8::to_ascii().</p>
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?')
  {
    $ascii = self::to_ascii($str, $unknown);

    return $ascii;
  }
3983
3984
  /**
   * Counts number of words in the UTF-8 string.
   *
   * A word is a run of word characters (letters plus $charlist); dashes, "'" and "’"
   * between two such runs are treated as part of the same word.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $format   [optional] <p>
   *                         <strong>0</strong> => return a number of words (default)<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with word-offset as key
   *                         </p>
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word.</p>
   *
   * @return array|int <p>The number of words in the string, or the words themselves (see $format).</p>
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $wordClass = self::rxClass($charlist, '\pL');
    $pattern = "/({$wordClass}+(?:[\p{Pd}’']{$wordClass}+)*)/u";

    // odd indexes hold the words, even indexes hold the text in between
    $parts = \preg_split($pattern, $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    $partCount = count($parts);

    if ($format === 1) {
      $words = array();
      for ($i = 1; $i < $partCount; $i += 2) {
        $words[] = $parts[$i];
      }

      return $words;
    }

    if ($format === 2) {
      $words = array();
      // the first word starts after the leading non-word text
      $position = self::strlen($parts[0]);
      for ($i = 1; $i < $partCount; $i += 2) {
        $words[$position] = $parts[$i];
        $position += self::strlen($parts[$i]) + self::strlen($parts[$i + 1]);
      }

      return $words;
    }

    return ($partCount - 1) / 2;
  }
4029
4030
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Case-insensitive version of UTF8::strcmp(); both inputs are run
   *       through UTF8::strtocasefold() before they are compared.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <p>
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   *             </p>
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4048
4049
  /**
   * Case-sensitive string comparison.
   *
   * Inputs that are identical as strings short-circuit to 0; otherwise both
   * are normalized to NFD and compared with the native strcmp().
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int  <p>
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   *              </p>
   */
  public static function strcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    $normalized1 = \Normalizer::normalize($str1, \Normalizer::NFD);
    $normalized2 = \Normalizer::normalize($str2, \Normalizer::NFD);

    return strcmp($normalized1, $normalized2);
  }
4068
4069
  /**
   * Find length of initial segment not matching mask.
   *
   * @param string $str      <p>The string to examine.</p>
   * @param string $charList <p>The characters that end the initial segment.</p>
   * @param int    $offset   <p>Start position, applied via UTF8::substr().</p>
   * @param int    $length   <p>Length of the examined part, applied via UTF8::substr().</p>
   *
   * @return int|null <p>
   *                  Length of the segment before the first character from $charList
   *                  (the whole length if none occurs), or null if $charList is empty.
   *                  </p>
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList = (string)$charList;
    if ($charList === '') {
      return null;
    }

    // only cut the string when a non-default window was requested
    $needsWindow = $offset || 2147483647 !== $length;
    $str = $needsWindow ? (string)self::substr($str, $offset, $length) : (string)$str;

    if (!preg_match('/^(.*?)' . self::rxClass($charList) . '/us', $str, $match)) {
      // no character of the list found => the whole string is the segment
      return self::strlen($str);
    }

    return self::strlen($match[1]);
  }
4098
4099
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
   *
   * @return string <p>UTF-8 encoded string (each entry converted via UTF8::chr() and concatenated).</p>
   */
  public static function string(array $array)
  {
    $result = '';

    foreach ($array as $codePoint) {
      $result .= self::chr($codePoint);
    }

    return $result;
  }
4120
4121
  /**
   * alias for "UTF8::string_has_bom()"
   *
   * @see UTF8::string_has_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function hasBom($str)
  {
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
4134
4135
  /**
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
   *
   * The candidates are the keys of the class-level $bom table.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function string_has_bom($str)
  {
    foreach (array_keys(self::$bom) as $bomString) {
      // a BOM only counts when it sits at the very beginning
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4152
4153
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * The input goes through UTF8::clean() first and is then handed to the native strip_tags().
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>The input string.</p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped.<br />
   *                               HTML comments and PHP tags are also stripped. This is hardcoded and
   *                               can not be changed with allowable_tags.
   *                               </p>
   *
   * @return string <p>The stripped string.</p>
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    return strip_tags(self::clean($str), $allowable_tags);
  }
4179
4180
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>The string from which to get the position of the first occurrence of needle.</p>
   * @param string  $needle    <p>The string to find in haystack.</p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack to start searching;
   *                           <strong>null</strong> (the default) is treated as 0.
   *                           </p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found (or if haystack / needle is empty).
   *                   </p>
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // mb_stripos() expects an integer offset; passing the default null through
    // is deprecated on newer PHP versions, so normalize it here (null => 0)
    return \mb_stripos($haystack, $needle, (int)$offset, $encoding);
  }
4230
4231
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (case-insensitive, delegated to mb_stristr()).
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, the part of the haystack before the first
   *                              occurrence of the needle (excluding the needle) is returned.
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is empty or not found.
   */
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    $needle = (string)$needle;

    if ($needle === '') {
      return false;
    }

    $encoding = $encoding === 'UTF-8' ? $encoding : self::normalize_encoding($encoding);

    return \mb_stristr($haystack, $needle, $before_needle, $encoding);
  }
4256
4257
  /**
   * Get the string length, not the byte-length!
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       <p>The string being checked for length.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (only applied for UTF-8).</p>
   *
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)</p>
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding !== 'UTF-8' && !is_bool($encoding)) {
      $encoding = self::normalize_encoding($encoding);
    } else {
      $encoding = 'UTF-8';
    }

    // for these encodings the native byte count is returned directly
    // (loose comparison on purpose, same as the "switch" it replaces)
    if (in_array($encoding, array('ASCII', 'CP850'))) {
      return strlen($str);
    }

    if ($cleanUtf8 === true && $encoding === 'UTF-8') {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4299
4300
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: natural order version of UTF8::strcasecmp(); both inputs are run
   *       through UTF8::strtocasefold() and then compared via UTF8::strnatcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4316
4317
  /**
   * String comparisons using a "natural order" algorithm
   *
   * INFO: natural order version of UTF8::strcmp()
   *
   * Inputs that are identical as strings short-circuit to 0; otherwise both are run
   * through UTF8::strtonatfold() and compared with the native strnatcmp().
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    return strnatcmp(self::strtonatfold($str1), self::strtonatfold($str2));
  }
4335
4336
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * Both inputs are run through UTF8::strtocasefold() and then compared via UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4353
4354
  /**
   * String comparison of the first n characters.
   *
   * Both strings are cut down to their first $len characters via UTF8::substr()
   * and the results are compared with UTF8::strcmp().
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // UTF8::substr() may hand back false instead of a string (reported by static analysis),
    // while UTF8::strcmp() is documented for strings only => normalize to string here
    $str1 = (string)self::substr($str1, 0, $len);
    $str2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($str1, $str2);
  }
4374
4375
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>The string where char_list is looked for.</p>
   * @param string $char_list <p>This parameter is case sensitive.</p>
   *
   * @return string|false <p>
   *                      String starting from the character found, or false if it is not found
   *                      (also false if haystack or char_list is empty).
   *                      </p>
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    if (!isset($haystack[0], $char_list[0])) {
      return false;
    }

    if (!preg_match('/' . self::rxClass($char_list) . '/us', $haystack, $match)) {
      return false;
    }

    // byte offset of the first matched character inside the haystack
    $start = strpos($haystack, $match[0]);

    return substr($haystack, $start);
  }
4400
4401
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string  $haystack  <p>The string being checked.</p>
   * @param string  $needle    <p>The string to find in haystack (always treated as a string).</p>
   * @param int     $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found (or haystack / needle is empty) it returns false.
   *                   </p>
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    // INFO: a former "integer needle => UTF8::chr()" branch was removed here; it could never
    //       run because $needle is already a string at this point

    if ($cleanUtf8 === true) {
      // \mb_strpos returns wrong position if invalid characters are found in $haystack before $needle
      // iconv_strpos is not tolerant to invalid characters

      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    // NOTE(review): the flag checked is "iconv" while the call below is grapheme_strpos() - confirm this is intended
    if (self::$support['iconv'] === true) {
      // ignore invalid negative offset to keep compatibility
      // with older php versions (5.5.x before 5.5.35, 5.6.x before 5.6.21, 7.0.x before 7.0.6)
      return \grapheme_strpos($haystack, $needle, $offset > 0 ? $offset : 0);
    }

    // fallback: cut off the skipped part and search byte-wise ...
    if ($offset > 0) {
      $haystack = (string)self::substr($haystack, $offset);
    }

    if (($pos = strpos($haystack, $needle)) !== false) {
      $left = substr($haystack, 0, $pos);

      // ... then translate the byte position back into a character position
      // (negative offset not supported in PHP strpos(), ignoring)
      return ($offset > 0 ? $offset : 0) + self::strlen($left);
    }

    return false;
  }
4484
4485
  /**
   * Finds the last occurrence of a character in a string within another.
   *
   * @link http://php.net/manual/en/function.mb-strrchr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.<br />
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.<br />
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; anything other than "UTF-8"
   *                         is passed through UTF8::normalize_encoding() first.
   *                         </p>
   *
   * @return string|false The portion of haystack or false if needle is not found.
   */
  public static function strrchr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $encoding = $encoding === 'UTF-8' ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrchr($haystack, $needle, $part, $encoding);
  }
4515
4516
  /**
   * alias for "UTF8::strstr()"
   *
   * @see UTF8::strstr()
   *
   * @param string $haystack      <p>The input string.</p>
   * @param string $needle        <p>The string to look for.</p>
   * @param bool   $before_needle <p>Passed through unchanged to UTF8::strstr().</p>
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false)
  {
    $found = self::strstr($haystack, $needle, $before_needle);

    return $found;
  }
4531
4532
  /**
   * alias for "UTF8::stristr()"
   *
   * @see UTF8::stristr()
   *
   * @param string $haystack      <p>The input string.</p>
   * @param string $needle        <p>The string to look for.</p>
   * @param bool   $before_needle <p>Passed through unchanged to UTF8::stristr().</p>
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false)
  {
    $found = self::stristr($haystack, $needle, $before_needle);

    return $found;
  }
4547
4548
  /**
   * Reverses characters order in the string.
   *
   * The string is broken up via UTF8::split(), so whole characters are swapped rather than single bytes.
   *
   * @param string $str The input string
   *
   * @return string The string with characters in the reverse sequence
   */
  public static function strrev($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $chars = self::split($str);
    $reversed = array_reverse($chars);

    return implode($reversed);
  }
4565
4566
  /**
   * Finds the last occurrence of a character in a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack.</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.<br />
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.<br />
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; anything other than "UTF-8"
   *                         is passed through UTF8::normalize_encoding() first.
   *                         </p>
   *
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrichr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $encoding = $encoding === 'UTF-8' ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrichr($haystack, $needle, $part, $encoding);
  }
4596
4597
  /**
4598
   * Find position of last occurrence of a case-insensitive string.
4599
   *
4600
   * @param string  $haystack  <p>The string to look in.</p>
4601
   * @param string  $needle    <p>The string to look for.</p>
4602
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
4603
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4604
   *
4605
   * @return int|false <p>
4606
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />If needle is
4607
   *                   not found, it returns false.
4608
   *                   </p>
4609
   */
4610 1
  public static function strripos($haystack, $needle, $offset = 0, $cleanUtf8 = false)
  {
    // lowercase both operands, so that the case-sensitive search below ignores case
    $haystackLower = self::strtolower($haystack);
    $needleLower = self::strtolower($needle);

    return self::strrpos($haystackLower, $needleLower, $offset, $cleanUtf8);
  }
4614
4615
  /**
4616
   * Find position of last occurrence of a string in a string.
4617
   *
4618
   * @link http://php.net/manual/en/function.mb-strrpos.php
4619
   *
4620
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle</p>
4621
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
4622
   * @param int        $offset    [optional] <p>May be specified to begin searching an arbitrary number of characters
4623
   *                              into the string. Negative values will stop searching at an arbitrary point prior to
4624
   *                              the end of the string.
4625
   *                              </p>
4626
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4627
   *
4628
   * @return int|false <p>The numeric position of the last occurrence of needle in the haystack string.<br />If needle
4629
   *                   is not found, it returns false.</p>
4630
   */
4631 11
  public static function strrpos($haystack, $needle, $offset = null, $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // a non-negative integer needle is converted via chr() before searching
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }
    $needle = (string)$needle;

    // an empty haystack or an empty needle can never produce a match
    if ($haystack === '' || $needle === '') {
      return false;
    }

    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    // NOTE(review): the grapheme variant is guarded by the "iconv" flag - confirm that this is intended
    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback: cut the haystack according to the offset, then search byte-wise

    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $bytePos = strrpos($haystack, $needle);
    if ($bytePos === false) {
      return false;
    }

    // translate the byte position into a character position by measuring the text in front of the match
    $prefix = substr($haystack, 0, $bytePos);

    // negative offset not supported in PHP strpos(), ignoring
    $skipped = $offset > 0 ? $offset : 0;

    return $skipped + self::strlen($prefix);
  }
4686
4687
  /**
4688
   * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
4689
   * mask.
4690
   *
4691
   * @param string $str    <p>The input string.</p>
4692
   * @param string $mask   <p>The mask of chars</p>
4693
   * @param int    $offset [optional]
4694
   * @param int    $length [optional]
4695
   *
4696
   * @return int
4697
   */
4698 8
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // narrow the subject down to the requested window first
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    // anchored at the start: one or more characters taken from the mask's character class
    $pattern = '/^' . self::rxClass($mask) . '+/u';

    if (!preg_match($pattern, $str, $matches)) {
      return 0;
    }

    return self::strlen($matches[0]);
  }
4706
4707
  /**
4708
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
4709
   *
4710
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
4711
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
4712
   * @param bool   $before_needle [optional] <p>
4713
   *                              If <b>TRUE</b>, strstr() returns the part of the
4714
   *                              haystack before the first occurrence of the needle (excluding the needle).
4715
   *                              </p>
4716
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4717
   *
4718
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
4719
   */
4720 2
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // The support flags are filled lazily; make sure they exist before reading
    // self::$support['mbstring'], otherwise the first call would read an undefined index.
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    // UTF-8 without mbstring support: use the grapheme implementation instead
    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4736
4737
  /**
4738
   * Unicode transformation for case-less matching.
4739
   *
4740
   * @link http://unicode.org/reports/tr21/tr21-5.html
4741
   *
4742
   * @param string $str  <p>The input string.</p>
4743
   * @param bool   $full <p>
4744
   *                     <b>true</b> === replace full case folding chars + strtolower (default)<br />
4745
   *                     <b>false</b> use only $commonCaseFold +  strtolower
4746
   *                     </p>
4747
   *
4748
   * @return string
4749
   */
4750 11
  public static function strtocasefold($str, $full = true)
  {
    static $fullTable = null;
    static $commonFrom = null;
    static $commonTo = null;

    // the search / replace lists for the common foldings are built on the first call only
    if ($commonFrom === null) {
      $commonFrom = array_keys(self::$commonCaseFold);
      $commonTo = array_values(self::$commonCaseFold);
    }

    $folded = str_replace($commonFrom, $commonTo, $str);

    if ($full) {
      // the full folding table is loaded lazily, the first time it is needed
      if ($fullTable === null) {
        $fullTable = self::getData('caseFolding_full');
      }

      /** @noinspection OffsetOperationsInspection */
      $folded = str_replace($fullTable[0], $fullTable[1], $folded);
    }

    // clean the folded string, then lowercase whatever is left
    return self::strtolower(self::clean($folded));
  }
4777
4778
  /**
4779
   * Make a string lowercase.
4780
   *
4781
   * @link http://php.net/manual/en/function.mb-strtolower.php
4782
   *
4783
   * @param string $str      <p>The string being lowercased.</p>
4784
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function</p>
4785
   *
4786
   * @return string str with all alphabetic characters converted to lowercase.
4787
   */
4788 21 View Code Duplication
  public static function strtolower($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // an empty input needs no conversion
    if ($str === '') {
      return '';
    }

    // only a non-default encoding name has to be normalized
    $charset = $encoding !== 'UTF-8' ? self::normalize_encoding($encoding) : 'UTF-8';

    return \mb_strtolower($str, $charset);
  }
4803
4804
  /**
4805
   * Generic case sensitive transformation for collation matching.
4806
   *
4807
   * @param string $str <p>The input string</p>
4808
   *
4809
   * @return string
4810
   */
4811 3
  private static function strtonatfold($str)
  {
    // bring the input into normalization form D first ...
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    // ... then remove every run of characters from the \p{Mn} class
    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4815
4816
  /**
4817
   * Make a string uppercase.
4818
   *
4819
   * @link http://php.net/manual/en/function.mb-strtoupper.php
4820
   *
4821
   * @param string $str      <p>The string being uppercased.</p>
4822
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4823
   *
4824
   * @return string str with all alphabetic characters converted to uppercase.
4825
   */
4826 16 View Code Duplication
  public static function strtoupper($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // an empty input needs no conversion
    if ($str === '') {
      return '';
    }

    // only a non-default encoding name has to be normalized
    $charset = $encoding !== 'UTF-8' ? self::normalize_encoding($encoding) : 'UTF-8';

    return \mb_strtoupper($str, $charset);
  }
4840
4841
  /**
4842
   * Translate characters or replace sub-strings.
4843
   *
4844
   * @link  http://php.net/manual/en/function.strtr.php
4845
   *
4846
   * @param string          $str  <p>The string being translated.</p>
4847
   * @param string|string[] $from <p>The string replacing from.</p>
4848
   * @param string|string[] $to   <p>The string being translated to.</p>
4849
   *
4850
   * @return string <p>
4851
   *                This function returns a copy of str, translating all occurrences of each character in from to the
4852
   *                corresponding character in to.
4853
   *                </p>
4854
   */
4855 1
  public static function strtr($str, $from, $to = INF)
  {
    // Three-argument form: build a "search => replacement" map from $from and $to.
    if (INF !== $to) {
      // Arrays already are lists of search / replacement items and must not be handed to
      // str_split(), which expects a string; only strings are split into their characters.
      $from = is_array($from) ? array_values($from) : self::str_split($from);
      $to = is_array($to) ? array_values($to) : self::str_split($to);

      // pair the items one-to-one; surplus items on the longer side are ignored
      $pairCount = min(count($from), count($to));

      $from = array_combine(
          array_slice($from, 0, $pairCount),
          array_slice($to, 0, $pairCount)
      );
    }

    // Two-argument form of the native strtr(): $from has to be an array of replace pairs here,
    // i.e. when $to is omitted the caller must pass such an array as $from.
    return strtr($str, $from);
  }
4874
4875
  /**
4876
   * Return the width of a string.
4877
   *
4878
   * @param string  $str       <p>The input string.</p>
4879
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
4880
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4881
   *
4882
   * @return int
4883
   */
4884 1
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // only a non-default encoding name has to be normalized
    $charset = $encoding === 'UTF-8' ? 'UTF-8' : self::normalize_encoding($encoding);

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    return \mb_strwidth($str, $charset);
  }
4899
4900
  /**
4901
   * Get part of a string.
4902
   *
4903
   * @link http://php.net/manual/en/function.mb-substr.php
4904
   *
4905
   * @param string  $str       <p>The string being checked.</p>
4906
   * @param int     $start     <p>The first position used in str.</p>
4907
   * @param int     $length    [optional] <p>The maximum length of the returned string.</p>
4908
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
4909
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4910
   *
4911
   * @return string|false <p>The sub-string specified by the start and length parameters,<br />or false if start lies beyond the end of the string.</p>
4912
   */
4913 47
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    // the character count is only needed for the range check and for the default length
    $charCount = ($start || $length === null) ? (int)self::strlen($str) : 0;

    // a start position behind the last character is reported as false
    if ($start && $start > $charCount) {
      return false;
    }

    // an omitted length means "everything up to the end"
    $length = $length === null ? $charCount : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding === 'UTF-8' || is_bool($encoding)) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // INFO: use "mb_"-function (with polyfill) also if we need another encoding
    if ($encoding !== 'UTF-8' || self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return (string)\grapheme_substr($str, $start, $length);
    }

    // fallback: split into single characters, take the requested slice and join it to a string again
    $chars = self::split($str);

    return implode('', array_slice($chars, $start, $length));
  }
4978
4979
  /**
4980
   * Binary safe comparison of two strings from an offset, up to length characters.
4981
   *
4982
   * @param string  $main_str           <p>The main string being compared.</p>
4983
   * @param string  $str                <p>The secondary string being compared.</p>
4984
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts counting from
4985
   *                                    the end of the string.</p>
4986
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the largest of
4987
   *                                    the length of the str compared to the length of main_str less the offset.</p>
4988
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
4989
   *                                    insensitive.</p>
4990
   *
4991
   * @return int
4992
   */
4993 1
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
  {
    // substr() reports a start position behind the end of the string as false; turn that into
    // an empty string, so that only strings reach strlen() and the comparison methods below.
    $main_str = (string)self::substr($main_str, $offset, $length);

    // compare against a prefix of $str with the same number of characters as the extracted part
    $str = (string)self::substr($str, 0, self::strlen($main_str));

    if ($case_insensitivity === true) {
      return self::strcasecmp($main_str, $str);
    }

    return self::strcmp($main_str, $str);
  }
5000
5001
  /**
5002
   * Count the number of substring occurrences.
5003
   *
5004
   * @link  http://php.net/manual/en/function.substr-count.php
5005
   *
5006
   * @param string $haystack  <p>The string to search in.</p>
5007
   * @param string $needle    <p>The substring to search for.</p>
5008
   * @param int    $offset    [optional] <p>The offset where to start counting.</p>
5009
   * @param int    $length    [optional] <p>
5010
   *                          The maximum length after the specified offset to search for the
5011
   *                          substring. It outputs a warning if the offset plus the length is
5012
   *                          greater than the haystack length.
5013
   *                          </p>
5014
   * @param string $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
5015
   *
5016
   * @return int|false <p>The number of occurrences as an integer,<br />or false if the haystack or the needle is empty or if offset plus length is not positive.</p>
5017
   */
5018 1
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8')
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    // nothing can be counted in / for an empty string
    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      // a window that ends at or before the start of the haystack cannot be searched
      if ((int)$length + $offset <= 0) {
        return false;
      }

      // Keep an omitted length as null, so that substr() reads up to the end of the haystack.
      // Casting it to 0 would cut the haystack down to an empty string and always count 0.
      if ($length !== null) {
        $length = (int)$length;
      }

      // substr() returns false for an offset behind the end of the haystack, hence the cast
      $haystack = (string)self::substr($haystack, $offset, $length, $encoding);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5044
5045
  /**
5046
   * Replace text within a portion of a string.
5047
   *
5048
   * source: https://gist.github.com/stemar/8287074
5049
   *
5050
   * @param string|string[] $str         <p>The input string or an array of strings.</p>
5051
   * @param string|string[] $replacement <p>The replacement string or an array of strings.</p>
5052
   * @param int|int[]       $start
5053
   * @param int|int[]|null  $length      [optional]
5054
   *
5055
   * @return string|string[]
5056
   */
5057 6
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    // Array input: bring every argument to a list with one entry per input string,
    // then handle each string on its own.
    if (is_array($str)) {
      $num = count($str);

      // $replacement
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start: entries which are not integers fall back to 0
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length: an omitted length becomes 0 for every string; inside a given array,
      // entries which are not integers fall back to $num and unset entries to 0
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // String input with a list of replacements: only the first replacement is used.
    if (is_array($replacement)) {
      $replacement = count($replacement) > 0 ? $replacement[0] : '';
    }

    // split both strings into their characters ("s" modifier: the dot also matches newlines)
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    // An omitted length replaces everything from $start up to the end. The number of split
    // characters is used for that, so the result does not depend on mbstring's internal encoding.
    if ($length === null) {
      $length = count($smatches[0]);
    }

    array_splice($smatches[0], $start, $length, $rmatches[0]);

    // separator first, array second: the reversed legacy argument order is not supported by PHP 8
    return implode('', $smatches[0]);
  }
5120
5121
  /**
5122
   * Returns a case swapped version of the string.
5123
   *
5124
   * @param string $str      <p>The input string.</p>
5125
   * @param string $encoding [optional] <p>Default is UTF-8</p>
5126
   *
5127
   * @return string <p>Each character's case swapped.</p>
5128
   */
5129 1
  public static function swapCase($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $str = self::clean($str);

    // visit every non-whitespace character and flip its case
    return preg_replace_callback(
        '/[\S]/u',
        function ($match) use ($encoding) {
          $char = $match[0];
          $upper = UTF8::strtoupper($char, $encoding);

          // unchanged by strtoupper() => hand back the lowercase form, otherwise the uppercase form
          return $char === $upper ? UTF8::strtolower($char, $encoding) : $upper;
        },
        $str
    );
  }
5159
5160
  /**
5161
   * alias for "UTF8::to_ascii()"
5162
   *
5163
   * @see UTF8::to_ascii()
5164
   *
5165
   * @param string $s
5166
   * @param string $subst_chr
5167
   *
5168
   * @return string
5169
   */
5170 6
  public static function toAscii($s, $subst_chr = '?')
  {
    // plain alias: both arguments are handed through unchanged
    $ascii = self::to_ascii($s, $subst_chr);

    return $ascii;
  }
5174
5175
  /**
5176
   * alias for "UTF8::to_latin1()"
5177
   *
5178
   * @see UTF8::to_latin1()
5179
   *
5180
   * @param $str
5181
   *
5182
   * @return string
5183
   */
5184 1
  public static function toLatin1($str)
  {
    // plain alias: the argument is handed through unchanged
    $latin1 = self::to_latin1($str);

    return $latin1;
  }
5188
5189
  /**
5190
   * alias for "UTF8::to_utf8()"
5191
   *
5192
   * @see UTF8::to_utf8()
5193
   *
5194
   * @param string $str
5195
   *
5196
   * @return string
5197
   */
5198 1
  public static function toUTF8($str)
  {
    // plain alias: the argument is handed through unchanged
    $utf8 = self::to_utf8($str);

    return $utf8;
  }
5202
5203
  /**
   * Convert a string into ASCII.
   *
   * The string is cleaned via "UTF8::clean()" first. If the "intl" support flag is set (and PHP >= 5.4),
   * it is transliterated with the "Any-Latin; Latin-ASCII;" rule set and returned as soon as no byte
   * >= 0x80 is left. Whatever is still non-ASCII afterwards is replaced character by character via the
   * lookup tables in "data/x??.php" (one file per 256 code points); characters without a table entry
   * become "$unknown".
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
   *
   * @return string
   */
  public static function to_ascii($str, $unknown = '?')
  {
    // cache of the loaded lookup tables, indexed by "bank" (code point >> 8)
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str);

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intl'] === true && Bootup::is_php('5.4')) {
      $transliterated = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

      // transliterator_transliterate() returns false on failure,
      // in that case we keep the string and fall back to the lookup tables
      if ($transliterated !== false) {
        $str = $transliterated;
      }

      // check again, if we only have ASCII, now ...
      if (!preg_match("/[\x80-\xFF]/", $str)) {
        return $str;
      }
    }

    // split the string into single characters
    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = $ar[0];

    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // the lead byte tells us the sequence length and carries the highest bits of the code point
      if ($ordC0 >= 192 && $ordC0 <= 223) {
        $seqLength = 2;
        $ord = $ordC0 - 192;
      } elseif ($ordC0 >= 224 && $ordC0 <= 239) {
        $seqLength = 3;
        $ord = $ordC0 - 224;
      } elseif ($ordC0 >= 240 && $ordC0 <= 247) {
        $seqLength = 4;
        $ord = $ordC0 - 240;
      } elseif ($ordC0 >= 248 && $ordC0 <= 251) {
        $seqLength = 5;
        $ord = $ordC0 - 248;
      } elseif ($ordC0 >= 252 && $ordC0 <= 253) {
        $seqLength = 6;
        $ord = $ordC0 - 252;
      } else {
        // stray continuation byte (128-191) or invalid lead byte (254-255)
        $c = $unknown;
        continue;
      }

      // never read continuation bytes that are not there
      if (strlen($c) < $seqLength) {
        $c = $unknown;
        continue;
      }

      // every continuation byte contributes its lower 6 bits
      for ($byteIndex = 1; $byteIndex < $seqLength; $byteIndex++) {
        $ord = $ord * 64 + (ord($c[$byteIndex]) - 128);
      }

      $bank = $ord >> 8;
      if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          // NOTE(review): the included file is presumably expected to fill $UTF8_TO_ASCII[$bank] - confirm
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        }

        // no table for this bank (or the file did not define one)
        if (!isset($UTF8_TO_ASCII[$bank])) {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }

    // break the reference, so that later writes to "$c" can't modify the last array element
    unset($c);

    return implode('', $chars);
  }
5317
5318
  /**
   * Alias of "UTF8::to_iso8859()".
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged to to_iso8859().</p>
   *
   * @return string|string[] <p>The result of "UTF8::to_iso8859()".</p>
   */
  public static function toIso8859($str)
  {
    $iso8859 = self::to_iso8859($str);

    return $iso8859;
  }
5331
5332
  /**
   * Alias of "UTF8::to_iso8859()" (Latin-1 is ISO-8859-1).
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged to to_iso8859().</p>
   *
   * @return string|string[] <p>The result of "UTF8::to_iso8859()".</p>
   */
  public static function to_latin1($str)
  {
    $latin1 = self::to_iso8859($str);

    return $latin1;
  }
5345
5346
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It decodes numeric html-entities with 2-4 digits (e.g. "&#252;") and unicode escape sequences (e.g. "\u00fc").
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   * is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * Arrays are converted element by element (recursively), the keys are kept.
   *
   * @param string|string[] $str <p>Any string or array.</p>
   *
   * @return string|string[] <p>The UTF-8 encoded string.</p>
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];
      $ordC1 = ord($c1);

      // 0x00 - 0x7F: it doesn't need conversion
      if ($ordC1 < 0x80) {
        $buf .= $c1;
        continue;
      }

      // The byte read as a code point (ISO-8859-1) and written as a 2-byte UTF-8 sequence.
      // Integer shift instead of "chr(ord($c1) / 64)": a fractional float must not be passed to chr().
      $byteAsUtf8 = chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));

      // 0x80 - 0xBF: can't start an UTF-8 sequence - needs conversion
      if ($ordC1 < 0xC0) {
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
        } else {
          $buf .= $byteAsUtf8;
        }

        continue;
      }

      // 0xC0 - 0xFF: should be converted to UTF8, if it's not UTF8 already
      if ($ordC1 <= 0xDF) {
        $seqLength = 2; // looks like 2 bytes UTF8
      } elseif ($ordC1 <= 0xEF) {
        $seqLength = 3; // looks like 3 bytes UTF8
      } elseif ($ordC1 <= 0xF7) {
        $seqLength = 4; // looks like 4 bytes UTF8
      } else {
        $seqLength = 0; // doesn't look like UTF8, but should be converted
      }

      // it's only (almost sure) UTF8 already, if all expected continuation bytes (0x80 - 0xBF) are there
      $isUtf8 = ($seqLength > 0 && $i + $seqLength <= $max);
      for ($j = 1; $isUtf8 && $j < $seqLength; $j++) {
        $isUtf8 = ((ord($str[$i + $j]) & 0xC0) === 0x80);
      }

      if ($isUtf8) {
        // keep the sequence as it is and skip its continuation bytes
        $buf .= substr($str, $i, $seqLength);
        $i += $seqLength - 1;
      } else {
        // not valid UTF8 - convert it
        $buf .= $byteAsUtf8;
      }
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode numeric html-entities (2-4 digits)
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5474
5475
  /**
   * Convert a string into "ISO-8859"-encoding (Latin-1).
   *
   * Arrays are converted element by element (recursively), the keys are kept.
   * Everything else is cast to string and handed to "UTF8::utf8_decode()".
   *
   * @param string|string[] $str
   *
   * @return string|string[] <p>The converted input, an empty string for empty input.</p>
   */
  public static function to_iso8859($str)
  {
    if (!is_array($str)) {
      $str = (string)$str;

      return isset($str[0]) ? self::utf8_decode($str) : '';
    }

    foreach ($str as $key => $value) {
      /** @noinspection AlterInForeachInspection */
      /** @noinspection OffsetOperationsInspection */
      $str[$key] = self::to_iso8859($value);
    }

    return $str;
  }
5503
5504
  /**
   * Strip whitespace or other characters from beginning or end of a UTF-8 string.
   *
   * INFO: This is slower than "trim()".
   *
   * The native function could only be used for pure 7-bit strings / chars,
   * but the ASCII check costs more time than it would save here.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars [optional] <p>
   *                      Optional characters to be stripped. Without it (or with any falsy value,
   *                      this includes the string "0") unicode separators and control characters
   *                      are stripped.
   *                      </p>
   *
   * @return string <p>The trimmed string.</p>
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    $useDefaultCharList = ($chars === INF || !$chars);

    if (!$useDefaultCharList) {
      return self::rtrim(self::ltrim($str, $chars), $chars);
    }

    // \pZ = separators, \pC = control characters and other "invisible" code points
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
  }
5532
5533
  /**
   * Makes string's first char uppercase.
   *
   * The first character is cut off via "UTF8::substr()", upper-cased via "UTF8::strtoupper()"
   * and glued back in front of the rest of the string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function ucfirst($str)
  {
    $firstChar = self::substr($str, 0, 1);
    $firstCharUpper = self::strtoupper($firstChar);
    $rest = self::substr($str, 1);

    return $firstCharUpper . $rest;
  }
5544
5545
  /**
   * Alias of "UTF8::ucfirst()".
   *
   * @see UTF8::ucfirst()
   *
   * @param string $word <p>The input word.</p>
   *
   * @return string <p>The result of "UTF8::ucfirst()".</p>
   */
  public static function ucword($word)
  {
    $ucWord = self::ucfirst($word);

    return $ucWord;
  }
5558
5559
  /**
   * Uppercase for all words in the string.
   *
   * The string is split on the space character (" ") only; every part that is not listed
   * in "$exceptions" (strict comparison) is passed through "UTF8::ucfirst()".
   *
   * @param string   $str        <p>The input string.</p>
   * @param string[] $exceptions [optional] <p>Exclusion for some words.</p>
   *
   * @return string
   */
  public static function ucwords($str, $exceptions = array())
  {
    $str = (string)$str;

    // same empty-check as in the other methods: a plain "!$str" would also swallow the string "0"
    if (!isset($str[0])) {
      return '';
    }

    // init
    $useExceptions = count($exceptions) > 0;
    $words = explode(' ', $str);

    foreach ($words as $key => $word) {
      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $words[$key] = self::ucfirst($word);
      }
    }

    return implode(' ', $words);
  }
5600
5601
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'DÃ¼sseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function urldecode($str)
  {
    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    // first pass: plain url-decoding, then rewrite "%uXXXX" into a hex html-entity
    $decoded = urldecode($str);
    $decoded = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', $decoded);

    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    } else {
      $flags = ENT_QUOTES;
    }

    // second pass: normalize to UTF-8, decode entities, url-decode again and repair broken UTF-8
    $decoded = self::to_utf8($decoded);
    $decoded = self::html_entity_decode($decoded, $flags);
    $decoded = rawurldecode($decoded);
    $decoded = self::fix_simple_utf8($decoded);

    return (string)$decoded;
  }
5641
5642
  /**
   * Return an array with "urlencoded"-win1252 -> UTF-8
   *
   * The keys are the url-encoded bytes "%20" - "%FF", the values are the characters
   * these bytes stand for in Windows-1252 (as UTF-8 strings).
   *
   * NOTE(review): "%81", "%8D", "%8F", "%90" and "%9D" are unassigned in Windows-1252 and map to
   * an empty string here - confirm that the original table did not contain invisible characters.
   *
   * @return array <p>The lookup table: url-encoded byte => UTF-8 character.</p>
   */
  public static function urldecode_fix_win1252_chars()
  {
    static $table = array(
        '%20' => ' ',
        '%21' => '!',
        '%22' => '"',
        '%23' => '#',
        '%24' => '$',
        '%25' => '%',
        '%26' => '&',
        '%27' => "'",
        '%28' => '(',
        '%29' => ')',
        '%2A' => '*',
        '%2B' => '+',
        '%2C' => ',',
        '%2D' => '-',
        '%2E' => '.',
        '%2F' => '/',
        '%30' => '0',
        '%31' => '1',
        '%32' => '2',
        '%33' => '3',
        '%34' => '4',
        '%35' => '5',
        '%36' => '6',
        '%37' => '7',
        '%38' => '8',
        '%39' => '9',
        '%3A' => ':',
        '%3B' => ';',
        '%3C' => '<',
        '%3D' => '=',
        '%3E' => '>',
        '%3F' => '?',
        '%40' => '@',
        '%41' => 'A',
        '%42' => 'B',
        '%43' => 'C',
        '%44' => 'D',
        '%45' => 'E',
        '%46' => 'F',
        '%47' => 'G',
        '%48' => 'H',
        '%49' => 'I',
        '%4A' => 'J',
        '%4B' => 'K',
        '%4C' => 'L',
        '%4D' => 'M',
        '%4E' => 'N',
        '%4F' => 'O',
        '%50' => 'P',
        '%51' => 'Q',
        '%52' => 'R',
        '%53' => 'S',
        '%54' => 'T',
        '%55' => 'U',
        '%56' => 'V',
        '%57' => 'W',
        '%58' => 'X',
        '%59' => 'Y',
        '%5A' => 'Z',
        '%5B' => '[',
        '%5C' => '\\',
        '%5D' => ']',
        '%5E' => '^',
        '%5F' => '_',
        '%60' => '`',
        '%61' => 'a',
        '%62' => 'b',
        '%63' => 'c',
        '%64' => 'd',
        '%65' => 'e',
        '%66' => 'f',
        '%67' => 'g',
        '%68' => 'h',
        '%69' => 'i',
        '%6A' => 'j',
        '%6B' => 'k',
        '%6C' => 'l',
        '%6D' => 'm',
        '%6E' => 'n',
        '%6F' => 'o',
        '%70' => 'p',
        '%71' => 'q',
        '%72' => 'r',
        '%73' => 's',
        '%74' => 't',
        '%75' => 'u',
        '%76' => 'v',
        '%77' => 'w',
        '%78' => 'x',
        '%79' => 'y',
        '%7A' => 'z',
        '%7B' => '{',
        '%7C' => '|',
        '%7D' => '}',
        '%7E' => '~',
        '%7F' => "\x7F", // DELETE (non-printable)
        '%80' => '€', // EURO SIGN (was wrongly mapped to a backtick)
        '%81' => '',
        '%82' => '‚',
        '%83' => 'ƒ',
        '%84' => '„',
        '%85' => '…',
        '%86' => '†',
        '%87' => '‡',
        '%88' => 'ˆ',
        '%89' => '‰',
        '%8A' => 'Š',
        '%8B' => '‹',
        '%8C' => 'Œ',
        '%8D' => '',
        '%8E' => 'Ž',
        '%8F' => '',
        '%90' => '',
        '%91' => '‘',
        '%92' => '’',
        '%93' => '“',
        '%94' => '”',
        '%95' => '•',
        '%96' => '–',
        '%97' => '—',
        '%98' => '˜',
        '%99' => '™',
        '%9A' => 'š',
        '%9B' => '›',
        '%9C' => 'œ',
        '%9D' => '',
        '%9E' => 'ž',
        '%9F' => 'Ÿ',
        '%A0' => "\xC2\xA0", // NO-BREAK SPACE (invisible, therefore written as escape sequence)
        '%A1' => '¡',
        '%A2' => '¢',
        '%A3' => '£',
        '%A4' => '¤',
        '%A5' => '¥',
        '%A6' => '¦',
        '%A7' => '§',
        '%A8' => '¨',
        '%A9' => '©',
        '%AA' => 'ª',
        '%AB' => '«',
        '%AC' => '¬',
        '%AD' => "\xC2\xAD", // SOFT HYPHEN (invisible, therefore written as escape sequence)
        '%AE' => '®',
        '%AF' => '¯',
        '%B0' => '°',
        '%B1' => '±',
        '%B2' => '²',
        '%B3' => '³',
        '%B4' => '´',
        '%B5' => 'µ',
        '%B6' => '¶',
        '%B7' => '·',
        '%B8' => '¸',
        '%B9' => '¹',
        '%BA' => 'º',
        '%BB' => '»',
        '%BC' => '¼',
        '%BD' => '½',
        '%BE' => '¾',
        '%BF' => '¿',
        '%C0' => 'À',
        '%C1' => 'Á',
        '%C2' => 'Â',
        '%C3' => 'Ã',
        '%C4' => 'Ä',
        '%C5' => 'Å',
        '%C6' => 'Æ',
        '%C7' => 'Ç',
        '%C8' => 'È',
        '%C9' => 'É',
        '%CA' => 'Ê',
        '%CB' => 'Ë',
        '%CC' => 'Ì',
        '%CD' => 'Í',
        '%CE' => 'Î',
        '%CF' => 'Ï',
        '%D0' => 'Ð',
        '%D1' => 'Ñ',
        '%D2' => 'Ò',
        '%D3' => 'Ó',
        '%D4' => 'Ô',
        '%D5' => 'Õ',
        '%D6' => 'Ö',
        '%D7' => '×',
        '%D8' => 'Ø',
        '%D9' => 'Ù',
        '%DA' => 'Ú',
        '%DB' => 'Û',
        '%DC' => 'Ü',
        '%DD' => 'Ý',
        '%DE' => 'Þ',
        '%DF' => 'ß',
        '%E0' => 'à',
        '%E1' => 'á',
        '%E2' => 'â',
        '%E3' => 'ã',
        '%E4' => 'ä',
        '%E5' => 'å',
        '%E6' => 'æ',
        '%E7' => 'ç',
        '%E8' => 'è',
        '%E9' => 'é',
        '%EA' => 'ê',
        '%EB' => 'ë',
        '%EC' => 'ì',
        '%ED' => 'í',
        '%EE' => 'î',
        '%EF' => 'ï',
        '%F0' => 'ð',
        '%F1' => 'ñ',
        '%F2' => 'ò',
        '%F3' => 'ó',
        '%F4' => 'ô',
        '%F5' => 'õ',
        '%F6' => 'ö',
        '%F7' => '÷',
        '%F8' => 'ø',
        '%F9' => 'ù',
        '%FA' => 'ú',
        '%FB' => 'û',
        '%FC' => 'ü',
        '%FD' => 'ý',
        '%FE' => 'þ',
        '%FF' => 'ÿ',
    );

    return $table;
  }
5878
5879
  /**
   * Decodes an UTF-8 string to ISO-8859-1.
   *
   * The input is normalized via "UTF8::to_utf8()" first, then every key of the
   * "$utf8ToWin1252" table is replaced by its value before the string is handed
   * to "Xml::utf8_decode()".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_decode($str)
  {
    // search / replace lists, built once from the lookup table and then reused
    static $searchList = null;
    static $replaceList = null;

    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    // init
    $str = self::to_utf8($str);

    if (null === $searchList) {
      $searchList = array_keys(self::$utf8ToWin1252);
      $replaceList = array_values(self::$utf8ToWin1252);
    }

    $prepared = str_replace($searchList, $replaceList, $str);

    return Xml::utf8_decode($prepared);
  }
5907
5908
  /**
   * Encodes an ISO-8859-1 string to UTF-8.
   *
   * After the native "utf8_encode()" call, every key of the "$cp1252ToUtf8" table
   * is replaced by its value - but only if the result contains the byte 0xC2 at all.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_encode($str)
  {
    // search / replace lists, built once from the lookup table and then reused
    static $searchList = null;
    static $replaceList = null;

    $encoded = \utf8_encode($str);

    // without a "\xC2" byte there is nothing to replace
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if (null === $searchList) {
      $searchList = array_keys(self::$cp1252ToUtf8);
      $replaceList = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($searchList, $replaceList, $encoded);
  }
5934
5935
  /**
   * Alias of "UTF8::fix_simple_utf8()" (fix -> utf8-win1252 chars).
   *
   * @see UTF8::fix_simple_utf8()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The result of "UTF8::fix_simple_utf8()".</p>
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   */
  public static function utf8_fix_win1252_chars($str)
  {
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
5948
5949
  /**
   * Returns the table of utf8 whitespace characters ("self::$whitespaceTable").
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E.
   *
   * @return array <p>
   *               An array with all known whitespace characters as values and the type of whitespace as keys
   *               as defined in above URL.
   *               </p>
   */
  public static function whitespace_table()
  {
    $table = self::$whitespaceTable;

    return $table;
  }
5965
5966
  /**
   * Limit the number of words in a string.
   *
   * A "word" is a run of non-whitespace characters. If the string has more words than allowed,
   * it is cut after the last allowed word, right-trimmed and "$strAddOn" is appended;
   * otherwise the string is returned unchanged.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $words    <p>The limit of words as integer, values below 1 result in an empty string.</p>
   * @param string $strAddOn <p>Replacement for the stripped part of the string.</p>
   *
   * @return string
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    $words = (int)$words;

    if ($words < 1) {
      return '';
    }

    // optional leading whitespace, followed by 1 to "$words" words (including their trailing whitespace)
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $words . '}/u';
    preg_match($pattern, $str, $matches);

    // only a match that is shorter than the whole string means that something was cut off
    if (
        isset($matches[0])
        &&
        self::strlen($str) !== self::strlen($matches[0])
    ) {
      return self::rtrim($matches[0]) . $strAddOn;
    }

    return $str;
  }
6001
6002
  /**
   * Wraps a string to a given number of characters
   *
   * The string is turned into a one-byte-per-character "mask" (" " for a space, "#" for an
   * existing break and "?" for everything else). This mask is wrapped by the native
   * "wordwrap()" and the resulting break positions are applied to the real characters.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>The input string.</p>
   * @param int    $width [optional] <p>The column width.</p>
   * @param string $break [optional] <p>The line is broken using the optional break parameter.</p>
   * @param bool   $cut   [optional] <p>
   *                      If the cut is set to true, the string is
   *                      always wrapped at or before the specified width. So if you have
   *                      a word that is larger than the given width, it is broken apart.
   *                      </p>
   *
   * @return string <p>The given string wrapped at the specified column.</p>
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if ('' === $str || '' === $break) {
      return '';
    }

    $segments = explode($break, $str);

    if (count($segments) === 1 && $segments[0] === '') {
      return '';
    }

    // build the list of characters and the mask, both in the same order
    $chars = array();
    $mask = '';

    foreach ($segments as $segmentIndex => $segment) {

      // an already existing break is kept as one single element
      if ($segmentIndex > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        $mask .= (' ' === $char) ? ' ' : '?';
      }
    }

    $mask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $charIndex = 0; // position in "$chars"
    $maskIndex = -1; // position in "$mask"
    $breakPos = -1; // position of the current "#" in "$mask"

    while (false !== ($breakPos = self::strpos($mask, '#', $breakPos + 1))) {

      // copy everything up to the break
      for (++$maskIndex; $maskIndex < $breakPos; ++$maskIndex) {
        $wrapped .= $chars[$charIndex];
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      // the break replaces an old break or a space, so that character is dropped
      if ($break === $chars[$charIndex] || ' ' === $chars[$charIndex]) {
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      $wrapped .= $break;
    }

    // whatever is left belongs to the last line
    return $wrapped . implode('', $chars);
  }
6073
6074
  /**
   * Returns the array of Unicode White Space characters ("self::$whitespace").
   *
   * @return array <p>An array with numeric code point as key and White Space Character as value.</p>
   */
  public static function ws()
  {
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6083
6084
}
6085