Completed
Push — master ( c96d38...6e5540 )
by Lars
04:04
created

UTF8::strspn()   B

Complexity

Conditions 5
Paths 6

Size

Total Lines 17
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 5

Importance

Changes 6
Bugs 1 Features 2
Metric Value
c 6
b 1
f 2
dl 0
loc 17
ccs 7
cts 7
cp 1
rs 8.8571
cc 5
eloc 9
nc 6
nop 4
crap 5
1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
82
   * Bom => Byte-Length
83
   *
84
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
85
   *
86
   * @var array
87
   */
88
  private static $bom = array(
89
      "\xef\xbb\xbf"     => 3, // UTF-8 BOM
90
      ''              => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more then one byte ...)
91
      "\x00\x00\xfe\xff" => 4, // UTF-32 (BE) BOM
92
      "\xff\xfe\x00\x00" => 4, // UTF-32 (LE) BOM
93
      "\xfe\xff"         => 2, // UTF-16 (BE) BOM
94
      'þÿ'               => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
95
      "\xff\xfe"         => 2, // UTF-16 (LE) BOM
96
      'ÿþ'               => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
0 ignored issues
show
Unused Code Comprehensibility introduced by
36% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
97
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    //HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
   * Cache of the detected runtime capabilities, filled by checkForSupport().
   *
   * Keys written there: 'already_checked_via_portable_utf8' (set to true once the
   * detection has run), 'mbstring', 'iconv', 'intl', 'intlChar' and 'pcre_utf8'.
   *
   * @var array
   */
802
  private static $support = array();
803
804
  /**
   * Instantiating the helper only makes sure that the environment
   * detection (see checkForSupport()) has been executed.
   */
  public function __construct()
  {
    self::checkForSupport();
  }
811
812
  /**
   * Fetch the character found at a given position, similar to $str[1] for byte strings.
   *
   * @param string $str <p>A UTF-8 string.</p>
   * @param int    $pos <p>The position of the character to return.</p>
   *
   * @return string <p>Single Multi-Byte character.</p>
   */
  public static function access($str, $pos)
  {
    // a substring of length one, starting at the requested position
    $singleChar = self::substr($str, $pos, 1);

    return $singleChar;
  }
824
825
  /**
   * Make sure the string starts with the UTF-8 BOM and return the resulting string.
   *
   * INFO: If a BOM is already detected, the input string is returned unchanged.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The output string that contains BOM.</p>
   */
  public static function add_bom_to_string($str)
  {
    // anything other than an explicit "false" means: there is nothing to prepend
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    return self::bom() . $str;
  }
842
843
  /**
   * Convert a binary representation ("0" and "1" digits) into a string.
   *
   * The digits are read as a big-endian bit stream: the input is left-padded with
   * zeros up to a multiple of eight bits and every group of eight bits becomes one
   * byte. Working octet by octet (instead of converting the whole value at once via
   * base_convert()) keeps leading zero bits, avoids the float precision limit of
   * base_convert() on long inputs and avoids the wrong-nibble padding that
   * pack('H*') applies to hex strings of odd length.
   *
   * @param mixed $bin <p>String (or number) made of the digits 1|0.</p>
   *
   * @return string <p>The decoded bytes; an empty string if the input contains no binary digit.</p>
   */
  public static function binary_to_str($bin)
  {
    // base_convert() ignored every non-binary character, keep that tolerance
    $bits = preg_replace('/[^01]/', '', (string)$bin);
    if ($bits === null || $bits === '') {
      return '';
    }

    // left-pad to full octets, so that e.g. "1100001" is read as "01100001"
    $missing = (8 - (strlen($bits) % 8)) % 8;
    if ($missing > 0) {
      $bits = str_repeat('0', $missing) . $bits;
    }

    $result = '';
    foreach (str_split($bits, 8) as $octet) {
      $result .= chr(bindec($octet));
    }

    return $result;
  }
854
855
  /**
   * Provide the byte sequence of the UTF-8 Byte Order Mark.
   *
   * @return string <p>The three bytes 0xEF 0xBB 0xBF.</p>
   */
  public static function bom()
  {
    $utf8Bom = "\xEF\xBB\xBF";

    return $utf8Bom;
  }
864
865
  /**
   * Alias of UTF8::chr_map(): the arguments are passed through unchanged.
   *
   * @see UTF8::chr_map()
   *
   * @param string|array $callback <p>The callback function.</p>
   * @param string       $str      <p>The string to run the callback on.</p>
   *
   * @return array <p>Whatever UTF8::chr_map() returns.</p>
   */
  public static function callback($callback, $str)
  {
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
   * Detect once which UTF-8 related capabilities the server environment offers
   * and remember the outcome in self::$support.
   *
   * INFO: There is no need to call it manually, it is triggered when it is needed.
   */
  public static function checkForSupport()
  {
    // the detection already ran: nothing to do
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      return;
    }

    // set the marker first, exactly like before, then collect the single results
    self::$support['already_checked_via_portable_utf8'] = true;

    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
   * Generates a UTF-8 encoded character from the given code point.
   *
   * INFO: opposite to UTF8::ord()
   *
   * Only real integers are accepted. Negative values are rejected, because the
   * fallback encoder would otherwise hand them to the native chr(), which wraps
   * them into a single raw byte (e.g. -1 => "\xFF") that is not valid UTF-8.
   *
   * @param int $code_point <p>The code point for which to generate a character.</p>
   *
   * @return string|null <p>Multi-Byte character, returns null on failure to encode.</p>
   */
  public static function chr($code_point)
  {
    // non-integers (numeric strings, floats, ...) and negative numbers cannot be encoded
    if (is_int($code_point) === false || $code_point < 0) {
      return null;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // use static cache, if there is no support for "IntlChar"
    static $cache = array();
    if (isset($cache[$code_point]) === true) {
      return $cache[$code_point];
    }

    // 1 byte: 0xxxxxxx
    if ($code_point <= 0x7f) {
      return $cache[$code_point] = chr($code_point);
    }

    // 2 bytes: 110xxxxx 10xxxxxx
    if ($code_point <= 0x7ff) {
      return $cache[$code_point] = chr(0xc0 | ($code_point >> 6)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0xffff) {
      return $cache[$code_point] = chr(0xe0 | ($code_point >> 12)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0x10ffff) {
      return $cache[$code_point] = chr(0xf0 | ($code_point >> 18)) .
                                   chr(0x80 | (($code_point >> 12) & 0x3f)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // beyond the Unicode range: U+FFFD REPLACEMENT CHARACTER
    return $cache[$code_point] = "\xEF\xBF\xBD";
  }
954
955
  /**
   * Run a callback on every character of a string and collect the results.
   *
   * @param string|array $callback <p>The callback function.</p>
   * @param string       $str      <p>UTF-8 string to run callback on.</p>
   *
   * @return array <p>The outcome of callback.</p>
   */
  public static function chr_map($callback, $str)
  {
    // the string is cut into its characters first, then mapped one by one
    return array_map($callback, self::split($str));
  }
969
970
  /**
   * Generates an array of byte length of each character of a Unicode string.
   *
   * 1 byte => U+0000  - U+007F
   * 2 byte => U+0080  - U+07FF
   * 3 byte => U+0800  - U+FFFF
   * 4 byte => U+10000 - U+10FFFF
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return array <p>An array of byte lengths of each character, empty for an empty string.</p>
   */
  public static function chr_size_list($str)
  {
    $str = (string)$str;

    // compare against the empty string explicitly: a loose "!$str" check would
    // also discard the perfectly valid one-character string "0"
    if ($str === '') {
      return array();
    }

    return array_map('strlen', self::split($str));
  }
990
991 1
  /**
   * Get a decimal code representation of a specific character.
   *
   * The lead byte announces how many bytes belong to the character, the payload bits
   * of the following bytes are then appended six at a time. The single bytes are read
   * with the native ord(), since only raw byte values are inspected here.
   *
   * Incomplete input does not raise "uninitialized string offset" errors anymore:
   * an empty string results in 0 and continuation bytes which are missing from a
   * truncated sequence contribute zero bits.
   *
   * @param string $char <p>The input character.</p>
   *
   * @return int <p>The decoded value, 0 for an empty string.</p>
   */
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;
    if ($char === '') {
      return 0;
    }

    $code = ord($char[0]);

    if (($code & 0x80) === 0) {
      // 0xxxxxxx
      return $code;
    }

    $bytes = 1;
    if (($code & 0xe0) === 0xc0) {
      // 110xxxxx
      $bytes = 2;
      $code &= ~0xc0;
    } elseif (($code & 0xf0) === 0xe0) {
      // 1110xxxx
      $bytes = 3;
      $code &= ~0xe0;
    } elseif (($code & 0xf8) === 0xf0) {
      // 11110xxx
      $bytes = 4;
      $code &= ~0xf0;
    }

    for ($i = 1; $i < $bytes; $i++) {
      // 10xxxxxx (a byte missing from a truncated sequence counts as zero)
      $next = isset($char[$i]) ? ord($char[$i]) : 0;
      $code = ($code << 6) + ($next & ~0x80);
    }

    return $code;
  }
1030
1031 1
  /**
   * Get the hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * @param string $char <p>The input character.</p>
   * @param string $pfix [optional] <p>Passed on to UTF8::int_to_hex() as its second argument.</p>
   *
   * @return string <p>The code point encoded as U+xxxx.</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    // first the numeric code of the character, afterwards its hex notation
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1043
1044
  /**
   * Splits a string into smaller chunks and joins them with the specified line ending.
   *
   * The line ending is placed between the chunks only; nothing is appended after
   * the last chunk.
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) to be inserted between the chunks.</p>
   *
   * @return string <p>The chunked string.</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    return implode($end, $chunks);
  }
1057
1058
  /**
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
   *
   * Well-formed byte sequences are kept; stray bytes matched by the pattern are dropped.
   * Afterwards the replacement character and invisible characters are removed, and the
   * optional normalization steps run in the order: whitespace, MS Word chars, BOM.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-formed sequences and is written back; the
    // other two alternatives match single stray bytes, which are replaced by nothing.
    $pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';
    $str = preg_replace($pattern, '$1', $str);

    $str = self::remove_invisible_characters(
        self::replace_diamond_question_mark($str, '')
    );

    // Each optional step is enabled only by a strict boolean true.
    if (true === $normalize_whitespace) {
      $str = self::normalize_whitespace($str, $keep_non_breaking_space);
    }

    if (true === $normalize_msword) {
      $str = self::normalize_msword($str);
    }

    if (true === $remove_bom) {
      $str = self::removeBOM($str);
    }

    return $str;
  }
1106 4
1107
  /**
   * Clean up a string so that only printable UTF-8 chars remain + fix UTF-8 encoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The cleaned string; an empty string for empty input.</p>
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // fix ISO <-> UTF-8 errors first
    $repaired = self::fix_simple_utf8($str);

    // then let clean() remove non UTF-8 bytes, the diamond question mark and
    // invisible characters, with these options:
    //   remove BOM              => true
    //   normalize whitespace    => true
    //   normalize MS Word chars => false
    //   keep non-breaking space => true
    return (string)self::clean($repaired, true, true, false, true);
  }
1134 5
1135
  /**
   * Accepts a string or an array of strings and returns an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If true, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    $class = '\\voku\\helper\\UTF8';

    // A string is split into characters first; anything else is mapped as given.
    $chars = is_string($arg) ? self::split($arg) : $arg;

    $points = array_map(array($class, 'ord'), $chars);

    if (!$u_style) {
      return $points;
    }

    // int_to_hex() receives only the code point here, so its default prefix applies.
    return array_map(array($class, 'int_to_hex'), $points);
  }
1172 1
1173 1
  /**
   * Returns the count of each character used in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array with the characters as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // One array element per character, then tally identical elements.
    $chars = self::split($str, 1, $cleanUtf8);

    return array_count_values($chars);
  }
1186
1187
  /**
   * Get a UTF-8 character from its decimal code representation.
   *
   * @param int $code <p>The code point as an integer.</p>
   *
   * @return string <p>The character, produced by decoding a hexadecimal HTML entity.</p>
   */
  public static function decimal_to_chr($code)
  {
    // Build the numeric entity (e.g. "&#x41;") and let mbstring decode it.
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1202 1
1203 1
  /**
   * Encode a string with a new charset-encoding.
   *
   * INFO:  The difference to "UTF8::utf8_encode()" is that this function also tries to fix broken / double encoding,
   *        so you can call this function on a UTF-8 string as well without messing the string up.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>Force the new encoding (we try to fix broken / double encoding for UTF-8)<br
   *                         /> otherwise we auto-detect the current string-encoding</p>
   *
   * @return string <p>The converted string, or the input string when nothing was converted.</p>
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    // Both the text and the target encoding must be non-empty.
    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $encodingDetected = self::str_detect_encoding($str);

    // Without a detected source encoding nothing is converted.
    if (!$encodingDetected) {
      return $str;
    }

    $forced = ($force === true);

    // Not forced and already in the target encoding: leave the string alone.
    if (!$forced && $encodingDetected === $encoding) {
      return $str;
    }

    if ($encoding === 'UTF-8') {
      if ($forced || in_array($encodingDetected, array('UTF-8', 'WINDOWS-1252', 'ISO-8859-1'), true)) {
        return self::to_utf8($str);
      }
    } elseif ($encoding === 'ISO-8859-1') {
      if ($forced || in_array($encodingDetected, array('ISO-8859-1', 'UTF-8'), true)) {
        return self::to_iso8859($str);
      }
    }

    // Every other combination goes through mbstring.
    $converted = \mb_convert_encoding($str, $encoding, $encodingDetected);

    // A falsy conversion result falls back to the unconverted input.
    return $converted ? $converted : $str;
  }
1279
1280
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>Name of the file to read.</p>
   * @param int|null      $flags         [optional] <p>Handed to the native function as its second argument
   *                                     (converted to bool, i.e. "use include path"); e.g. FILE_USE_INCLUDE_PATH.</p>
   * @param resource|null $context       [optional] <p>A valid context resource created with
   *                                     stream_context_create. If null and $timeout is non-zero,
   *                                     a context with an "http" timeout is created.</p>
   * @param int|null      $offset        [optional] <p>The offset where the reading starts.</p>
   * @param int|null      $maxlen        [optional] <p>Maximum length of data read. Only used when it is an
   *                                     integer; otherwise the file is read until its end.</p>
   * @param int           $timeout       <p>The time in seconds for the timeout (only applied when no
   *                                     $context is given).</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The function returns the read data or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;

    // NOTE(review): FILTER_SANITIZE_STRING rewrites the given path and is deprecated as of
    // PHP 8.1; filter_var() may also return false. Kept as-is to preserve current behavior.
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // The native function declares these parameters as bool / int. Convert the
    // null defaults explicitly instead of handing null over (deprecated since PHP 8.1);
    // the resulting values are the same ones the implicit conversion produced.
    $useIncludePath = (bool)$flags;
    $offset = (int)$offset;

    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      // Convert without forcing (encoding is auto-detected), then sanitize.
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1397 9
1398 9
  /**
   * Checks if a file starts with BOM (Byte Order Mark) character.
   *
   * The whole file is read into memory for the check.
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise
   *              (including when the file could not be read).</p>
   */
  public static function file_has_bom($file_path)
  {
    $contents = file_get_contents($file_path);

    // A failed read yields false rather than a string; report "no BOM" explicitly
    // instead of passing a boolean on to the string check.
    if ($contents === false) {
      return false;
    }

    return self::string_has_bom($contents);
  }
1409 9
1410 8
  /**
   * Normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * Arrays and objects are processed recursively (element by element / property by
   * property); strings are normalized; every other type is returned untouched.
   *
   * @param mixed  $var
   * @param int    $normalization_form
   * @param string $leading_combining <p>Prepended when the result starts with a combining mark.</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    $type = gettype($var);

    if ($type === 'array') {
      foreach ($var as $key => $value) {
        $var[$key] = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type === 'object') {
      // The properties are overwritten on the object that was passed in.
      foreach ($var as $key => $value) {
        $var->{$key} = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type !== 'string') {
      return $var;
    }

    if (false !== strpos($var, "\r")) {
      // Workaround https://bugs.php.net/65732
      $var = str_replace(array("\r\n", "\r"), "\n", $var);
    }

    // Pure ASCII needs no normalization.
    if (self::is_ascii($var) !== false) {
      return $var;
    }

    if (\Normalizer::isNormalized($var, $normalization_form)) {
      // Placeholder so that the isset($n[0]) check below succeeds.
      $n = '-';
    } else {
      $n = \Normalizer::normalize($var, $normalization_form);

      // Use the normalized text when there is any; otherwise re-encode the input.
      $var = isset($n[0]) ? $n : self::encode('UTF-8', $var);
    }

    $startsWithCombiningMark = $var[0] >= "\x80"
                               && isset($n[0], $leading_combining[0])
                               && preg_match('/^\p{Mn}/u', $var);

    if ($startsWithCombiningMark) {
      // Prevent leading combining chars
      // for NFC-safe concatenations.
      $var = $leading_combining . $var;
    }

    return $var;
  }
1469
1470
  /**
   * "filter_input()"-wrapper which normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param int    $type
   * @param string $var
   * @param int    $filter
   * @param mixed  $option
   *
   * @return mixed <p>The native result after it went through UTF8::filter().</p>
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    // $option is forwarded only when the caller actually supplied it.
    if (func_num_args() >= 4) {
      $value = filter_input($type, $var, $filter, $option);
    } else {
      $value = filter_input($type, $var, $filter);
    }

    return self::filter($value);
  }
1490
1491 1
  /**
   * "filter_input_array()"-wrapper which normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param int   $type
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed <p>The native result after it went through UTF8::filter().</p>
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    // With only $type given, the native defaults are used; otherwise both
    // optional arguments are forwarded.
    if (func_num_args() >= 2) {
      $values = filter_input_array($type, $definition, $add_empty);
    } else {
      $values = filter_input_array($type);
    }

    return self::filter($values);
  }
1510
1511 1
  /**
   * "filter_var()"-wrapper which normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param mixed $var
   * @param int   $filter
   * @param mixed $option
   *
   * @return mixed <p>The native result after it went through UTF8::filter().</p>
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    // $option is forwarded only when the caller actually supplied it.
    if (func_num_args() >= 3) {
      $value = filter_var($var, $filter, $option);
    } else {
      $value = filter_var($var, $filter);
    }

    return self::filter($value);
  }
1530 1
1531
  /**
   * "filter_var_array()"-wrapper which normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * @param array $data
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed <p>The native result after it went through UTF8::filter().</p>
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    // With only $data given, the native defaults are used; otherwise both
    // optional arguments are forwarded.
    if (func_num_args() >= 2) {
      $values = filter_var_array($data, $definition, $add_empty);
    } else {
      $values = filter_var_array($data);
    }

    return self::filter($values);
  }
1550
1551 7
  /**
   * Check that the number of unicode characters is not more than the specified integer.
   *
   * @param string $str      <p>The original string to be checked.</p>
   * @param int    $box_size <p>The size in number of chars to be checked against string.</p>
   *
   * @return bool <p>true if the string length is less than or equal to $box_size, false otherwise.</p>
   */
  public static function fits_inside($str, $box_size)
  {
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1563
1564
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The string with every key of the broken-to-fixed table replaced by its value.</p>
   */
  public static function fix_simple_utf8($str)
  {
    // Search / replace lists, derived from the lookup table on first use and
    // kept across calls.
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($search === null) {
      $search = array_keys(self::$brokenUtf8ToUtf8);
      $replace = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($search, $replace, $str);
  }
1595
1596
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * Arrays are processed element by element (recursively, keys are kept). A string is
   * decoded and re-encoded repeatedly until a pass no longer changes it.
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed
   */
  public static function fix_utf8($str)
  {
    if (is_array($str)) {
      $fixed = array();

      foreach ($str as $key => $value) {
        $fixed[$key] = self::fix_utf8($value);
      }

      return $fixed;
    }

    $previous = '';

    while (true) {
      // Stable result reached (an empty string is stable from the start).
      if ($previous === $str) {
        break;
      }

      $previous = $str;
      $str = self::to_utf8(self::utf8_decode($str));
    }

    return $str;
  }
1624
1625 1
  /**
   * Get the text direction of a specific character.
   *
   * When "intlChar" support was detected, IntlChar::charDirection() is asked
   * first. If it is not available, or if it reports a value that is in neither
   * of the two lists below, a built-in table of code point ranges decides.
   *
   * @param string $char
   *
   * @return string <p>'RTL' or 'LTR'</p>
   */
  public static function getCharDirection($char)
  {
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      // from "IntlChar"-Class
      $ltrValues = array(0, 11, 12, 20);
      $rtlValues = array(1, 13, 14, 15, 21);

      $direction = \IntlChar::charDirection($char);

      if (in_array($direction, $ltrValues, true)) {
        return 'LTR';
      }

      if (in_array($direction, $rtlValues, true)) {
        return 'RTL';
      }

      // any other value falls through to the table lookup below
    }

    $c = self::chr_to_decimal($char);

    // every code point outside of 0x5be .. 0x10b7f is reported as 'LTR'
    if (!($c >= 0x5be && $c <= 0x10b7f)) {
      return 'LTR';
    }

    if ($c <= 0x85e) {

      $isRtl = (
          $c === 0x5be ||
          $c === 0x5c0 ||
          $c === 0x5c3 ||
          $c === 0x5c6 ||
          ($c >= 0x5d0 && $c <= 0x5ea) ||
          ($c >= 0x5f0 && $c <= 0x5f4) ||
          $c === 0x608 ||
          $c === 0x60b ||
          $c === 0x60d ||
          $c === 0x61b ||
          ($c >= 0x61e && $c <= 0x64a) ||
          ($c >= 0x66d && $c <= 0x66f) ||
          ($c >= 0x671 && $c <= 0x6d5) ||
          ($c >= 0x6e5 && $c <= 0x6e6) ||
          ($c >= 0x6ee && $c <= 0x6ef) ||
          ($c >= 0x6fa && $c <= 0x70d) ||
          $c === 0x710 ||
          ($c >= 0x712 && $c <= 0x72f) ||
          ($c >= 0x74d && $c <= 0x7a5) ||
          $c === 0x7b1 ||
          ($c >= 0x7c0 && $c <= 0x7ea) ||
          ($c >= 0x7f4 && $c <= 0x7f5) ||
          $c === 0x7fa ||
          ($c >= 0x800 && $c <= 0x815) ||
          $c === 0x81a ||
          $c === 0x824 ||
          $c === 0x828 ||
          ($c >= 0x830 && $c <= 0x83e) ||
          ($c >= 0x840 && $c <= 0x858) ||
          $c === 0x85e
      );

    } elseif ($c === 0x200f) {

      $isRtl = true;

    } elseif ($c >= 0xfb1d) {

      $isRtl = (
          $c === 0xfb1d ||
          ($c >= 0xfb1f && $c <= 0xfb28) ||
          ($c >= 0xfb2a && $c <= 0xfb36) ||
          ($c >= 0xfb38 && $c <= 0xfb3c) ||
          $c === 0xfb3e ||
          ($c >= 0xfb40 && $c <= 0xfb41) ||
          ($c >= 0xfb43 && $c <= 0xfb44) ||
          ($c >= 0xfb46 && $c <= 0xfbc1) ||
          ($c >= 0xfbd3 && $c <= 0xfd3d) ||
          ($c >= 0xfd50 && $c <= 0xfd8f) ||
          ($c >= 0xfd92 && $c <= 0xfdc7) ||
          ($c >= 0xfdf0 && $c <= 0xfdfc) ||
          ($c >= 0xfe70 && $c <= 0xfe74) ||
          ($c >= 0xfe76 && $c <= 0xfefc) ||
          ($c >= 0x10800 && $c <= 0x10805) ||
          $c === 0x10808 ||
          ($c >= 0x1080a && $c <= 0x10835) ||
          ($c >= 0x10837 && $c <= 0x10838) ||
          $c === 0x1083c ||
          ($c >= 0x1083f && $c <= 0x10855) ||
          ($c >= 0x10857 && $c <= 0x1085f) ||
          ($c >= 0x10900 && $c <= 0x1091b) ||
          ($c >= 0x10920 && $c <= 0x10939) ||
          $c === 0x1093f ||
          $c === 0x10a00 ||
          ($c >= 0x10a10 && $c <= 0x10a13) ||
          ($c >= 0x10a15 && $c <= 0x10a17) ||
          ($c >= 0x10a19 && $c <= 0x10a33) ||
          ($c >= 0x10a40 && $c <= 0x10a47) ||
          ($c >= 0x10a50 && $c <= 0x10a58) ||
          ($c >= 0x10a60 && $c <= 0x10a7f) ||
          ($c >= 0x10b00 && $c <= 0x10b35) ||
          ($c >= 0x10b40 && $c <= 0x10b55) ||
          ($c >= 0x10b58 && $c <= 0x10b72) ||
          ($c >= 0x10b78 && $c <= 0x10b7f)
      );

    } else {

      // between 0x85f and 0xfb1c only 0x200f is listed as right-to-left
      $isRtl = false;

    }

    return $isRtl ? 'RTL' : 'LTR';
  }
1744 1
1745
  /**
   * Load a data file: "<directory of this class>/data/<$file>.php".
   *
   * @param string $file <p>Name of the data file, without directory and without the ".php" extension.</p>
   *
   * @return bool|string|array|int <p>Whatever the required file returns, or false if the file does not exist.</p>
   */
  private static function getData($file)
  {
    $path = __DIR__ . '/data/' . $file . '.php';

    if (!file_exists($path)) {
      return false;
    }

    /** @noinspection PhpIncludeInspection */
    return require $path;
  }
1762
1763 1
  /**
   * Converts hexadecimal U+xxxx code point representation to integer.
   *
   * Accepted forms are "U+xxxx", "\uxxxx" and the bare hex digits, each with
   * four to six hexadecimal digits (case-insensitive).
   *
   * INFO: opposite to UTF8::int_to_hex()
   *
   * @param string $str <p>The hexadecimal code point representation.</p>
   *
   * @return int|false <p>The code point, or false on failure.</p>
   */
  public static function hex_to_int($str)
  {
    if (!$str) {
      return false;
    }

    // Only real hex digits (0-9, a-f) are allowed: with a wider character
    // class an input such as "U+zzzz" would match and intval() would then
    // silently turn it into 0 instead of reporting a failure.
    if (preg_match('/^(?:\\\u|U\+|)([a-f0-9]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
1784 1
1785 1
  /**
   * Alias for "UTF8::html_entity_decode()".
   *
   * All three arguments are forwarded unchanged and the result of that call
   * is returned.
   *
   * @see UTF8::html_entity_decode()
   *
   * @param string $str
   * @param int    $flags
   * @param string $encoding
   *
   * @return string
   */
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    return self::html_entity_decode($str, $flags, $encoding);
  }
1800
1801
  /**
   * Converts a UTF-8 string to a series of HTML numbered entities.
   *
   * INFO: opposite to UTF8::html_decode()
   *
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
   *
   * @return string <p>HTML numbered entities.</p>
   */
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // with "$keepAsciiChars" the conversion starts at the first non-ASCII code point
      $startCode = 0x00;
      if ($keepAsciiChars === true) {
        $startCode = 0x80;
      }

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = (first code point, last code point, offset, mask)
      //
      // The range has to reach up to U+10FFFF: with an upper bound of 0xffff
      // every character outside of the BMP (4-byte UTF-8, e.g. emoji) would
      // be left unencoded.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // fallback without "mbstring": encode char by char ("$encoding" is not used on this path)
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1842
1843
  /**
1844
   * UTF-8 version of html_entity_decode()
1845
   *
1846
   * The reason we are not using html_entity_decode() by itself is because
1847
   * while it is not technically correct to leave out the semicolon
1848
   * at the end of an entity most browsers will still interpret the entity
1849
   * correctly. html_entity_decode() does not convert entities without
1850
   * semicolons, so we are left with our own little solution here. Bummer.
1851
   *
1852
   * Convert all HTML entities to their applicable characters
1853
   *
1854
   * INFO: opposite to UTF8::html_encode()
1855
   *
1856
   * @link http://php.net/manual/en/function.html-entity-decode.php
1857
   *
1858
   * @param string $str      <p>
1859
   *                         The input string.
1860
   *                         </p>
1861
   * @param int    $flags    [optional] <p>
1862
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1863
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1864
   *                         <table>
1865
   *                         Available <i>flags</i> constants
1866
   *                         <tr valign="top">
1867
   *                         <td>Constant Name</td>
1868
   *                         <td>Description</td>
1869
   *                         </tr>
1870
   *                         <tr valign="top">
1871
   *                         <td><b>ENT_COMPAT</b></td>
1872
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1873
   *                         </tr>
1874
   *                         <tr valign="top">
1875
   *                         <td><b>ENT_QUOTES</b></td>
1876
   *                         <td>Will convert both double and single quotes.</td>
1877 18
   *                         </tr>
1878
   *                         <tr valign="top">
1879 18
   *                         <td><b>ENT_NOQUOTES</b></td>
1880
   *                         <td>Will leave both double and single quotes unconverted.</td>
1881 18
   *                         </tr>
1882 6
   *                         <tr valign="top">
1883
   *                         <td><b>ENT_HTML401</b></td>
1884
   *                         <td>
1885 18
   *                         Handle code as HTML 4.01.
1886 7
   *                         </td>
1887
   *                         </tr>
1888
   *                         <tr valign="top">
1889 18
   *                         <td><b>ENT_XML1</b></td>
1890 1
   *                         <td>
1891 1
   *                         Handle code as XML 1.
1892
   *                         </td>
1893 18
   *                         </tr>
1894 4
   *                         <tr valign="top">
1895 4
   *                         <td><b>ENT_XHTML</b></td>
1896 4
   *                         <td>
1897
   *                         Handle code as XHTML.
1898
   *                         </td>
1899 4
   *                         </tr>
1900
   *                         <tr valign="top">
1901
   *                         <td><b>ENT_HTML5</b></td>
1902 18
   *                         <td>
1903
   *                         Handle code as HTML 5.
1904 18
   *                         </td>
1905 18
   *                         </tr>
1906
   *                         </table>
1907 16
   *                         </p>
1908
   * @param string $encoding [optional] <p>Encoding to use.</p>
1909 16
   *
1910 15
   * @return string <p>The decoded string.</p>
1911
   */
1912 7
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Strings with less than four bytes and strings without any "&" are
    // returned unchanged.
    //
    // A ";" is deliberately NOT required here: numeric entities with a
    // missing semicolon are repaired in the loop below, and bailing out on
    // "no semicolon at all" would skip exactly those inputs.
    if (!isset($str[3]) || strpos($str, '&') === false) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($flags === null) {
      // ENT_HTML5 is only used if Bootup::is_php('5.4') reports true
      if (Bootup::is_php('5.4') === true) {
        $flags = ENT_COMPAT | ENT_HTML5;
      } else {
        $flags = ENT_COMPAT;
      }
    }

    // decode until a pass no longer changes the string (handles multiple encoded input)
    do {
      $str_compare = $str;

      // Decimal entities with 2-5 digits; an entity that decodes to a quote
      // character is kept as it is, so that "$flags" decides about it in the
      // html_entity_decode() call below.
      $str = preg_replace_callback(
          "/&#\d{2,5};/",
          function ($matches) {
            $returnTmp = \mb_convert_encoding($matches[0], 'UTF-8', 'HTML-ENTITIES');

            if ($returnTmp !== '"' && $returnTmp !== "'") {
              return $returnTmp;
            }

            return $matches[0];
          },
          $str
      );

      // decode numeric & UTF16 two byte entities
      // (the inner preg_replace() appends the missing ";" to numeric entities)
      $str = html_entity_decode(
          preg_replace('/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS', '$1;', $str),
          $flags,
          $encoding
      );

    } while ($str_compare !== $str);

    return $str;
  }
1972
1973
  /**
1974
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1975
   *
1976
   * @link http://php.net/manual/en/function.htmlentities.php
1977
   *
1978
   * @param string $str           <p>
1979
   *                              The input string.
1980
   *                              </p>
1981
   * @param int    $flags         [optional] <p>
1982
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
1983
   *                              invalid code unit sequences and the used document type. The default is
1984
   *                              ENT_COMPAT | ENT_HTML401.
1985
   *                              <table>
1986
   *                              Available <i>flags</i> constants
1987
   *                              <tr valign="top">
1988
   *                              <td>Constant Name</td>
1989
   *                              <td>Description</td>
1990
   *                              </tr>
1991
   *                              <tr valign="top">
1992
   *                              <td><b>ENT_COMPAT</b></td>
1993
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
1994
   *                              </tr>
1995
   *                              <tr valign="top">
1996
   *                              <td><b>ENT_QUOTES</b></td>
1997
   *                              <td>Will convert both double and single quotes.</td>
1998
   *                              </tr>
1999
   *                              <tr valign="top">
2000
   *                              <td><b>ENT_NOQUOTES</b></td>
2001
   *                              <td>Will leave both double and single quotes unconverted.</td>
2002
   *                              </tr>
2003
   *                              <tr valign="top">
2004
   *                              <td><b>ENT_IGNORE</b></td>
2005
   *                              <td>
2006
   *                              Silently discard invalid code unit sequences instead of returning
2007
   *                              an empty string. Using this flag is discouraged as it
2008
   *                              may have security implications.
2009
   *                              </td>
2010
   *                              </tr>
2011
   *                              <tr valign="top">
2012
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2013
   *                              <td>
2014
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2015
   *                              U+FFFD (UTF-8) or &#xFFFD; (otherwise) instead of returning an empty string.
2016
   *                              </td>
2017
   *                              </tr>
2018
   *                              <tr valign="top">
2019
   *                              <td><b>ENT_DISALLOWED</b></td>
2020
   *                              <td>
2021
   *                              Replace invalid code points for the given document type with a
2022
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#xFFFD;
2023
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2024
   *                              instance, to ensure the well-formedness of XML documents with
2025
   *                              embedded external content.
2026
   *                              </td>
2027
   *                              </tr>
2028
   *                              <tr valign="top">
2029
   *                              <td><b>ENT_HTML401</b></td>
2030
   *                              <td>
2031
   *                              Handle code as HTML 4.01.
2032
   *                              </td>
2033 2
   *                              </tr>
2034
   *                              <tr valign="top">
2035 2
   *                              <td><b>ENT_XML1</b></td>
2036 1
   *                              <td>
2037 1
   *                              Handle code as XML 1.
2038
   *                              </td>
2039 2
   *                              </tr>
2040
   *                              <tr valign="top">
2041 2
   *                              <td><b>ENT_XHTML</b></td>
2042 1
   *                              <td>
2043
   *                              Handle code as XHTML.
2044
   *                              </td>
2045 2
   *                              </tr>
2046 2
   *                              <tr valign="top">
2047 2
   *                              <td><b>ENT_HTML5</b></td>
2048 2
   *                              <td>
2049 2
   *                              Handle code as HTML 5.
2050 1
   *                              </td>
2051
   *                              </tr>
2052 1
   *                              </table>
2053 1
   *                              </p>
2054 1
   * @param string $encoding      [optional] <p>
2055 1
   *                              Like <b>htmlspecialchars</b>,
2056 1
   *                              <b>htmlentities</b> takes an optional third argument
2057 2
   *                              <i>encoding</i> which defines encoding used in
2058
   *                              conversion.
2059 2
   *                              Although this argument is technically optional, you are highly
2060
   *                              encouraged to specify the correct value for your code.
2061
   *                              </p>
2062
   * @param bool   $double_encode [optional] <p>
2063
   *                              When <i>double_encode</i> is turned off PHP will not
2064
   *                              encode existing html entities. The default is to convert everything.
2065
   *                              </p>
2066
   *
2067
   *
2068
   * @return string the encoded string.
2069
   * </p>
2070
   * <p>
2071
   * If the input <i>string</i> contains an invalid code unit
2072
   * sequence within the given <i>encoding</i> an empty string
2073
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2074
   * <b>ENT_SUBSTITUTE</b> flags are set.
2075
   */
2076
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ('UTF-8' !== $encoding) {
      $encoding = self::normalize_encoding($encoding);
    }

    $encoded = htmlentities($str, $flags, $encoding, $double_encode);

    // the extra pass below is only done for UTF-8 (checked after normalization)
    if ('UTF-8' !== $encoding) {
      return $encoded;
    }

    // Every character that is still 3 or more bytes long after the native
    // htmlentities() call is replaced by its numbered entity.
    $search = array();
    $replacements = array();
    foreach (self::chr_size_list($encoded) as $position => $size) {
      if ($size < 3) {
        continue;
      }

      $char = self::access($encoded, $position);

      // each distinct character is encoded only once
      if (isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $encoded);
  }
2104
2105
  /**
2106
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2107
   *
2108
   * INFO: Take a look at "UTF8::htmlentities()"
2109
   *
2110
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2111
   *
2112
   * @param string $str           <p>
2113
   *                              The string being converted.
2114
   *                              </p>
2115
   * @param int    $flags         [optional] <p>
2116
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2117
   *                              invalid code unit sequences and the used document type. The default is
2118
   *                              ENT_COMPAT | ENT_HTML401.
2119
   *                              <table>
2120
   *                              Available <i>flags</i> constants
2121
   *                              <tr valign="top">
2122
   *                              <td>Constant Name</td>
2123
   *                              <td>Description</td>
2124
   *                              </tr>
2125
   *                              <tr valign="top">
2126
   *                              <td><b>ENT_COMPAT</b></td>
2127
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2128
   *                              </tr>
2129
   *                              <tr valign="top">
2130
   *                              <td><b>ENT_QUOTES</b></td>
2131
   *                              <td>Will convert both double and single quotes.</td>
2132
   *                              </tr>
2133
   *                              <tr valign="top">
2134
   *                              <td><b>ENT_NOQUOTES</b></td>
2135
   *                              <td>Will leave both double and single quotes unconverted.</td>
2136
   *                              </tr>
2137
   *                              <tr valign="top">
2138
   *                              <td><b>ENT_IGNORE</b></td>
2139
   *                              <td>
2140
   *                              Silently discard invalid code unit sequences instead of returning
2141
   *                              an empty string. Using this flag is discouraged as it
2142
   *                              may have security implications.
2143
   *                              </td>
2144
   *                              </tr>
2145
   *                              <tr valign="top">
2146
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2147
   *                              <td>
2148
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2149
   *                              U+FFFD (UTF-8) or &#xFFFD; (otherwise) instead of returning an empty string.
2150
   *                              </td>
2151
   *                              </tr>
2152
   *                              <tr valign="top">
2153
   *                              <td><b>ENT_DISALLOWED</b></td>
2154
   *                              <td>
2155
   *                              Replace invalid code points for the given document type with a
2156
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#xFFFD;
2157
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2158
   *                              instance, to ensure the well-formedness of XML documents with
2159
   *                              embedded external content.
2160
   *                              </td>
2161
   *                              </tr>
2162
   *                              <tr valign="top">
2163
   *                              <td><b>ENT_HTML401</b></td>
2164
   *                              <td>
2165
   *                              Handle code as HTML 4.01.
2166
   *                              </td>
2167
   *                              </tr>
2168
   *                              <tr valign="top">
2169
   *                              <td><b>ENT_XML1</b></td>
2170
   *                              <td>
2171 1
   *                              Handle code as XML 1.
2172
   *                              </td>
2173 1
   *                              </tr>
2174
   *                              <tr valign="top">
2175
   *                              <td><b>ENT_XHTML</b></td>
2176
   *                              <td>
2177 1
   *                              Handle code as XHTML.
2178
   *                              </td>
2179
   *                              </tr>
2180
   *                              <tr valign="top">
2181
   *                              <td><b>ENT_HTML5</b></td>
2182
   *                              <td>
2183
   *                              Handle code as HTML 5.
2184
   *                              </td>
2185 1
   *                              </tr>
2186
   *                              </table>
2187 1
   *                              </p>
2188
   * @param string $encoding      [optional] <p>
2189
   *                              Defines encoding used in conversion.
2190
   *                              </p>
2191
   *                              <p>
2192
   *                              For the purposes of this function, the encodings
2193
   *                              ISO-8859-1, ISO-8859-15,
2194
   *                              UTF-8, cp866,
2195
   *                              cp1251, cp1252, and
2196
   *                              KOI8-R are effectively equivalent, provided the
2197
   *                              <i>string</i> itself is valid for the encoding, as
2198
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2199
   *                              the same positions in all of these encodings.
2200 3
   *                              </p>
2201
   * @param bool   $double_encode [optional] <p>
2202 3
   *                              When <i>double_encode</i> is turned off PHP will not
2203 3
   *                              encode existing html entities, the default is to convert everything.
2204
   *                              </p>
2205 3
   *
2206
   * @return string The converted string.
2207 3
   * </p>
2208
   * <p>
2209
   * If the input <i>string</i> contains an invalid code unit
2210
   * sequence within the given <i>encoding</i> an empty string
2211
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2212
   * <b>ENT_SUBSTITUTE</b> flags are set.
2213
   */
2214
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // 'UTF-8' is used as given, every other name goes through normalize_encoding() first
    $usedEncoding = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return htmlspecialchars($str, $flags, $usedEncoding, $double_encode);
  }
2222
2223
  /**
   * Checks whether the "iconv" extension is loaded.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function iconv_loaded()
  {
    // extension_loaded() already returns a boolean
    return extension_loaded('iconv');
  }
2232
2233
  /**
   * Converts Integer to hexadecimal U+xxxx code point representation.
   *
   * The hex digits are lower-case and left-padded with zeros to at least four
   * digits. Only values consisting of decimal digits are accepted.
   *
   * INFO: opposite to UTF8::hex_to_int()
   *
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
   * @param string $pfix [optional] <p>Prefix that is put in front of the hex digits.</p>
   *
   * @return string <p>The code point, or empty string on failure.</p>
   */
  public static function int_to_hex($int, $pfix = 'U+')
  {
    if (!ctype_digit((string)$int)) {
      return '';
    }

    return $pfix . str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);
  }
2255
2256 1
  /**
   * Checks whether the "IntlChar" class can be used.
   *
   * Both conditions have to hold: Bootup::is_php('7.0') reports true and the
   * class "IntlChar" exists.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function intlChar_loaded()
  {
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar') === true;
  }
2265
2266
  /**
   * Checks whether the "intl" extension is loaded.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function intl_loaded()
  {
    // extension_loaded() already returns a boolean
    return extension_loaded('intl');
  }
2275
2276
  /**
   * Alias for "UTF8::is_ascii()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_ascii()
   *
   * @param string $str
   *
   * @return boolean
   */
  public static function isAscii($str)
  {
    return self::is_ascii($str);
  }
2289
2290
  /**
   * Alias for "UTF8::is_base64()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_base64()
   *
   * @param string $str
   *
   * @return bool
   */
  public static function isBase64($str)
  {
    return self::is_base64($str);
  }
2303
2304
  /**
   * Alias for "UTF8::is_binary()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_binary()
   *
   * @param string $str
   *
   * @return bool
   */
  public static function isBinary($str)
  {
    return self::is_binary($str);
  }
2317
2318
  /**
   * Alias for "UTF8::is_bom()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_bom()
   *
   * @param string $utf8_chr
   *
   * @return boolean
   */
  public static function isBom($utf8_chr)
  {
    return self::is_bom($utf8_chr);
  }
2331
2332
  /**
   * Alias for "UTF8::is_html()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_html()
   *
   * @param string $str
   *
   * @return boolean
   */
  public static function isHtml($str)
  {
    return self::is_html($str);
  }
2345
2346
  /**
   * Alias for "UTF8::is_json()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_json()
   *
   * @param string $str
   *
   * @return bool
   */
  public static function isJson($str)
  {
    return self::is_json($str);
  }
2359
2360
  /**
   * Alias for "UTF8::is_utf16()".
   *
   * The argument is forwarded unchanged and the result of that call is returned.
   *
   * @see UTF8::is_utf16()
   *
   * @param string $str
   *
   * @return int|false false if it is not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.
   */
  public static function isUtf16($str)
  {
    return self::is_utf16($str);
  }
2373
2374
  /**
   * CamelCase alias of "UTF8::is_utf32()".
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str <p>The value that is handed over unchanged to "UTF8::is_utf32()".</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function isUtf32($str)
  {
    $utf32Type = self::is_utf32($str);

    return $utf32Type;
  }
2387 1
2388
  /**
   * CamelCase alias of "UTF8::is_utf8()".
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    <p>The value that is handed over unchanged to "UTF8::is_utf8()".</p>
   * @param bool   $strict <p>Forwarded unchanged as the second argument of "UTF8::is_utf8()".</p>
   *
   * @return bool <p>The result of "UTF8::is_utf8()".</p>
   */
  public static function isUtf8($str, $strict = false)
  {
    $isUtf8 = self::is_utf8($str, $strict);

    return $isUtf8;
  }
2402
2403
  /**
   * Tells whether a string consists of 7 bit ASCII bytes only.
   *
   * The input is cast to string first; an empty string counts as ASCII.
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool <p>
   *              <strong>true</strong> if no byte in the range 0x80-0xFF is present<br />
   *              <strong>false</strong> otherwise
   *              </p>
   */
  public static function is_ascii($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return true;
    }

    // Any byte with the high bit set disqualifies the string.
    return !preg_match('/[\x80-\xFF]/', $str);
  }
2423
2424
  /**
   * Tells whether a string is a canonical base64 encoding.
   *
   * The input is cast to string, decoded in strict mode and encoded again;
   * it only counts as base64 if that round trip reproduces it exactly.
   * An empty string is never treated as base64.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p>Whether or not $str is base64 encoded.</p>
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $decoded = base64_decode($str, true);
    if ($decoded === false) {
      // Strict decoding rejected the input, so it cannot round-trip.
      return false;
    }

    return base64_encode($decoded) === $str;
  }
2445
2446
  /**
   * Heuristic check whether the input looks like binary data.
   *
   * The input counts as binary if it consists solely of the characters "0" and "1",
   * or if it contains at least one NUL byte.
   *
   * @param mixed $input <p>The value to inspect; it is passed to string functions as-is.</p>
   *
   * @return bool <p><strong>true</strong> if one of the heuristics matches, <strong>false</strong> otherwise.</p>
   */
  public static function is_binary($input)
  {
    $byteLength = strlen($input);

    // A textual bit-string such as "0101".
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    if (substr_count($input, "\x00") > 0) {
      return true;
    }

    if (!$byteLength) {
      return false;
    }

    // NOTE(review): substr_count() searches for the literal four bytes "^ -~",
    // not for a character class. As occurrences are counted without overlap the
    // ratio cannot exceed 0.25, so this test looks like it can never succeed.
    // Possibly "[^ -~]" as a regex was intended - confirm before changing.
    return substr_count($input, '^ -~') / $byteLength > 0.3;
  }
2470
2471 1
  /**
   * Check if the file is binary.
   *
   * Reads at most the first 512 bytes of the file and passes them to
   * "UTF8::is_binary()". If the file cannot be opened or read, an empty
   * block is inspected instead.
   *
   * @param string $file <p>Path that is handed to fopen().</p>
   *
   * @return bool <p>The result of "UTF8::is_binary()" for the block that was read.</p>
   */
  public static function is_binary_file($file)
  {
    $block = '';

    try {
      $fp = fopen($file, 'rb');

      // fopen() reports failure by returning false (plus a warning), not by
      // throwing, so the handle has to be checked before it is used.
      if ($fp !== false) {
        $chunk = fread($fp, 512);
        fclose($fp);

        if ($chunk !== false) {
          $block = $chunk;
        }
      }
    } catch (\Exception $e) {
      // Only reached when an error handler turns warnings into exceptions.
      $block = '';
    }

    return self::is_binary($block);
  }
2490
2491
  /**
   * Checks if the given string is equal to any "Byte Order Mark".
   *
   * The string is compared strictly against every key of self::$bom.
   *
   * WARNING: Use "UTF8::string_has_bom()" if you will check BOM in a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is a Byte Order Mark, <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    // Only the keys are compared; the values of self::$bom are not needed here.
    return in_array($str, array_keys(self::$bom), true);
  }
2510
2511 1
  /**
   * Check if the string contains something that looks like an html-tag.
   *
   * The input is cast to string; an empty string never contains html.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the tag pattern matches, <strong>false</strong> otherwise.</p>
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    // Opening or closing tag, optionally with attributes and a self-closing slash.
    $tagPattern = "/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/";

    return preg_match($tagPattern, $str) === 1;
  }
2537 4
2538 4
  /**
   * Try to check if "$str" is a json-string.
   *
   * The input is cast to string and decoded via "UTF8::json_decode()". It counts
   * as JSON if decoding raised no error and produced a structured value, i.e. a
   * JSON object or a JSON array. Scalars such as "123" or "true" and the empty
   * string are not treated as JSON.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    // A top-level JSON array decodes to a PHP array, a JSON object to stdClass.
    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
2563 3
2564 2
  /**
   * Check if the string is UTF-16.
   *
   * A BOM is stripped via "UTF8::remove_bom()" and only input that
   * "UTF8::is_binary()" reports as binary is inspected further. For each byte
   * order the input is converted to UTF-8 and round-tripped; if the round trip
   * is stable, a score is built from "UTF8::count_chars()" of the converted and
   * of the raw string. The byte order with the higher score wins.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-16,<br />
   *                   <strong>1</strong> for UTF-16LE,<br />
   *                   <strong>2</strong> for UTF-16BE.
   *                   </p>
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    $hits = array('UTF-16LE' => 0, 'UTF-16BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // Convert back and forth once more; an unstable result is not scored.
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    // Equal scores give no indication for either byte order.
    if ($hits['UTF-16LE'] === $hits['UTF-16BE']) {
      return false;
    }

    return $hits['UTF-16LE'] > $hits['UTF-16BE'] ? 1 : 2;
  }
2623 1
2624 1
  /**
   * Check if the string is UTF-32.
   *
   * A BOM is stripped via "UTF8::remove_bom()" and only input that
   * "UTF8::is_binary()" reports as binary is inspected further. For each byte
   * order the input is converted to UTF-8 and round-tripped; if the round trip
   * is stable, a score is built from "UTF8::count_chars()" of the converted and
   * of the raw string. The byte order with the higher score wins.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    $hits = array('UTF-32LE' => 0, 'UTF-32BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // Convert back and forth once more; an unstable result is not scored.
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    // Equal scores give no indication for either byte order.
    if ($hits['UTF-32LE'] === $hits['UTF-32BE']) {
      return false;
    }

    return $hits['UTF-32LE'] > $hits['UTF-32BE'] ? 1 : 2;
  }
2683
2684
  /**
   * Checks whether the passed string contains only byte sequences that appear valid UTF-8 characters.
   *
   * The input is cast to string; an empty string counts as valid. With $strict
   * the string is rejected as soon as "UTF8::is_utf16()" or "UTF8::is_utf32()"
   * recognise it. The validation itself uses the PCRE "/u" modifier when
   * "UTF8::pcre_utf8_support()" reports it as available, otherwise a byte-wise
   * state machine.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
   *
   * @return bool
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    if (self::pcre_utf8_support() === true) {

      // If even just the first character can be matched, when the /u
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
      // invalid, nothing at all will match, even if the string contains
      // some valid sequences.
      return (preg_match('/^.{1}/us', $str) === 1);
    }

    // Fallback without PCRE UTF-8 support: validate octet by octet.
    $mState = 0; // number of continuation octets still expected
    $mUcs4 = 0; // code point assembled from the current sequence so far
    $mBytes = 1; // total number of octets of the current sequence
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);

      if ($mState === 0) {
        // Expect either a US-ASCII character or the first octet of a
        // multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          // First octet of 5 octet sequence. Always illegal, but instead of
          // resynchronizing the sequence is consumed and rejected by the
          // length check at its end.
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, handled like the 5 octet case.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
          return false;
        }

        continue;
      }

      // Inside a multi-octet sequence only continuation octets are legal.
      if (0x80 !== (0xC0 & $in)) {
        return false;
      }

      $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);

      if (0 === --$mState) {
        // End of the sequence: $mUcs4 holds the complete code point.
        if (
            // From Unicode 3.1, non-shortest form is illegal.
            (2 === $mBytes && $mUcs4 < 0x0080) ||
            (3 === $mBytes && $mUcs4 < 0x0800) ||
            (4 === $mBytes && $mUcs4 < 0x10000) ||
            (4 < $mBytes) ||
            // From Unicode 3.2, surrogate characters are illegal.
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)
        ) {
          return false;
        }

        // Reset for the next character.
        $mState = 0;
        $mUcs4 = 0;
        $mBytes = 1;
      }
    }

    // A string that ends in the middle of a multi-octet sequence is invalid.
    return $mState === 0;
  }
2828
2829
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Decodes a JSON string.
   *
   * The input is passed through "UTF8::filter()" before it is handed to the
   * native json_decode(). The $options argument is only forwarded when
   * "Bootup::is_php('5.4')" is true.
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>
   *                        The <i>json</i> string being decoded. The native function only
   *                        works with UTF-8 encoded strings.
   *                        </p>
   *                        <p>PHP implements a superset of JSON - it will also encode and
   *                        decode scalar types and <b>NULL</b>. The JSON standard only
   *                        supports these values when they are nested inside an array or
   *                        an object.
   *                        </p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into
   *                        associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>
   *                        User specified recursion depth.
   *                        </p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options, e.g.
   *                        <b>JSON_BIGINT_AS_STRING</b> (default is to cast large
   *                        integers as floats).
   *                        </p>
   *
   * @return mixed the value encoded in <i>json</i> in appropriate PHP type.
   * Values true, false and null (case-insensitive) are returned as <b>TRUE</b>,
   * <b>FALSE</b> and <b>NULL</b> respectively. <b>NULL</b> is returned if the
   * <i>json</i> cannot be decoded or if the encoded data is deeper than the
   * recursion limit.
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    // Older PHP versions are called without the $options argument.
    if (Bootup::is_php('5.4') !== true) {
      return json_decode($filtered, $assoc, $depth);
    }

    return json_decode($filtered, $assoc, $depth, $options);
  }
2877
2878
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Returns the JSON representation of a value.
   *
   * The value is passed through "UTF8::filter()" before it is handed to the
   * native json_encode(). The $depth argument is only forwarded when
   * "Bootup::is_php('5.5')" is truthy.
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The <i>value</i> being encoded. Can be any type except
   *                       a resource. The native function requires all string data
   *                       to be UTF-8 encoded.
   *                       </p>
   *                       <p>PHP implements a superset of JSON - it will also encode and
   *                       decode scalar types and <b>NULL</b>. The JSON standard only
   *                       supports these values when they are nested inside an array or
   *                       an object.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask consisting of <b>JSON_HEX_QUOT</b>,
   *                       <b>JSON_HEX_TAG</b>, <b>JSON_HEX_AMP</b>, <b>JSON_HEX_APOS</b>,
   *                       <b>JSON_NUMERIC_CHECK</b>, <b>JSON_PRETTY_PRINT</b>,
   *                       <b>JSON_UNESCAPED_SLASHES</b>, <b>JSON_FORCE_OBJECT</b>,
   *                       <b>JSON_UNESCAPED_UNICODE</b>. The behaviour of these
   *                       constants is described on the JSON constants page.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Set the maximum depth. Must be greater than zero.
   *                       </p>
   *
   * @return string a JSON encoded string on success or <b>FALSE</b> on failure.
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    if (Bootup::is_php('5.5')) {
      return json_encode($filtered, $options, $depth);
    }

    // Older PHP versions are called without the $depth argument.
    return json_encode($filtered, $options);
  }
2926
2927
  /**
   * Makes string's first char lowercase.
   *
   * The input is cast to string; an empty string is returned unchanged.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function lcfirst($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // self::substr() can yield false; the casts make sure that
    // self::strtolower() and the concatenation only ever see strings.
    $firstChar = (string)self::substr($str, 0, 1);
    $remainder = (string)self::substr($str, 1);

    return self::strtolower($firstChar) . $remainder;
  }
2938
2939
  /**
   * Strip whitespace or other characters from beginning of a UTF-8 string.
   *
   * Without usable $chars every leading character of the Unicode categories
   * "Z" (separators) and "C" (control / other) is removed. Otherwise $chars is
   * turned into a regex character class via "UTF8::rxClass()" and leading
   * matches of that class are removed.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars <p>Optional characters to be stripped</p>
   *
   * @return string <p>The string with unwanted characters stripped from the left.</p>
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // NOTE(review): a falsy $chars such as '0' also ends up in the default
    // branch below, so a lone "0" cannot be used as trim character - confirm
    // whether that is intended.
    if ($chars !== INF && $chars) {
      $charClass = self::rxClass($chars);

      return preg_replace("/^{$charClass}+/u", '', $str);
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+/u', '', $str);
  }
2964
2965
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * An array is joined into one string first. If "UTF8::codepoints()" yields no
   * code points (e.g. for an empty string), an empty string is returned.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the highest code point, or an empty string for empty input.</p>
   */
  public static function max($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // The native max() must not be called with an empty array.
    if (empty($codepoints)) {
      return '';
    }

    return self::chr(max($codepoints));
  }
2980
2981
  /**
   * Calculates and returns the maximum number of bytes taken by any
   * UTF-8 encoded character in the given string.
   *
   * The sizes are taken from "UTF8::chr_size_list()"; if that list is empty
   * the result is 0.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>Max byte lengths of the given chars.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    return count($sizes) > 0 ? (int)max($sizes) : 0;
  }
2998
2999
  /**
   * Checks whether mbstring is available on the server.
   *
   * Side effect: when the extension is loaded, the mbstring internal encoding
   * is set to UTF-8.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function mbstring_loaded()
  {
    if (extension_loaded('mbstring') !== true) {
      return false;
    }

    \mb_internal_encoding('UTF-8');

    return true;
  }
3014
3015 4
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * An array is joined into one string first. If "UTF8::codepoints()" yields no
   * code points (e.g. for an empty string), an empty string is returned.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the lowest code point, or an empty string for empty input.</p>
   */
  public static function min($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // The native min() must not be called with an empty array.
    if (empty($codepoints)) {
      return '';
    }

    return self::chr(min($codepoints));
  }
3030 2
3031 2
  /**
   * CamelCase alias of "UTF8::normalize_encoding()".
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>The value that is handed over unchanged to "UTF8::normalize_encoding()".</p>
   *
   * @return string|false <p>The result of "UTF8::normalize_encoding()".</p>
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
3044 2
3045 2
  /**
   * Normalize the encoding-"name" input.
   *
   * "UTF-8" and every entry of self::$iconvEncoding are returned untouched.
   * Anything else is upper-cased and, after removing all characters except
   * letters, digits and whitespace, looked up in a table of known spellings.
   * Unknown names are returned in upper case. Results are cached per request
   * under the originally passed name.
   *
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
   *
   * @return string|false <p>
   *                      e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.<br />
   *                      <strong>false</strong> if $encoding is empty.
   *                      </p>
   */
  public static function normalize_encoding($encoding)
  {
    static $staticNormalizeEncodingCache = array();

    if (!$encoding) {
      return false;
    }

    // Already canonical names need neither a lookup nor a cache entry.
    if ('UTF-8' === $encoding || in_array($encoding, self::$iconvEncoding, true)) {
      return $encoding;
    }

    if (isset($staticNormalizeEncodingCache[$encoding])) {
      return $staticNormalizeEncodingCache[$encoding];
    }

    $equivalences = array(
        'ISO88591'    => 'ISO-8859-1',
        'ISO8859'     => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'WIN1252'     => 'ISO-8859-1',
        'WINDOWS1252' => 'ISO-8859-1',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    );

    $upperName = strtoupper($encoding);
    $lookupKey = preg_replace('/[^a-zA-Z0-9\s]/', '', $upperName);

    // Fall back to the upper-cased name when the spelling is not in the table.
    $normalized = isset($equivalences[$lookupKey]) ? $equivalences[$lookupKey] : $upperName;

    $staticNormalizeEncodingCache[$encoding] = $normalized;

    return $normalized;
  }
3101
3102 7
  /**
   * Normalize some MS Word special characters.
   *
   * Every key of self::$utf8MSWord found in the input is replaced by its mapped value.
   *
   * @param string $str <p>The string to be normalized.</p>
   *
   * @return string
   */
  public static function normalize_msword($str)
  {
    // [0] => search strings, [1] => replacements; built once per request
    static $replacementPairs = null;

    if ($replacementPairs === null) {
      $replacementPairs = array(
          array_keys(self::$utf8MSWord),
          array_values(self::$utf8MSWord),
      );
    }

    return str_replace($replacementPairs[0], $replacementPairs[1], $str);
  }
3121
3122
  /**
   * Normalize the whitespace.
   *
   * Every entry of self::$whitespaceTable found in the input is replaced by a plain space;
   * unless requested otherwise, the entries of self::$bidiUniCodeControlsTable are removed first.
   *
   * @param string $str                     <p>The string to be normalized.</p>
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
   *                                        bidirectional text chars.</p>
   *
   * @return string
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // The cache slot must be derived from the very same strict test that decides whether the
    // non-breaking space is dropped from the table. Casting the raw argument to int instead would
    // let a truthy non-bool (e.g. 1) store the *full* table in the "keep NBSP" slot and thereby
    // break every later call that passes a real "true".
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = $keepNbsp ? 1 : 0;

    if (!isset($whitespaces[$cacheKey])) {

      $table = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3161
3162
  /**
   * Format a number with grouped thousands.
   *
   * Multi-byte separators (e.g. a UTF-8 non-breaking space) are supported on every PHP version:
   * PHP >= 5.4 handles them natively, older versions only honour the first byte of each
   * separator and therefore need the placeholder workaround below.
   *
   * @param float  $number
   * @param int    $decimals
   * @param string $dec_point
   * @param string $thousands_sep
   *
   * @return string
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    $hasMultiByteSeparator = isset($thousands_sep[1]) || isset($dec_point[1]);

    if (
        $hasMultiByteSeparator
        &&
        version_compare(PHP_VERSION, '5.4', '<')
    ) {
      // Format with single-byte placeholders and swap them afterwards. strtr() replaces both
      // placeholders in one pass, so a decimal point that itself contains "," is never touched
      // again by the thousands-separator replacement.
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3199
3200
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * @param string $chr <p>The character of which to calculate code point.</p>
   *
   * @return int <p>
   *             Unicode code point of the given character,<br />
   *             0 for empty input.
   *             </p>
   */
  public static function ord($chr)
  {
    // use static cache, if there is no support for "IntlChar"
    static $cache = array();

    if (!$chr && $chr !== '0') {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      $codePoint = \IntlChar::ord($chr);
      if ($codePoint) {
        return $codePoint;
      }
      // a falsy result falls through to the manual decoder below
    }

    if (isset($cache[$chr]) === true) {
      return $cache[$chr];
    }

    // only the first four bytes can belong to one UTF-8 character; unpack() yields a 1-based array
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if (0xF0 <= $lead && isset($bytes[4])) {
      // four-byte sequence
      $codePoint = (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    } elseif (0xE0 <= $lead && isset($bytes[3])) {
      // three-byte sequence
      $codePoint = (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    } elseif (0xC0 <= $lead && isset($bytes[2])) {
      // two-byte sequence
      $codePoint = (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    } else {
      // single byte (or a lead byte without enough followers): use the byte value itself
      $codePoint = $lead;
    }

    $cache[$chr] = $codePoint;

    return $codePoint;
  }
3253
3254 1
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike the native "parse_str()" this method never creates variables in the current scope;
   *          the parsed values are only delivered through the second (reference) parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str    <p>The input string, it is passed through "UTF8::clean()" before parsing.</p>
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
   *
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string or the $result is empty.</p>
   */
  public static function parse_str($str, &$result)
  {
    $parsed = \mb_parse_str(self::clean($str), $result);

    return $parsed !== false && !empty($result);
  }
3279 1
3280
  /**
   * Checks if the "/u" modifier is available that enables Unicode support in PCRE.
   *
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
   */
  public static function pcre_utf8_support()
  {
    // an empty pattern with the "u" modifier only matches when PCRE was built with UTF-8 support;
    // the warning of an unsupported modifier is silenced on purpose
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $matched = @preg_match('//u', '');

    return $matched ? true : false;
  }
3290
3291 10
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * Each boundary is resolved to a code point in this order: decimal digits are taken as the
   * number itself, hexadecimal digits go through "UTF8::hex_to_int()", anything else through
   * "UTF8::ord()". A boundary that is empty or resolves to 0 yields an empty array.
   *
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
   *
   * @return array
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    // [0] => start code point, [1] => end code point
    $codePoints = array();

    foreach (array($var1, $var2) as $boundary) {
      if (ctype_digit((string)$boundary)) {
        $codePoint = (int)$boundary;
      } elseif (ctype_xdigit($boundary)) {
        $codePoint = (int)self::hex_to_int($boundary);
      } else {
        $codePoint = self::ord($boundary);
      }

      if (!$codePoint) {
        return array();
      }

      $codePoints[] = $codePoint;
    }

    $toChar = array(
        '\\voku\\helper\\UTF8',
        'chr',
    );

    return array_map($toChar, range($codePoints[0], $codePoints[1]));
  }
3337
3338
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of self::$bom (BOM string => byte length) is tested in turn against the
   * beginning of the (possibly already shortened) string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>String without UTF-BOM</p>
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $marker => $markerByteLength) {
      if (strpos($str, $marker) !== 0) {
        continue;
      }

      $str = substr($str, $markerByteLength);
    }

    return $str;
  }
3355
3356
  /**
   * Alias of "UTF8::remove_bom()".
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever "UTF8::remove_bom()" returns for the given string.</p>
   */
  public static function removeBOM($str)
  {
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
3369
3370
  /**
   * Removes duplicate occurrences of a string in another string.
   *
   * Every run of directly repeated occurrences of a search string is collapsed into a single
   * occurrence, e.g. "a   b" with " " becomes "a b".
   *
   * @param string          $str  <p>The base string.</p>
   * @param string|string[] $what <p>String to search for in the base string.</p>
   *
   * @return string <p>The result string with removed duplicates.</p>
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    if (is_string($what)) {
      $what = array($what);
    }

    // anything that is neither a string nor an array is ignored, the input is returned as it is
    if (!is_array($what)) {
      return $str;
    }

    foreach ($what as $item) {
      $item = (string)$item;

      // an empty needle has no "duplicates"
      if ($item === '') {
        continue;
      }

      // The replacement is produced by a callback instead of being passed as a replacement
      // string, otherwise needles like "$0" or "\0" would be interpreted as back-references
      // and the run would not be collapsed.
      $str = preg_replace_callback(
          '/(?:' . preg_quote($item, '/') . ')+/',
          function () use ($item) {
            return $item;
          },
          $str
      );
    }

    return $str;
  }
3392
3393
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * @param string $str         <p>The input string.</p>
   * @param bool   $url_encoded [optional] <p>Also remove the url-encoded form (e.g. "%00") of the characters.</p>
   * @param string $replacement [optional] <p>The string that is inserted for each removed sequence.</p>
   *
   * @return string
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // every control character except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09)
    if ($url_encoded) {
      // percent-encoding is case-insensitive ("%0B" === "%0b"), so the patterns have to be as well
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // repeat until nothing is left to remove: a removal can join the surrounding
    // characters into a new removable sequence
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3426
3427
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>The string that is inserted instead of the diamond question mark.</p>
   *
   * @return string
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // escaped and literal spelling of the replacement character
    $needles = array(
        "\xEF\xBF\xBD",
        '�',
    );

    // one replacement per needle
    $replacements = array_fill(0, count($needles), $unknown);

    return str_replace($needles, $replacements, $str);
  }
3449
3450
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * Without $chars every trailing separator (\pZ) and control / other (\pC) character is removed.
   *
   * @param string $str   <p>The string to be trimmed.</p>
   * @param string $chars <p>Optional characters to be stripped.</p>
   *
   * @return string <p>The string with unwanted characters stripped from the right.</p>
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    //
    // "0" is a perfectly valid character to strip, so it must not be mistaken for "no chars given".
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    $charsPattern = self::rxClass($chars);

    return preg_replace("/{$charsPattern}+$/u", '', $str);
  }
3475
3476
  /**
   * Build a regex fragment that matches any single character of the given string.
   *
   * Characters of up to two bytes are quoted and collected into one bracket expression
   * (a "-" is moved to its front), longer single characters are added to it unquoted and
   * anything else "UTF8::str_split()" returns becomes its own alternative. The result is memoized.
   *
   * @param string $s     <p>The characters to match.</p>
   * @param string $class [optional] <p>Initial content of the bracket expression.</p>
   *
   * @return string <p>A bracket expression or a non-capturing group of alternatives.</p>
   */
  private static function rxClass($s, $class = '')
  {
    static $compiledClasses = array();

    $cacheKey = $s . $class;

    if (isset($compiledClasses[$cacheKey])) {
      return $compiledClasses[$cacheKey];
    }

    // content of the "[...]" expression
    $bracketContent = $class;
    // pieces that have to be matched as alternatives next to the bracket expression
    $alternatives = array();

    foreach (self::str_split($s) as $piece) {
      if ('-' === $piece) {
        // a leading "-" is always literal inside a bracket expression
        $bracketContent = '-' . $bracketContent;
      } elseif (!isset($piece[2])) {
        $bracketContent .= preg_quote($piece, '/');
      } elseif (1 === self::strlen($piece)) {
        $bracketContent .= $piece;
      } else {
        $alternatives[] = $piece;
      }
    }

    if ($bracketContent) {
      $bracketContent = '[' . $bracketContent . ']';
    }

    if (count($alternatives) === 0) {
      $pattern = $bracketContent;
    } else {
      $pattern = '(?:' . implode('|', array_merge(array($bracketContent), $alternatives)) . ')';
    }

    $compiledClasses[$cacheKey] = $pattern;

    return $pattern;
  }
3523
3524 35
  /**
   * WARNING: Echo native UTF8-Support libs, e.g. for debugging.
   *
   * Prints every value of self::$support, each followed by a newline and "<br>".
   *
   * @return void
   */
  public static function showSupport()
  {
    // same guard as in the other methods that read self::$support (e.g. "UTF8::ord()"):
    // without it the list may be printed before the support-check has run
    // NOTE(review): presumably checkForSupport() is what fills self::$support - confirm
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    foreach (self::$support as $utf8Support) {
      echo $utf8Support . "\n<br>";
    }
  }
3533
3534 35
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars [optional] <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, or an empty string for empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // "0" is falsy in PHP but still a real character that has to be encoded
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3558
3559
  /**
   * Convert a string to an array of Unicode characters.
   *
   * With PCRE-UTF-8 support the characters are collected via regex, otherwise the bytes are
   * walked manually; in that fallback bytes which do not form a complete sequence are dropped.
   *
   * @param string  $str       <p>The string to split into array.</p>
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string
   *                           (only applied on the PCRE code path).</p>
   *
   * @return string[] <p>An array containing chunks of the string.</p>
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    $chars = array();

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      preg_match_all('/./us', $str, $matches);
      if (isset($matches[0])) {
        $chars = $matches[0];
      }

    } else {

      // fallback: decode the sequences byte by byte

      $byteCount = strlen($str);

      /** @noinspection ForeachInvariantsInspection */
      for ($i = 0; $i < $byteCount; $i++) {
        $lead = $str[$i];

        // 0xxxxxxx: plain ASCII
        if (($lead & "\x80") === "\x00") {
          $chars[] = $lead;

          continue;
        }

        // number of continuation bytes announced by the lead byte
        if (($lead & "\xE0") === "\xC0") {
          $trailing = 1;
        } elseif (($lead & "\xF0") === "\xE0") {
          $trailing = 2;
        } elseif (($lead & "\xF8") === "\xF0") {
          $trailing = 3;
        } else {
          continue;
        }

        // truncated sequence at the end of the string
        if (!isset($str[$i + $trailing])) {
          continue;
        }

        // every continuation byte has to look like 10xxxxxx
        $isValid = true;
        for ($offset = 1; $offset <= $trailing; $offset++) {
          if (($str[$i + $offset] & "\xC0") !== "\x80") {
            $isValid = false;

            break;
          }
        }

        if ($isValid) {
          $chars[] = substr($str, $i, $trailing + 1);

          $i += $trailing;
        }
      }
    }

    if ($length > 1) {
      $chars = array_map('implode', array_chunk($chars, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($chars[0]) && $chars[0] === '') {
      return array();
    }

    return $chars;
  }
3641 6
3642 6
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * The checks run from cheap to expensive: UTF-16 / UTF-32 (binary strings only), ASCII, UTF-8,
   * "\mb_detect_encoding()" and finally an "iconv()" round-trip over self::$iconvEncoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return false|string <p>
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   *                      </p>
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      // run each detector only once and reuse its result for the LE and the BE test
      $utf16 = self::is_utf16($str);
      if ($utf16 === 1) {
        return 'UTF-16LE';
      }
      if ($utf16 === 2) {
        return 'UTF-16BE';
      }

      $utf32 = self::is_utf32($str);
      if ($utf32 === 1) {
        return 'UTF-32LE';
      }
      if ($utf32 === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $encoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($encoding) {
      return $encoding;
    }

    //
    // 5.) check via "iconv()"
    //
    // An encoding is accepted if converting the string from that encoding into the very same
    // encoding gives back the unchanged bytes; the strings are compared directly, there is no
    // need to hash them first.

    $strOrig = (string)$str;
    foreach (self::$iconvEncoding as $encodingTmp) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      if (@iconv($encodingTmp, $encodingTmp, $str) === $strOrig) {
        return $encodingTmp;
      }
    }

    return false;
  }
3734 1
3735
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       The value being searched for; an array may be used to designate multiple needles.
   *                       Every replacement with search array is
   *                       performed on the result of previous replacement.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement value(s); they are inserted literally, "$1" or "\1" have no
   *                       special meaning.
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed <p>A string or an array of replacements.</p>
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // translate every needle into a case-insensitive unicode pattern
    $patterns = array();
    foreach ((array)$search as $needle) {
      $needle .= '';

      if ('' === $needle) {
        // an empty needle must never match: this pattern cannot be satisfied
        $patterns[] = '/^(?<=.)$/';
      } else {
        $patterns[] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // "preg_replace()" interprets "\" and "$" in the replacement as back-references, but the
    // contract of "str_ireplace()" is a literal replacement -> escape both characters
    $escapeReplacement = function ($value) {
      return addcslashes((string)$value, '\\$');
    };

    if (is_array($replace)) {
      $replacement = array_map($escapeReplacement, $replace);
    } else {
      $replacement = $escapeReplacement($replace);
    }

    return preg_replace($patterns, $replacement, $subject, -1, $count);
  }
3778
3779 2
  /**
   * Limit the number of characters in a string, but cut at a word boundary.
   *
   * The string is shortened to at most $length characters, the last (possibly incomplete) word
   * is dropped and $strAddOn is appended. Strings that already fit are returned unchanged.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $length   [optional] <p>Max. character length (without the add-on).</p>
   * @param string $strAddOn [optional] <p>The string that is appended to a shortened result.</p>
   *
   * @return string
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit falls directly behind a word: cut in front of the space
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    $truncated = self::substr($str, 0, $length);

    // drop the last word, it may have been cut in the middle
    $words = explode(' ', $truncated);
    array_pop($words);
    $completeWords = implode(' ', $words);

    if ($completeWords === '') {
      // one single long word: fall back to a hard cut
      return self::substr($truncated, 0, $length - 1) . $strAddOn;
    }

    return $completeWords . $strAddOn;
  }
3819
3820
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * The input is returned unchanged if $pad_length is not a positive integer, if the string
   * is already longer than $pad_length, or if $pad_string is empty.
   *
   * @param string $str        <p>The input string.</p>
   * @param int    $pad_length <p>The length of return string.</p>
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
   * @param int    $pad_type   [optional] <p>
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
   *                           </p>
   *
   * @return string <strong>Returns the padded string</strong>
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (
        is_int($pad_length) !== true
        ||
        $pad_length <= 0
        ||
        $pad_length < $str_length
    ) {
      return $str;
    }

    $ps_length = self::strlen($pad_string);

    // nothing to pad with (this also protects the divisions below against a division by zero)
    if (!$ps_length) {
      return $str;
    }

    // number of characters that have to be added
    $diff = $pad_length - $str_length;

    switch ($pad_type) {
      case STR_PAD_LEFT:
        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $pre = self::substr($pre, 0, $diff);
        $post = '';
        break;

      case STR_PAD_BOTH:
        // like the native "str_pad()": the right side gets the extra character on an odd $diff
        $preLength = (int)floor($diff / 2);
        $postLength = $diff - $preLength;

        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
        $pre = self::substr($pre, 0, $preLength);
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
        $post = self::substr($post, 0, $postLength);
        break;

      case STR_PAD_RIGHT:
      default:
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $post = self::substr($post, 0, $diff);
        $pre = '';
    }

    return $pre . $str . $post;
  }
3874
3875
  /**
   * Repeat a string a given number of times.
   *
   * The input is passed through UTF8::filter() first, the result is then handed
   * to the native str_repeat().
   *
   * @param string $str        <p>The string to be repeated.</p>
   * @param int    $multiplier <p>
   *                           How often the string should be repeated; forwarded unchanged
   *                           to the native str_repeat().
   *                           </p>
   *
   * @return string <p>The repeated string.</p>
   */
  public static function str_repeat($str, $multiplier)
  {
    $filtered = self::filter($str);

    return str_repeat($filtered, $multiplier);
  }
3899 1
3900
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * INFO: This is only a wrapper for the native "str_replace()", every argument
   *       (including the by-reference counter) is forwarded as it is.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>The needle, or an array of needles.</p>
   * @param mixed $replace <p>The replacement, or an array of replacements.</p>
   * @param mixed $subject <p>
   *                       The haystack: a string, or an array of strings. The native function
   *                       returns an array if an array is given.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed <p>A string or an array with the replaced values.</p>
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    $result = str_replace($search, $replace, $subject, $count);

    return $result;
  }
3933
3934 20
  /**
   * Put the characters of a string into random order.
   *
   * The string is split via UTF8::split(), the resulting pieces are shuffled
   * with the native shuffle() and glued together again.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The shuffled string.</p>
   */
  public static function str_shuffle($str)
  {
    $chars = self::split($str);
    shuffle($chars);

    return implode('', $chars);
  }
3949
3950 1
  /**
   * Sort all characters of a string by their code points.
   *
   * @param string $str    <p>A UTF-8 string.</p>
   * @param bool   $unique <p>If <strong>true</strong>, every character is kept only once.</p>
   * @param bool   $desc   <p>If <strong>true</strong>, the characters are sorted in reverse code point order.</p>
   *
   * @return string <p>String of sorted characters.</p>
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    if ($unique) {
      // array keys are unique, so flipping twice drops repeated values
      $codePoints = array_flip(array_flip($codePoints));
    }

    // both sort functions keep the keys, UTF8::string() only relies on the value order
    $desc ? arsort($codePoints) : asort($codePoints);

    return self::string($codePoints);
  }
3975
3976
  /**
   * Split a string into an array of chunks.
   *
   * The string is matched against Grapheme::GRAPHEME_CLUSTER_RX and every chunk
   * is built from $len consecutive matches (the last chunk may be shorter).
   *
   * @param string $str <p>The input string.</p>
   * @param int    $len <p>Number of matches per chunk. Values below 1 are handed over to the native str_split().</p>
   *
   * @return array <p>The chunks, or an empty array for an empty input string.</p>
   */
  public static function str_split($str, $len = 1)
  {
    $str = (string)$str;
    $len = (int)$len;

    if ('' === $str) {
      return array();
    }

    if ($len < 1) {
      // NOTE(review): presumably delegated so that the native error handling for an invalid length applies
      return str_split($str, $len);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $graphemes = $matches[0];

    if (1 === $len) {
      return $graphemes;
    }

    $chunks = array();
    $chunkIndex = -1;

    foreach ($graphemes as $position => $grapheme) {
      if (0 === $position % $len) {
        // every $len-th match opens a new chunk
        $chunks[++$chunkIndex] = $grapheme;
      } else {
        $chunks[$chunkIndex] .= $grapheme;
      }
    }

    return $chunks;
  }
4019 1
4020
  /**
   * Get a binary representation of a specific string.
   *
   * The bytes of the string are written as binary digits, most significant bit
   * first. Leading zeros are removed, so e.g. "A" (0x41) results in "1000001".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>
   *                The binary digits, or "0" if the string is empty or consists of null-bytes only.
   *                </p>
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    // base_convert() calculates with floats and silently loses precision for
    // long inputs, therefore every byte is converted on its own
    $bits = '';
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; ++$i) {
      $bits .= sprintf('%08b', ord($str[$i]));
    }

    // keep the output format of the former base_convert() call: no leading zeros
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
4035
4036
  /**
   * alias for "UTF8::to_ascii()"
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown <p>Forwarded as second argument to UTF8::to_ascii().</p>
   * @param bool   $strict  <p>Forwarded as third argument to UTF8::to_ascii().</p>
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?', $strict = false)
  {
    $ascii = self::to_ascii($str, $unknown, $strict);

    return $ascii;
  }
4051
4052
  /**
   * Counts number of words in the UTF-8 string.
   *
   * The string is split by a regex: a word is a run of characters from the class
   * built by UTF8::rxClass(), optionally continued after a single dash-punctuation
   * character or apostrophe ("'", "’").
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $format   [optional] <p>
   *                         <strong>0</strong> => return a number of words (default)<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with word-offset as key
   *                         </p>
   *                         <p>
   *                         The value is compared strictly, everything except the integers 1 and 2
   *                         results in the number of words.
   *                         </p>
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word.</p>
   *
   * @return array|int <p>The number of words in the string, or the words themselves (see $format).</p>
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $charlist = self::rxClass($charlist, '\pL');

    // because of PREG_SPLIT_DELIM_CAPTURE the words are located at the odd indexes,
    // the text between the words at the even indexes
    $parts = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    $partCount = count($parts);

    if ($format === 1) {
      $words = array();
      for ($i = 1; $i < $partCount; $i += 2) {
        $words[] = $parts[$i];
      }

      return $words;
    }

    if ($format === 2) {
      $wordsByOffset = array();
      $offset = self::strlen($parts[0]);
      for ($i = 1; $i < $partCount; $i += 2) {
        $wordsByOffset[$offset] = $parts[$i];
        // skip the word itself and the separator that follows it
        $offset += self::strlen($parts[$i]) + self::strlen($parts[$i + 1]);
      }

      return $wordsByOffset;
    }

    return ($partCount - 1) / 2;
  }
4097
4098
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Both strings are passed through UTF8::strtocasefold() and then compared via UTF8::strcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <p>
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   *             </p>
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4116
4117 2
  /**
   * Case-sensitive string comparison.
   *
   * Strings with an identical string representation are reported as equal right away,
   * otherwise both are normalized to NFD and compared with the native strcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int  <p>
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   *              </p>
   */
  public static function strcmp($str1, $str2)
  {
    // shortcut: no normalization is needed for identical strings
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    $normalized1 = \Normalizer::normalize($str1, \Normalizer::NFD);
    $normalized2 = \Normalizer::normalize($str2, \Normalizer::NFD);

    return strcmp($normalized1, $normalized2);
  }
4136
4137
  /**
   * Find length of initial segment not matching mask.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $charList <p>The characters that end the initial segment.</p>
   * @param int    $offset   [optional] <p>Start of the examined part, forwarded to UTF8::substr().</p>
   * @param int    $length   [optional] <p>Length of the examined part, forwarded to UTF8::substr().</p>
   *
   * @return int|null <p>
   *                  The length (in characters) of the segment before the first character from $charList,
   *                  or the length of the whole examined string if there is none.<br />
   *                  <strong>null</strong> if $charList or the examined string is empty.
   *                  </p>
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList .= '';
    if ('' === $charList) {
      return null;
    }

    // only cut the string if the caller asked for a sub-range
    if ($offset || 2147483647 !== $length) {
      $str = (string)self::substr($str, $offset, $length);
    }

    $str = (string)$str;
    if ('' === $str) {
      return null;
    }

    // lazily capture everything in front of the first character from the list
    if (preg_match('/^(.*?)' . self::rxClass($charList) . '/us', $str, $match)) {
      return self::strlen($match[1]);
    }

    return self::strlen($str);
  }
4169
4170
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
   *
   * @return string <p>The results of UTF8::chr() for every element, concatenated in array order.</p>
   */
  public static function string(array $array)
  {
    $result = '';

    foreach ($array as $codePoint) {
      $result .= self::chr($codePoint);
    }

    return $result;
  }
4191
4192
  /**
   * alias for "UTF8::string_has_bom()"
   *
   * @see UTF8::string_has_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function hasBom($str)
  {
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
4205
4206 8
  /**
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
   *
   * The keys of self::$bom are used as the list of known byte order marks.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function string_has_bom($str)
  {
    foreach (array_keys(self::$bom) as $bomString) {
      // position 0 means: the string starts with this BOM
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4223 7
4224 7
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * The string is passed through UTF8::clean() before the native strip_tags() is applied.
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>The input string.</p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped; forwarded unchanged to the
   *                               native strip_tags().
   *                               </p>
   *
   * @return string <p>The stripped string.</p>
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    $cleaned = self::clean($str);

    return strip_tags($cleaned, $allowable_tags);
  }
4250 6
4251
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>
   *                           The string from which to get the position of the first occurrence
   *                           of needle
   *                           </p>
   * @param string  $needle    <p>
   *                           The string to find in haystack
   *                           </p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack to start searching;
   *                           <strong>null</strong> (the default) is treated as 0.
   *                           </p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found or if haystack / needle is empty.
   *                   </p>
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // "\mb_stripos" expects an integer offset, passing the default "null"
    // through is deprecated in newer PHP versions
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_stripos($haystack, $needle, $offset, $encoding);
  }
4301
4302
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (case insensitive, via "\mb_stristr()").
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, the part of the haystack before the first
   *                              occurrence of the needle (excluding the needle) is returned.
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is empty or not found.
   */
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    $needle .= '';
    if ($needle === '') {
      return false;
    }

    // the default encoding needs no normalization
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_stristr($haystack, $needle, $before_needle, $charset);
  }
4327
4328
  /**
   * Get the string length, not the byte-length!
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       <p>The string being checked for length.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (only applied for UTF-8).</p>
   *
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)</p>
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $isDefaultEncoding = $encoding === 'UTF-8' || $encoding === true || $encoding === false;
    $encoding = $isDefaultEncoding ? 'UTF-8' : self::normalize_encoding($encoding);

    // for these encodings the byte length is returned directly
    switch ($encoding) {
      case 'ASCII':
      case 'CP850':
        return strlen($str);
    }

    if ($cleanUtf8 === true && $encoding === 'UTF-8') {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4370 2
4371
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: Both strings are passed through UTF8::strtocasefold() and then compared via UTF8::strnatcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4387 1
4388 1
  /**
   * String comparisons using a "natural order" algorithm
   *
   * INFO: natural order version of UTF8::strcmp()
   *
   * Strings with an identical string representation are reported as equal right away, otherwise
   * both are passed through UTF8::strtonatfold() and compared with the native strnatcmp().
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    $folded1 = self::strtonatfold($str1);
    $folded2 = self::strtonatfold($str2);

    return strnatcmp($folded1, $folded2);
  }
4406
4407
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * INFO: Both strings are passed through UTF8::strtocasefold() and then compared via UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4424
4425
  /**
   * String comparison of the first n characters.
   *
   * Both strings are shortened to $len characters via UTF8::substr() and then
   * compared via UTF8::strcmp().
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // UTF8::substr() may return false, UTF8::strcmp() should only receive strings
    $head1 = (string)self::substr($str1, 0, $len);
    $head2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($head1, $head2);
  }
4445
4446
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>The string where char_list is looked for.</p>
   * @param string $char_list <p>This parameter is case sensitive.</p>
   *
   * @return string|false <p>
   *                      The rest of the haystack, starting from the first character found;<br />
   *                      <strong>false</strong> if no character is found or if one of the arguments is empty.
   *                      </p>
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    if ($haystack === '' || $char_list === '') {
      return false;
    }

    if (!preg_match('/' . self::rxClass($char_list) . '/us', $haystack, $match)) {
      return false;
    }

    // everything from the first occurrence of the matched character up to the end
    return strstr($haystack, $match[0]);
  }
4471
4472
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string     $haystack  <p>The string being checked.</p>
   * @param string|int $needle    <p>
   *                              The string to search for. A non-negative integer is converted
   *                              into a character via UTF8::chr() first.
   *                              </p>
   * @param int        $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found (or haystack / needle is empty) it returns false.
   *                   </p>
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // iconv and mbstring do not support integer $needle
    //
    // INFO: this check must run before the string-cast below, otherwise the
    //       type information is lost and the condition can never be true
    if (is_int($needle) && $needle >= 0) {
      $needle = (string)self::chr($needle);
    }

    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    if (self::$support['iconv'] === true) {
      // a negative offset is replaced by 0 to keep the behaviour
      // of older PHP versions (before 5.5.35 / 5.6.21 / 7.0.6)
      return \iconv_strpos($haystack, $needle, $offset > 0 ? $offset : 0, $encoding);
    }

    // fallback without extension support: search byte-wise in the rest of the
    // haystack and convert the byte position into a character position
    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    }

    if (($pos = strpos($haystack, $needle)) !== false) {
      $left = substr($haystack, 0, $pos);

      // negative offset not supported in PHP strpos(), ignoring
      return ($offset > 0 ? $offset : 0) + self::strlen($left);
    }

    return false;
  }
4550
4551
  /**
   * Finds the last occurrence of a character in a string within another.
   *
   * INFO: This is a wrapper for "\mb_strrchr()", a non default encoding is
   *       passed through UTF8::normalize_encoding() first.
   *
   * @link http://php.net/manual/en/function.mb-strrchr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end,
   *                         </p>
   * @param string $encoding [optional] <p>Character encoding name to use, "UTF-8" by default.</p>
   *
   * @return string|false The portion of haystack or false if needle is not found.
   */
  public static function strrchr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrchr($haystack, $needle, $part, $charset);
  }
4581
4582
  /**
   * alias for "UTF8::strstr()"
   *
   * @see UTF8::strstr()
   *
   * @param string $haystack      <p>The input string.</p>
   * @param string $needle        <p>The string to look for.</p>
   * @param bool   $before_needle <p>Forwarded as third argument to UTF8::strstr().</p>
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false)
  {
    $result = self::strstr($haystack, $needle, $before_needle);

    return $result;
  }
4597
4598
  /**
   * alias for "UTF8::stristr()"
   *
   * @see UTF8::stristr()
   *
   * @param string $haystack      <p>The input string.</p>
   * @param string $needle        <p>The string to look for.</p>
   * @param bool   $before_needle <p>Forwarded as third argument to UTF8::stristr().</p>
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false)
  {
    $result = self::stristr($haystack, $needle, $before_needle);

    return $result;
  }
4613
4614
  /**
   * Reverses characters order in the string.
   *
   * The string is split via UTF8::split(), so the pieces returned by that
   * method (and not the single bytes) are put into reverse order.
   *
   * @param string $str The input string
   *
   * @return string The string with characters in the reverse sequence
   */
  public static function strrev($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $chars = self::split($str);
    $reversedChars = array_reverse($chars);

    return implode('', $reversedChars);
  }
4631 11
4632
  /**
   * Finds the last occurrence of a character in a string within another, case insensitive.
   *
   * INFO: This is a wrapper for "\mb_strrichr()", a non default encoding is
   *       passed through UTF8::normalize_encoding() first.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack.</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end,
   *                         </p>
   * @param string $encoding [optional] <p>Character encoding name to use, "UTF-8" by default.</p>
   *
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrichr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrichr($haystack, $needle, $part, $charset);
  }
4662 10
4663
  /**
   * Find the position of the last occurrence of a string, ignoring case.
   *
   * Both arguments are lowercased with self::strtolower() and the search itself is
   * delegated to self::strrpos().
   *
   * @param string  $haystack  <p>The string to look in.</p>
   * @param string  $needle    <p>The string to look for.</p>
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   If needle is not found, it returns false.
   *                   </p>
   */
  public static function strripos($haystack, $needle, $offset = 0, $cleanUtf8 = false)
  {
    $lowerHaystack = self::strtolower($haystack);
    $lowerNeedle = self::strtolower($needle);

    return self::strrpos($lowerHaystack, $lowerNeedle, $offset, $cleanUtf8);
  }
4680
4681
  /**
   * Find position of last occurrence of a string in a string.
   *
   * Back-ends are tried in this order: mbstring, iconv, then a byte-wise fallback that
   * converts the byte position into a character position with self::strlen().
   *
   * @link http://php.net/manual/en/function.mb-strrpos.php
   *
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>May be specified to begin searching an arbitrary number of characters
   *                              into the string. Negative values will stop searching at an arbitrary point prior to
   *                              the end of the string.
   *                              </p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>The numeric position of the last occurrence of needle in the haystack string.<br />If needle
   *                   is not found (or haystack / needle is empty), it returns false.</p>
   */
  public static function strrpos($haystack, $needle, $offset = null, $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // a non-negative integer needle is treated as a code point
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters

      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    // Neither iconv_strrpos() nor the byte-wise fallback takes an offset,
    // so narrow the search window first and shift the result afterwards.
    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    // self::substr() returns false when the offset lies beyond the end of the string
    $haystack = (string)$haystack;
    if (!isset($haystack[0])) {
      return false;
    }

    $shift = $offset > 0 ? $offset : 0;

    if (self::$support['iconv'] === true) {
      // use the function the "iconv" flag actually stands for; it counts characters like \mb_strrpos()
      $pos = \iconv_strrpos($haystack, $needle, 'UTF-8');

      return $pos === false ? false : $shift + $pos;
    }

    // fallback

    if (($pos = strrpos($haystack, $needle)) !== false) {
      // convert the byte position into a character position
      $left = substr($haystack, 0, $pos);

      return $shift + self::strlen($left);
    }

    return false;
  }
4752 11
4753 11
  /**
   * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
   * mask.
   *
   * @param string $str    <p>The input string.</p>
   * @param string $mask   <p>The mask of chars</p>
   * @param int    $offset [optional] <p>Start position; applied via self::substr() before matching.</p>
   * @param int    $length [optional] <p>Maximum length to inspect; applied via self::substr() before matching.</p>
   *
   * @return int <p>Length of the initial segment, 0 if the (sliced) string or the mask is empty.</p>
   */
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // init
    $length = (int)$length;
    $offset = (int)$offset;
    $mask = (string)$mask;

    // only slice when the caller asked for a window (2147483647 is the "no limit" default)
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    // self::substr() may return false, the cast turns that into an empty string
    $str = (string)$str;

    // nothing to match, or nothing to match against: the segment length is 0 (an int, not '')
    if (!isset($str[0], $mask[0])) {
      return 0;
    }

    return preg_match('/^' . self::rxClass($mask) . '+/u', $str, $matches) ? self::strlen($matches[0]) : 0;
  }
4781
4782
  /**
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
   *
   * \mb_strstr() is used when mbstring is available or when an encoding other than UTF-8 is
   * requested; otherwise \grapheme_strstr() is used.
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, strstr() returns the part of the
   *                              haystack before the first occurrence of the needle (excluding the needle).
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
   */
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // make sure the support flags are populated before they are read (same lazy check as in strrpos() / substr())
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4811 3
4812
  /**
   * Unicode transformation for case-less matching.
   *
   * Steps: replace the entries of self::$commonCaseFold, optionally replace the
   * "caseFolding_full" data set, clean the string, then lowercase it.
   *
   * @link http://unicode.org/reports/tr21/tr21-5.html
   *
   * @param string $str  <p>The input string.</p>
   * @param bool   $full <p>
   *                     <b>true</b> === replace full case folding chars + strtolower (default)<br />
   *                     <b>false</b> use only $commonCaseFold +  strtolower
   *                     </p>
   *
   * @return string
   */
  public static function strtocasefold($str, $full = true)
  {
    // lookup tables are built once and kept for later calls
    static $fullFoldTable = null;
    static $commonSearch = null;
    static $commonReplace = null;

    if ($commonSearch === null) {
      $commonSearch = array_keys(self::$commonCaseFold);
      $commonReplace = array_values(self::$commonCaseFold);
    }

    $folded = str_replace($commonSearch, $commonReplace, $str);

    if ($full) {
      if ($fullFoldTable === null) {
        $fullFoldTable = self::getData('caseFolding_full');
      }

      // index 0 holds the search strings, index 1 the replacements
      /** @noinspection OffsetOperationsInspection */
      $folded = str_replace($fullFoldTable[0], $fullFoldTable[1], $folded);
    }

    return self::strtolower(self::clean($folded));
  }
4852
4853
  /**
   * Make a string lowercase.
   *
   * @link http://php.net/manual/en/function.mb-strtolower.php
   *
   * @param string $str      <p>The string being lowercased.</p>
   * @param string $encoding [optional] <p>
   *                         Charset handed to \mb_strtolower(); anything other than "UTF-8" is passed
   *                         through self::normalize_encoding() first.
   *                         </p>
   *
   * @return string <p>The lowercased string, or an empty string for empty input.</p>
   */
  public static function strtolower($str, $encoding = 'UTF-8')
  {
    $input = (string)$str;

    if ($input === '') {
      return '';
    }

    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtolower($input, $charset);
  }
4878
4879
  /**
   * Generic case sensitive transformation for collation matching.
   *
   * The string is normalized to NFD and every run of characters matching \p{Mn}
   * (non-spacing marks) is removed afterwards.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string
   */
  private static function strtonatfold($str)
  {
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4890 1
4891
  /**
   * Make a string uppercase.
   *
   * @link http://php.net/manual/en/function.mb-strtoupper.php
   *
   * @param string $str      <p>The string being uppercased.</p>
   * @param string $encoding [optional] <p>
   *                         Charset handed to \mb_strtoupper(); anything other than "UTF-8" is passed
   *                         through self::normalize_encoding() first.
   *                         </p>
   *
   * @return string <p>The uppercased string, or an empty string for empty input.</p>
   */
  public static function strtoupper($str, $encoding = 'UTF-8')
  {
    $input = (string)$str;

    if ($input === '') {
      return '';
    }

    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtoupper($input, $charset);
  }
4915
4916 47
  /**
   * Translate characters or replace sub-strings.
   *
   * Two call forms are supported:
   * - strtr($str, $pairs): $from is an array of "search => replace" pairs, handed to PHP's strtr() as is.
   * - strtr($str, $from, $to): $from and $to are paired up position by position. Strings are split into
   *   characters with self::str_split(), arrays are used as lists of search / replacement strings.
   *   The longer of the two lists is truncated to the length of the shorter one.
   *
   * @link  http://php.net/manual/en/function.strtr.php
   *
   * @param string          $str  <p>The string being translated.</p>
   * @param string|string[] $from <p>The string replacing from.</p>
   * @param string|string[] $to   <p>The string being translated to to.</p>
   *
   * @return string <p>
   *                This function returns a copy of str, translating all occurrences of each character in from to the
   *                corresponding character in to.
   *                </p>
   */
  public static function strtr($str, $from, $to = INF)
  {
    // INF is the "argument not passed" marker, i.e. the two-argument (pairs) form was used
    if (INF !== $to) {
      // arrays are already lists of units; only strings have to be split into characters
      $from = is_array($from) ? array_values($from) : self::str_split($from);
      $to = is_array($to) ? array_values($to) : self::str_split($to);
      $countFrom = count($from);
      $countTo = count($to);

      // array_combine() needs both lists to have the same size
      if ($countFrom > $countTo) {
        $from = array_slice($from, 0, $countTo);
      } elseif ($countFrom < $countTo) {
        $to = array_slice($to, 0, $countFrom);
      }

      $from = array_combine($from, $to);
    }

    return strtr($str, $from);
  }
4949
4950 43
  /**
   * Return the width of a string.
   *
   * Thin wrapper around \mb_strwidth().
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int
   */
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    // iconv and mbstring are not tolerant to invalid encoding, so the caller may request a cleanup first
    $text = ($cleanUtf8 === true) ? self::clean($str) : $str;

    return \mb_strwidth($text, $charset);
  }
4974
4975
  /**
   * Get part of a string.
   *
   * Back-ends are tried in this order: \mb_substr(), \iconv_substr(), then a fallback that
   * splits the string with self::split() and slices the resulting array.
   *
   * @link http://php.net/manual/en/function.mb-substr.php
   *
   * @param string  $str       <p>The string being checked.</p>
   * @param int     $start     <p>The first position used in str.</p>
   * @param int     $length    [optional] <p>The maximum length of the returned string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false <p>
   *                      The sub-string specified by the start and length parameters.<br />
   *                      An empty string for empty input, <b>false</b> if start is greater than the
   *                      length of the string.
   *                      </p>
   */
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // init
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr

      $str = self::clean($str);
    }

    // the character count is only needed for the range check and for the "no length given" default
    $charCount = 0;
    if ($start || $length === null) {
      $charCount = (int)self::strlen($str);
    }

    if ($start && $start > $charCount) {
      return false;
    }

    $length = ($length === null) ? $charCount : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding === 'UTF-8' || is_bool($encoding)) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return \iconv_substr($str, $start, $length, $encoding);
    }

    // fallback: split into characters, take the requested slice and join it again
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
5049
5050
  /**
   * Binary safe comparison of two strings from an offset, up to length characters.
   *
   * $main_str is cut with self::substr($main_str, $offset, $length); $str is then cut to the
   * same number of characters and both parts are compared.
   *
   * @param string  $main_str           <p>The main string being compared.</p>
   * @param string  $str                <p>The secondary string being compared.</p>
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts counting from
   *                                    the end of the string.</p>
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the largest of
   *                                    the length of the str compared to the length of main_str less the offset.</p>
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
   *                                    insensitive.</p>
   *
   * @return int
   */
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
  {
    // self::substr() returns false when the offset lies beyond the end of the string;
    // normalize that to an empty string so only strings reach strlen() / strcmp() / strcasecmp()
    $main_str = (string)self::substr($main_str, $offset, $length);
    $str = (string)self::substr($str, 0, (int)self::strlen($main_str));

    return $case_insensitivity === true ? self::strcasecmp($main_str, $str) : self::strcmp($main_str, $str);
  }
5071 1
5072 1
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string $haystack  <p>The string to search in.</p>
   * @param string $needle    <p>The substring to search for.</p>
   * @param int    $offset    [optional] <p>The offset where to start counting.</p>
   * @param int    $length    [optional] <p>
   *                          The maximum length after the specified offset to search for the
   *                          substring. If omitted, the search runs to the end of the haystack.
   *                          </p>
   * @param string $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return int|false <p>
   *                   The number of occurrences, or false if haystack or needle is empty or if
   *                   offset + length is not greater than zero.
   *                   </p>
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8')
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      // keep "no length given" as null: casting it to 0 would cut the haystack down to an empty string
      if ($length !== null) {
        $length = (int)$length;
      }

      if ((int)$length + $offset <= 0) {
        return false;
      }

      // self::substr() returns false when the offset is beyond the end; count in an empty string then
      $haystack = (string)self::substr($haystack, $offset, $length, $encoding);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5115
5116 6
  /**
   * Replace text within a portion of a string (multi-byte aware, works on UTF-8 code points).
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * If $str is an array, every element is processed on its own; scalar values for
   * $replacement / $start / $length are applied to each element, array values are matched by position.
   *
   * @param string|string[] $str         <p>The input string or an array of strings.</p>
   * @param string|string[] $replacement <p>The replacement string or an array of strings.</p>
   * @param int|int[]       $start       <p>Position (in characters) where the replacement begins.</p>
   * @param int|int[]|null  $length      [optional] <p>
   *                                     Number of characters to replace. For a single string, null
   *                                     means "up to the end of the string".
   *                                     </p>
   *
   * @return string|string[] <p>The resulting string, or an array of strings if $str was an array.</p>
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // $replacement
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          // non-integer start values fall back to 0
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            // non-integer length values fall back to the number of input strings
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // a single input string can only take a single replacement: use the first one
    if (is_array($replacement)) {
      $replacement = isset($replacement[0]) ? $replacement[0] : '';
    }

    $str = (string)$str;

    // split both strings into arrays of UTF-8 characters
    preg_match_all('/./us', $str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    if ($length === null) {
      // the pattern above uses the "u" modifier, so count the characters as UTF-8 as well
      $length = \mb_strlen($str, 'UTF-8');
    }

    array_splice($smatches[0], (int)$start, $length, $rmatches[0]);

    return implode('', $smatches[0]);
  }
5191
5192
  /**
   * Invert the case of every character: upper-case becomes lower-case and vice versa.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $encoding [optional] <p>Charset handed to the case-conversion helpers, default is UTF-8.</p>
   *
   * @return string <p>The string with each character's case swapped, or an empty string for empty input.</p>
   */
  public static function swapCase($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // A character that already equals its upper-case form gets lower-cased,
    // every other character is replaced by its upper-case form.
    $invertCase = function ($hit) use ($encoding) {
      $char = $hit[0];
      $upper = UTF8::strtoupper($char, $encoding);

      return $upper === $char ? UTF8::strtolower($char, $encoding) : $upper;
    };

    // only non-whitespace characters are passed to the callback
    return preg_replace_callback('/[\S]/u', $invertCase, self::clean($str));
  }
5230
5231
  /**
   * Camel-case alias for "UTF8::to_ascii()"; all arguments are forwarded unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         <p>The input string.</p>
   * @param string $subst_chr <p>Forwarded as the "unknown character" placeholder.</p>
   * @param bool   $strict    <p>Forwarded as the "strict" flag.</p>
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?', $strict = false)
  {
    $ascii = self::to_ascii($s, $subst_chr, $strict);

    return $ascii;
  }
5246
5247 1
  /**
   * Camel-case alias for "UTF8::to_latin1()".
   *
   * @see UTF8::to_latin1()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged.</p>
   *
   * @return string|string[]
   */
  public static function toLatin1($str)
  {
    $latin1 = self::to_latin1($str);

    return $latin1;
  }
5260
5261 1
  /**
   * Camel-case alias for "UTF8::to_utf8()".
   *
   * @see UTF8::to_utf8()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged.</p>
   *
   * @return string|string[]
   */
  public static function toUTF8($str)
  {
    $utf8 = self::to_utf8($str);

    return $utf8;
  }
5274
5275
  /**
   * Convert a string into ASCII.
   *
   * Every non-ASCII character is decoded to its code point and looked up in a per-"bank"
   * table (code point >> 8) that is loaded on demand from "data/xNN.php" and cached in a
   * static variable. Characters without a table entry are replaced by $unknown.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
   * @param bool   $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad
   *                        performance</p>
   *
   * @return string
   *
   * @throws \Exception <p>If $strict is true but Intl is not available or PHP is older than 5.4.</p>
   */
  public static function to_ascii($str, $unknown = '?', $strict = false)
  {
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str, false, true, true);

    // check if we only have ASCII
    if (self::is_ascii($str) === true) {
      return $str;
    }

    if ($strict === true) {
      if (!isset(self::$support['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
      }

      if (self::$support['intl'] == true && Bootup::is_php('5.4')) {
        $str = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

        // check again, if we only have ASCII, now ...
        if (self::is_ascii($str) === true) {
          return $str;
        }

      } else {
        throw new \Exception('Intl is not supported or you use PHP < 5.4!');
      }
    }

    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = isset($ar[0]) ? $ar[0] : array();

    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // Decode the code point of this character. It is reset for every character,
      // so a value from a previous iteration can never be reused by accident.
      $ord = null;
      $byteCount = strlen($c);

      if ($ordC0 >= 192 && $ordC0 <= 223 && $byteCount >= 2) {
        // 2 bytes
        $ord = ($ordC0 - 192) * 64 + (ord($c[1]) - 128);
      } elseif ($ordC0 >= 224 && $ordC0 <= 239 && $byteCount >= 3) {
        // 3 bytes
        $ord = ($ordC0 - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
      } elseif ($ordC0 >= 240 && $ordC0 <= 247 && $byteCount >= 4) {
        // 4 bytes
        $ord = ($ordC0 - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
      }

      // Anything else (stray continuation byte, 5/6-byte lead byte, 0xFE / 0xFF, truncated
      // sequence) is not a decodable character.
      if ($ord === null) {
        $c = $unknown;
        continue;
      }

      $bank = $ord >> 8;
      if (!isset($UTF8_TO_ASCII[$bank])) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          // NOTE(review): the bank file is expected to fill $UTF8_TO_ASCII[$bank] in this scope - confirm against the data files
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        }

        // no bank file, or the file did not provide a table: remember an empty bank
        if (!isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank])) {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    unset($c);

    return implode('', $chars);
  }
5403 16
5404 16
  /**
   * Camel-case alias for "UTF8::to_iso8859()".
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged.</p>
   *
   * @return string|string[]
   */
  public static function toIso8859($str)
  {
    $iso8859 = self::to_iso8859($str);

    return $iso8859;
  }
5417 8
5418 8
  /**
   * Alias for "UTF8::to_iso8859()" under the name "Latin-1".
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str <p>The input, forwarded unchanged.</p>
   *
   * @return string|string[]
   */
  public static function to_latin1($str)
  {
    $latin1 = self::to_iso8859($str);

    return $latin1;
  }
5431
5432 9
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It decodes UTF-8 codepoints (numeric HTML entities) and unicode escape sequences ("\uXXXX").
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   * is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * @param string|string[] $str <p>Any string or array.</p>
   *
   * @return string|string[] <p>The UTF-8 encoded string (arrays are converted element by element).</p>
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];

      if ($c1 >= "\xc0") { // should be converted to UTF8, if it's not UTF8 already

        // length of the UTF-8 sequence this lead byte announces (0 = doesn't look like UTF8 at all)
        if ($c1 <= "\xdf") {
          $seqLength = 2;
        } elseif ($c1 <= "\xef") {
          $seqLength = 3;
        } elseif ($c1 <= "\xf7") {
          $seqLength = 4;
        } else {
          $seqLength = 0;
        }

        // all following bytes of the sequence must exist and must be continuation bytes (0x80 - 0xBF)
        $isUtf8 = ($seqLength > 0);
        for ($j = 1; $isUtf8 && $j < $seqLength; $j++) {
          $isUtf8 = ($i + $j < $max) && $str[$i + $j] >= "\x80" && $str[$i + $j] <= "\xbf";
        }

        if ($isUtf8) { // yeah, almost sure it's UTF8 already
          $buf .= substr($str, $i, $seqLength);
          $i += $seqLength - 1;
        } else { // not valid UTF8 - treat the byte as ISO-8859-1 and encode it as 2 bytes UTF8
          $ordC1 = ord($c1);
          $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
        }

      } elseif (($c1 & "\xc0") === "\x80") { // needs conversion

        $ordC1 = ord($c1);
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
        } else {
          $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
        }

      } else { // it doesn't need conversion
        $buf .= $c1;
      }
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode UTF-8 codepoints
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5560
5561
  /**
   * Convert a string (or every element of an array) into "ISO-8859"-encoding (Latin-1).
   *
   * @param string|string[] $str <p>The input string or an array of strings.</p>
   *
   * @return string|string[] <p>The converted value; an empty string for empty input.</p>
   */
  public static function to_iso8859($str)
  {
    if (is_array($str)) {
      // convert element by element, the keys of the input array are kept
      return array_map(array(__CLASS__, 'to_iso8859'), $str);
    }

    $str = (string)$str;

    return $str === '' ? '' : self::utf8_decode($str);
  }
5589 1
5590 1
  /**
   * Strip whitespace or other characters from the beginning and the end of a UTF-8 string.
   *
   * INFO: This is slower than "trim()"
   *
   * The native function could only be used for pure 7-bit input, but checking for
   * ASCII first would cost more time than it saves.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars [optional] <p>Characters to be stripped; if omitted or empty, unicode
   *                      separator (\pZ) and control (\pC) characters are stripped.</p>
   *
   * @return string <p>The trimmed string.</p>
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $hasCustomChars = ($chars !== INF && $chars);

    if ($hasCustomChars) {
      return self::rtrim(self::ltrim($str, $chars), $chars);
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
  }
5618 1
5619
  /**
   * Makes string's first char uppercase.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The resulting string, or an empty string for empty input.</p>
   */
  public static function ucfirst($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // "self::substr()" may return false (e.g. for a one-character string there is no rest),
    // so both parts are cast to string before they are used.
    $firstChar = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtoupper($firstChar) . $rest;
  }
5630 1
5631 1
  /**
   * Alias for "UTF8::ucfirst()": upper-cases the first character of a single word.
   *
   * @see UTF8::ucfirst()
   *
   * @param string $word <p>The input, forwarded unchanged.</p>
   *
   * @return string
   */
  public static function ucword($word)
  {
    $upperCased = self::ucfirst($word);

    return $upperCased;
  }
5644
5645
  /**
   * Uppercase the first character of all words in the string.
   *
   * Words are separated by a single space (" "); words listed in $exceptions are left untouched.
   *
   * @param string   $str        <p>The input string.</p>
   * @param string[] $exceptions [optional] <p>Exclusion for some words (compared strictly).</p>
   *
   * @return string
   */
  public static function ucwords($str, $exceptions = array())
  {
    $str = (string)$str;

    // compare against the empty string: a loose check would also drop the valid input "0"
    if ($str === '') {
      return '';
    }

    $useExceptions = !empty($exceptions);

    $newwords = array();
    foreach (explode(' ', $str) as $word) {
      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $word = self::ucfirst($word);
      }

      $newwords[] = $word;
    }

    return implode(' ', $newwords);
  }
5686
5687
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'DÃ¼sseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The decoded string, or an empty string for empty input.</p>
   */
  public static function urldecode($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // 1. plain url-decoding, then turn "%uXXXX" escapes into hexadecimal html entities
    $decoded = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', urldecode($str));

    // ENT_HTML5 is only referenced when PHP >= 5.4 is reported
    $flags = ENT_QUOTES;
    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    }

    // 2. to UTF-8 -> decode html entities -> raw-url-decode again -> repair broken UTF-8
    $utf8 = self::to_utf8($decoded);
    $withoutEntities = self::html_entity_decode($utf8, $flags);
    $fixed = self::fix_simple_utf8(rawurldecode($withoutEntities));

    return (string)$fixed;
  }
5727
5728
  /**
   * Returns an array that maps url-encoded Windows-1252 bytes ("%20" ... "%FF") to their UTF-8 characters.
   *
   * @return array
   */
5733
  public static function urldecode_fix_win1252_chars()
5734
  {
5735
    static $array = array(
5736
        '%20' => ' ',
5737
        '%21' => '!',
5738
        '%22' => '"',
5739
        '%23' => '#',
5740
        '%24' => '$',
5741
        '%25' => '%',
5742
        '%26' => '&',
5743
        '%27' => "'",
5744
        '%28' => '(',
5745
        '%29' => ')',
5746
        '%2A' => '*',
5747
        '%2B' => '+',
5748
        '%2C' => ',',
5749
        '%2D' => '-',
5750
        '%2E' => '.',
5751
        '%2F' => '/',
5752
        '%30' => '0',
5753
        '%31' => '1',
5754
        '%32' => '2',
5755
        '%33' => '3',
5756
        '%34' => '4',
5757
        '%35' => '5',
5758
        '%36' => '6',
5759
        '%37' => '7',
5760
        '%38' => '8',
5761
        '%39' => '9',
5762
        '%3A' => ':',
5763
        '%3B' => ';',
5764
        '%3C' => '<',
5765
        '%3D' => '=',
5766
        '%3E' => '>',
5767
        '%3F' => '?',
5768
        '%40' => '@',
5769
        '%41' => 'A',
5770
        '%42' => 'B',
5771
        '%43' => 'C',
5772
        '%44' => 'D',
5773
        '%45' => 'E',
5774
        '%46' => 'F',
5775
        '%47' => 'G',
5776
        '%48' => 'H',
5777
        '%49' => 'I',
5778
        '%4A' => 'J',
5779
        '%4B' => 'K',
5780
        '%4C' => 'L',
5781
        '%4D' => 'M',
5782
        '%4E' => 'N',
5783
        '%4F' => 'O',
5784
        '%50' => 'P',
5785
        '%51' => 'Q',
5786
        '%52' => 'R',
5787
        '%53' => 'S',
5788
        '%54' => 'T',
5789
        '%55' => 'U',
5790
        '%56' => 'V',
5791
        '%57' => 'W',
5792
        '%58' => 'X',
5793
        '%59' => 'Y',
5794
        '%5A' => 'Z',
5795
        '%5B' => '[',
5796
        '%5C' => '\\',
5797
        '%5D' => ']',
5798
        '%5E' => '^',
5799
        '%5F' => '_',
5800
        '%60' => '`',
5801
        '%61' => 'a',
5802
        '%62' => 'b',
5803
        '%63' => 'c',
5804
        '%64' => 'd',
5805
        '%65' => 'e',
5806
        '%66' => 'f',
5807
        '%67' => 'g',
5808
        '%68' => 'h',
5809
        '%69' => 'i',
5810
        '%6A' => 'j',
5811
        '%6B' => 'k',
5812
        '%6C' => 'l',
5813
        '%6D' => 'm',
5814
        '%6E' => 'n',
5815
        '%6F' => 'o',
5816
        '%70' => 'p',
5817
        '%71' => 'q',
5818
        '%72' => 'r',
5819
        '%73' => 's',
5820
        '%74' => 't',
5821
        '%75' => 'u',
5822
        '%76' => 'v',
5823
        '%77' => 'w',
5824
        '%78' => 'x',
5825
        '%79' => 'y',
5826
        '%7A' => 'z',
5827
        '%7B' => '{',
5828
        '%7C' => '|',
5829
        '%7D' => '}',
5830
        '%7E' => '~',
5831
        '%7F' => '',
5832
        '%80' => '`',
5833
        '%81' => '',
5834
        '%82' => '‚',
5835
        '%83' => 'ƒ',
5836
        '%84' => '„',
5837
        '%85' => '…',
5838
        '%86' => '†',
5839
        '%87' => '‡',
5840
        '%88' => 'ˆ',
5841
        '%89' => '‰',
5842
        '%8A' => 'Š',
5843
        '%8B' => '‹',
5844
        '%8C' => 'Œ',
5845
        '%8D' => '',
5846
        '%8E' => 'Ž',
5847
        '%8F' => '',
5848
        '%90' => '',
5849
        '%91' => '‘',
5850
        '%92' => '’',
5851
        '%93' => '“',
5852
        '%94' => '”',
5853
        '%95' => '•',
5854
        '%96' => '–',
5855
        '%97' => '—',
5856
        '%98' => '˜',
5857
        '%99' => '™',
5858
        '%9A' => 'š',
5859
        '%9B' => '›',
5860
        '%9C' => 'œ',
5861
        '%9D' => '',
5862
        '%9E' => 'ž',
5863
        '%9F' => 'Ÿ',
5864
        '%A0' => '',
5865
        '%A1' => '¡',
5866
        '%A2' => '¢',
5867
        '%A3' => '£',
5868
        '%A4' => '¤',
5869
        '%A5' => '¥',
5870
        '%A6' => '¦',
5871
        '%A7' => '§',
5872
        '%A8' => '¨',
5873
        '%A9' => '©',
5874 1
        '%AA' => 'ª',
5875
        '%AB' => '«',
5876 1
        '%AC' => '¬',
5877
        '%AD' => '',
5878
        '%AE' => '®',
5879
        '%AF' => '¯',
5880
        '%B0' => '°',
5881
        '%B1' => '±',
5882
        '%B2' => '²',
5883
        '%B3' => '³',
5884
        '%B4' => '´',
5885
        '%B5' => 'µ',
5886 6
        '%B6' => '¶',
5887
        '%B7' => '·',
5888 6
        '%B8' => '¸',
5889 6
        '%B9' => '¹',
5890
        '%BA' => 'º',
5891 6
        '%BB' => '»',
5892
        '%BC' => '¼',
5893 6
        '%BD' => '½',
5894 3
        '%BE' => '¾',
5895
        '%BF' => '¿',
5896
        '%C0' => 'À',
5897
        '%C1' => 'Á',
5898 6
        '%C2' => 'Â',
5899
        '%C3' => 'Ã',
5900 6
        '%C4' => 'Ä',
5901 1
        '%C5' => 'Å',
5902 1
        '%C6' => 'Æ',
5903 1
        '%C7' => 'Ç',
5904
        '%C8' => 'È',
5905 6
        '%C9' => 'É',
5906
        '%CA' => 'Ê',
5907
        '%CB' => 'Ë',
5908
        '%CC' => 'Ì',
5909
        '%CD' => 'Í',
5910
        '%CE' => 'Î',
5911
        '%CF' => 'Ï',
5912
        '%D0' => 'Ð',
5913
        '%D1' => 'Ñ',
5914
        '%D2' => 'Ò',
5915 6
        '%D3' => 'Ó',
5916
        '%D4' => 'Ô',
5917 6
        '%D5' => 'Õ',
5918
        '%D6' => 'Ö',
5919 6
        '%D7' => '×',
5920 6
        '%D8' => 'Ø',
5921
        '%D9' => 'Ù',
5922
        '%DA' => 'Ú',
5923 5
        '%DB' => 'Û',
5924 5
        '%DC' => 'Ü',
5925
        '%DD' => 'Ý',
5926 5
        '%DE' => 'Þ',
5927 1
        '%DF' => 'ß',
5928 1
        '%E0' => 'à',
5929 1
        '%E1' => 'á',
5930
        '%E2' => 'â',
5931 5
        '%E3' => 'ã',
5932
        '%E4' => 'ä',
5933
        '%E5' => 'å',
5934
        '%E6' => 'æ',
5935
        '%E7' => 'ç',
5936
        '%E8' => 'è',
5937
        '%E9' => 'é',
5938
        '%EA' => 'ê',
5939
        '%EB' => 'ë',
5940
        '%EC' => 'ì',
5941
        '%ED' => 'í',
5942
        '%EE' => 'î',
5943
        '%EF' => 'ï',
5944
        '%F0' => 'ð',
5945
        '%F1' => 'ñ',
5946
        '%F2' => 'ò',
5947
        '%F3' => 'ó',
5948
        '%F4' => 'ô',
5949
        '%F5' => 'õ',
5950
        '%F6' => 'ö',
5951
        '%F7' => '÷',
5952
        '%F8' => 'ø',
5953
        '%F9' => 'ù',
5954
        '%FA' => 'ú',
5955
        '%FB' => 'û',
5956
        '%FC' => 'ü',
5957
        '%FD' => 'ý',
5958
        '%FE' => 'þ',
5959
        '%FF' => 'ÿ',
5960
    );
5961 1
5962
    return $array;
5963 1
  }
5964
5965
  /**
   * Decodes an UTF-8 string to ISO-8859-1.
   *
   * The input is first passed through "UTF8::to_utf8()", then the characters listed in
   * "self::$utf8ToWin1252" are swapped for their mapped values before the final decoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The decoded string, or an empty string for empty input.</p>
   */
  public static function utf8_decode($str)
  {
    // search / replace lists, built once per request from the class-level mapping table
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $utf8 = self::to_utf8($str);

    if ($search === null) {
      $search = array_keys(self::$utf8ToWin1252);
      $replace = array_values(self::$utf8ToWin1252);
    }

    $mapped = str_replace($search, $replace, $utf8);

    return Xml::utf8_decode($mapped);
  }
5993 1
5994 1
  /**
   * Encodes an ISO-8859-1 string to UTF-8.
   *
   * After the native encoding step, the byte sequences listed in "self::$cp1252ToUtf8"
   * are replaced by their mapped values.
   *
   * NOTE(review): "\utf8_encode()" is deprecated as of PHP 8.2 - consider "mb_convert_encoding()"
   * once support for old PHP versions is dropped.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_encode($str)
  {
    // search / replace lists, built once per request from the class-level mapping table
    static $cp1252ToUtf8Keys = null;
    static $cp1252ToUtf8Values = null;

    $str = \utf8_encode($str);

    // without a "\xC2" byte there is nothing the mapping table could match
    if (strpos($str, "\xC2") === false) {
      return $str;
    }

    if ($cp1252ToUtf8Keys === null) {
      $cp1252ToUtf8Keys = array_keys(self::$cp1252ToUtf8);
      $cp1252ToUtf8Values = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($cp1252ToUtf8Keys, $cp1252ToUtf8Values, $str);
  }
6020 9
6021 9
  /**
   * Fix "utf8-win1252" chars; kept as a thin wrapper around "UTF8::fix_simple_utf8()".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   */
  public static function utf8_fix_win1252_chars($str)
  {
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
6034
6035 8
  /**
   * Returns the class-level table of utf8 whitespace characters.
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E.
   *
   * @return array <p>
   *               An array with all known whitespace characters as values and the type of whitespace as keys
   *               as defined in above URL.
   *               </p>
   */
  public static function whitespace_table()
  {
    $table = self::$whitespaceTable;

    return $table;
  }
6051 8
6052
  /**
   * Cuts a string down to a maximum number of words.
   *
   * If the string already fits into the limit (or the pattern does not match at all),
   * it is returned unchanged; otherwise the kept part is right-trimmed and $strAddOn is appended.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $words    <p>The limit of words as integer.</p>
   * @param string $strAddOn <p>Replacement for the stripped part of the string.</p>
   *
   * @return string <p>An empty string for empty input or a limit below 1.</p>
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;
    if ('' === $str) {
      return '';
    }

    $words = (int)$words;
    if ($words < 1) {
      return '';
    }

    // Optional leading whitespace, then between 1 and $words runs of
    // "non-whitespace followed by optional whitespace".
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $words . '}/u';
    preg_match($pattern, $str, $found);

    // no match -> hand the input back untouched
    if (!isset($found[0])) {
      return $str;
    }

    // the match covers the whole string -> nothing was cut off
    $head = $found[0];
    if (self::strlen($str) === self::strlen($head)) {
      return $str;
    }

    return self::rtrim($head) . $strAddOn;
  }
6087
6088
  /**
   * Wraps a string to a given number of characters.
   *
   * The string is translated into a single-byte "mask" (one mask byte per character as returned
   * by self::split()), the native wordwrap() is run on that mask, and the positions of the line
   * breaks in the wrapped mask are then replayed on the real characters.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>The input string.</p>
   * @param int    $width [optional] <p>The column width.</p>
   * @param string $break [optional] <p>The line is broken using the optional break parameter.</p>
   * @param bool   $cut   [optional] <p>
   *                      If the cut is set to true, the string is
   *                      always wrapped at or before the specified width. So if you have
   *                      a word that is larger than the given width, it is broken apart.
   *                      </p>
   *
   * @return string <p>
   *                The given string wrapped at the specified column,
   *                or an empty string if $str or $break is empty.
   *                </p>
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if (!isset($str[0], $break[0])) {
      return '';
    }

    // Both strings are non-empty from here on, so explode() always yields at
    // least one non-empty segment or several segments; no extra "empty result"
    // check is needed.
    $mask = '';
    $chars = array();
    foreach (explode($break, $str) as $index => $segment) {

      // Every segment after the first was preceded by an existing break:
      // keep the break as one "character" and mark it with "#" in the mask.
      if ($index > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      // One mask byte per character: spaces stay spaces (possible wrap
      // points), everything else becomes "?".
      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        $mask .= ' ' === $char ? ' ' : '?';
      }
    }

    $wrapped = '';
    $charPos = 0;               // next not yet consumed entry of $chars
    $breakPos = $maskPos = -1;  // positions inside the wrapped mask

    // The native function only ever sees the single-byte mask, "#" is the break marker.
    $mask = wordwrap($mask, $width, '#', $cut);

    while (false !== $breakPos = self::strpos($mask, '#', $breakPos + 1)) {

      // copy all characters in front of this break
      for (++$maskPos; $maskPos < $breakPos; ++$maskPos) {
        $wrapped .= $chars[$charPos];
        unset($chars[$charPos++]);
      }

      // The break either replaces an original break / a space (which is
      // dropped here) or was inserted in the middle of a word (nothing to drop).
      if ($break === $chars[$charPos] || ' ' === $chars[$charPos]) {
        unset($chars[$charPos++]);
      }

      $wrapped .= $break;
    }

    // whatever is left belongs to the last line
    return $wrapped . implode('', $chars);
  }
6159
6160
  /**
   * Gives access to the list of Unicode White Space characters.
   *
   * @return array <p>An array with numeric code point as key and White Space Character as value.</p>
   */
  public static function ws()
  {
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6169
6170
}
6171