Completed
Push — master ( 480a3f...9001bb )
by Lars
04:22
created

UTF8::parse_str()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 12
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3

Importance

Changes 0
Metric Value
dl 0
loc 12
ccs 5
cts 5
cp 1
rs 9.4285
c 0
b 0
f 0
cc 3
eloc 6
nc 2
nop 2
crap 3
1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
   * Bom => Byte-Length
   *
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
   *
   * Every key is a byte-order-mark signature and every value is the number of bytes that key occupies.
   * The 'as "WINDOWS-1252"' entries are the BOM bytes after they were wrongly decoded as Windows-1252
   * and stored again as UTF-8. All keys are written as byte escapes, so that editors or re-encoding
   * tools cannot silently alter (or drop) them.
   *
   * @var array
   */
  private static $bom = array(
      "\xef\xbb\xbf"             => 3, // UTF-8 BOM
      "\xc3\xaf\xc2\xbb\xc2\xbf" => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more than one byte ...)
      "\x00\x00\xfe\xff"         => 4, // UTF-32 (BE) BOM
      "\xff\xfe\x00\x00"         => 4, // UTF-32 (LE) BOM
      "\xfe\xff"                 => 2, // UTF-16 (BE) BOM
      "\xc3\xbe\xc3\xbf"         => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
      "\xff\xfe"                 => 2, // UTF-16 (LE) BOM
      "\xc3\xbf\xc3\xbe"         => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    //HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
   * Constructor.
   *
   * Runs the environment detection (UTF8::checkForSupport()) as soon as an instance is created.
   */
  public function __construct()
  {
    // the class is final, so late static binding resolves to this very class
    static::checkForSupport();
  }
811
812
  /**
   * Fetch the single character located at a given position, similar to $str[1] on a byte string.
   *
   * Delegates to UTF8::substr() with a fixed length of one character.
   *
   * @param string $str <p>A UTF-8 string.</p>
   * @param int    $pos <p>The position of character to return.</p>
   *
   * @return string <p>Single Multi-Byte character.</p>
   */
  public static function access($str, $pos)
  {
    $character = self::substr($str, $pos, 1);

    return $character;
  }
824
825
  /**
   * Ensure that a string starts with the UTF-8 BOM.
   *
   * INFO: A string for which UTF8::string_has_bom() does not report false is returned unchanged.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The output string that contains BOM.</p>
   */
  public static function add_bom_to_string($str)
  {
    // nothing to do when a BOM was already detected
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    return self::bom() . $str;
  }
842
843
  /**
   * Convert a string of binary digits ("0" / "1") into the bytes it represents.
   *
   * The digits are converted to hexadecimal four bits at a time, so inputs of any length are
   * converted exactly. (A conversion via base_convert() goes through a float and silently loses
   * precision on long inputs.)
   *
   * Leading zeros are insignificant and characters other than "0" and "1" are ignored. An odd
   * number of resulting hex digits is handled by pack('H*'), which pads the last byte's low nibble.
   *
   * @param mixed $bin 1|0
   *
   * @return string
   */
  public static function binary_to_str($bin)
  {
    // keep only binary digits and drop leading zeros
    $bits = ltrim((string)preg_replace('/[^01]/', '', (string)$bin), '0');

    if ($bits === '') {
      // empty or all-zero input is the number zero, i.e. the single hex digit "0"
      return pack('H*', '0');
    }

    // left-pad to whole nibbles, so that every group of four bits maps to exactly one hex digit
    $missingBits = (4 - (strlen($bits) % 4)) % 4;
    $bits = str_repeat('0', $missingBits) . $bits;

    $hex = '';
    foreach (str_split($bits, 4) as $nibble) {
      $hex .= dechex(bindec($nibble));
    }

    return pack('H*', $hex);
  }
854
855
  /**
   * Get the UTF-8 Byte Order Mark.
   *
   * @return string UTF-8 Byte Order Mark (the three bytes EF BB BF)
   */
  public static function bom()
  {
    return chr(0xEF) . chr(0xBB) . chr(0xBF);
  }
864
865
  /**
   * Apply a callback to the characters of a string (forwards to UTF8::chr_map()).
   *
   * @alias of UTF8::chr_map()
   * @see   UTF8::chr_map()
   *
   * @param string|array $callback
   * @param string       $str
   *
   * @return array
   */
  public static function callback($callback, $str)
  {
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
   * Detect once which UTF-8 related features the server environment offers.
   *
   * The results are stored in self::$support under the keys "mbstring", "iconv", "intl",
   * "intlChar" and "pcre_utf8". Later calls return immediately.
   *
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
   */
  public static function checkForSupport()
  {
    // the detection has already been done
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      return;
    }

    // mark as checked first, then collect the individual feature flags
    self::$support['already_checked_via_portable_utf8'] = true;

    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
   * Generates a UTF-8 encoded character from the given code point.
   *
   * INFO: opposite to UTF8::ord()
   *
   * If the "intlChar" support flag is set, the call is delegated to \IntlChar::chr(). Otherwise the
   * bytes are assembled manually and memoized per code point in a static cache.
   *
   * @param int $code_point <p>The code point for which to generate a character.</p>
   *
   * @return string|null <p>Multi-Byte character, returns null on failure to encode: the value is not
   *                     an integer, or (manual fallback) it is negative. In the manual fallback,
   *                     values above U+10FFFF yield U+FFFD REPLACEMENT CHARACTER.</p>
   */
  public static function chr($code_point)
  {
    // strict check: numeric strings, floats etc. are rejected
    $i = (int)$code_point;
    if ($i !== $code_point) {
      return null;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // There is no negative code point. Without this guard the value would reach the native chr(),
    // which wraps it modulo 256 and emits a lone high byte, i.e. invalid UTF-8.
    if ($code_point < 0) {
      return null;
    }

    // use static cache, if there is no support for "IntlChar"
    static $cache = array();
    if (isset($cache[$code_point]) === true) {
      return $cache[$code_point];
    }

    if ($code_point <= 0x7f) {
      // 1 byte: 0xxxxxxx
      $chr = chr($code_point);
    } elseif ($code_point <= 0x7ff) {
      // 2 bytes: 110xxxxx 10xxxxxx
      $chr = chr(0xc0 | ($code_point >> 6)) .
             chr(0x80 | ($code_point & 0x3f));
    } elseif ($code_point <= 0xffff) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      $chr = chr(0xe0 | ($code_point >> 12)) .
             chr(0x80 | (($code_point >> 6) & 0x3f)) .
             chr(0x80 | ($code_point & 0x3f));
    } elseif ($code_point <= 0x10ffff) {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      $chr = chr(0xf0 | ($code_point >> 18)) .
             chr(0x80 | (($code_point >> 12) & 0x3f)) .
             chr(0x80 | (($code_point >> 6) & 0x3f)) .
             chr(0x80 | ($code_point & 0x3f));
    } else {
      # U+FFFD REPLACEMENT CHARACTER
      $chr = "\xEF\xBF\xBD";
    }

    return $cache[$code_point] = $chr;
  }
954
955
  /**
   * Run a callback on the elements UTF8::split() produces for a string.
   *
   * @param string|array $callback <p>The callback function.</p>
   * @param string       $str      <p>UTF-8 string to run callback on.</p>
   *
   * @return array <p>The outcome of callback.</p>
   */
  public static function chr_map($callback, $str)
  {
    return array_map($callback, self::split($str));
  }
969
970
  /**
   * Generates an array of byte length of each character of a Unicode string.
   *
   * 1 byte => U+0000  - U+007F
   * 2 byte => U+0080  - U+07FF
   * 3 byte => U+0800  - U+FFFF
   * 4 byte => U+10000 - U+10FFFF
   *
   * The string is split via UTF8::split() and strlen() is applied to every resulting element.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return array <p>An array of byte lengths of each character, an empty array for empty input.</p>
   */
  public static function chr_size_list($str)
  {
    // "0" is falsy in PHP, but it is still a one-character string that has a size
    if (!$str && $str !== '0') {
      return array();
    }

    return array_map('strlen', self::split($str));
  }
990
991
  /**
   * Get a decimal code representation of a specific character.
   *
   * The lead byte decides how many bytes belong to the character (1 to 4). Its payload bits are
   * then combined with the payload bits of the following continuation bytes.
   *
   * @param string $char <p>The input character.</p>
   *
   * @return int <p>The decoded value; 0 for an empty string. If the lead byte is not a valid UTF-8
   *             lead byte, its own value is returned.</p>
   */
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;

    // avoid reading an uninitialized string offset on empty input
    if (!isset($char[0])) {
      return 0;
    }

    $code = self::ord($char[0]);
    $bytes = 1;

    if (!($code & 0x80)) {
      // 0xxxxxxx
      return $code;
    }

    if (($code & 0xe0) === 0xc0) {
      // 110xxxxx
      $bytes = 2;
      $code &= ~0xc0;
    } elseif (($code & 0xf0) === 0xe0) {
      // 1110xxxx
      $bytes = 3;
      $code &= ~0xe0;
    } elseif (($code & 0xf8) === 0xf0) {
      // 11110xxx
      $bytes = 4;
      $code &= ~0xf0;
    }

    for ($i = 2; $i <= $bytes; $i++) {
      // 10xxxxxx
      // a missing continuation byte (truncated input) contributes zero bits instead of
      // triggering an uninitialized-offset warning
      $payload = isset($char[$i - 1]) ? (self::ord($char[$i - 1]) & ~0x80) : 0;
      $code = ($code << 6) + $payload;
    }

    return $code;
  }
1030
1031
  /**
   * Get hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * The character is first turned into its code point via UTF8::ord(), which is then formatted
   * by UTF8::int_to_hex() using the given prefix.
   *
   * @param string $char <p>The input character</p>
   * @param string $pfix [optional]
   *
   * @return string <p>The code point encoded as U+xxxx</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1043
1044
  /**
   * Splits a string into smaller chunks and joins them with the specified line ending character(s).
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) placed between two consecutive chunks.</p>
   *
   * @return string <p>The chunked string.</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    // implode() only puts $end between the chunks, not after the last one.
    return implode($end, $chunks);
  }
1057
1058
  /**
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
   *
   * Processing order: strip stray bytes, remove the diamond question mark, remove
   * invisible characters, then apply the optional steps selected by the flags.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of structurally well-formed sequences, groups 2
    // and 3 match stray bytes. Replacing every match with '$1' keeps the
    // well-formed runs and drops the stray bytes.
    $wellFormedPattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';
    $filtered = preg_replace($wellFormedPattern, '$1', $str);

    $filtered = self::remove_invisible_characters(
        self::replace_diamond_question_mark($filtered, '')
    );

    // The optional steps only run when the flag is the boolean true.
    if (true === $normalize_whitespace) {
      $filtered = self::normalize_whitespace($filtered, $keep_non_breaking_space);
    }

    if (true === $normalize_msword) {
      $filtered = self::normalize_msword($filtered);
    }

    if (true === $remove_bom) {
      $filtered = self::removeBOM($filtered);
    }

    return $filtered;
  }
1106
1107
  /**
   * Clean-up a string and keep only printable UTF-8 chars at the end + fix UTF-8 encoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The cleaned string, or an empty string for empty input.</p>
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // fix ISO <-> UTF-8 errors first
    $repaired = self::fix_simple_utf8($str);

    // clean() is called with:
    //   remove_bom              = true
    //   normalize_whitespace    = true
    //   normalize_msword        = false
    //   keep_non_breaking_space = true
    // and on its own it also removes non UTF-8 bytes, the diamond question
    // mark (�) and invisible characters.
    return (string)self::clean($repaired, true, true, false, true);
  }
1134
1135
  /**
   * Accepts a string or a array of strings and returns an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If True, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    // A string is split into pieces first; anything else is mapped as given.
    $pieces = is_string($arg) ? self::split($arg) : $arg;

    $points = array_map(array('\\voku\\helper\\UTF8', 'ord'), $pieces);

    if (!$u_style) {
      return $points;
    }

    // Second pass: format every value through int_to_hex().
    return array_map(array('\\voku\\helper\\UTF8', 'int_to_hex'), $points);
  }
1172
1173
  /**
   * Returns count of characters used in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array of Character as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // Split with a chunk length of 1, then tally how often each piece occurs.
    $chars = self::split($str, 1, $cleanUtf8);

    return array_count_values($chars);
  }
1186
1187
  /**
   * Get a UTF-8 character from its decimal code representation.
   *
   * @param int $code <p>The decimal code of the character.</p>
   *
   * @return string <p>The result of converting the matching hexadecimal HTML entity to UTF-8.</p>
   */
  public static function decimal_to_chr($code)
  {
    // Build a hexadecimal numeric entity (e.g. "&#x20ac;") and let mbstring
    // translate it into UTF-8.
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1202
1203
  /**
   * Encode a string with a new charset-encoding.
   *
   * INFO:  The different to "UTF8::utf8_encode()" is that this function, try to fix also broken / double encoding,
   *        so you can call this function also on a UTF-8 String and you don't mess the string.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>Force the new encoding (we try to fix broken / double encoding for UTF-8)<br
   *                         /> otherwise we auto-detect the current string-encoding</p>
   *
   * @return string <p>The converted string, or the (string-cast) input when nothing could or had to be converted.</p>
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    // nothing to do without a string or without a target encoding
    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $encodingDetected = self::str_detect_encoding($str);

    // Detection yields false|string; test for a usable name explicitly
    // instead of relying on loose truthiness.
    if (!is_string($encodingDetected) || $encodingDetected === '') {
      return $str;
    }

    // Without $force a string that is already in the target encoding is left alone.
    if ($force !== true && $encodingDetected === $encoding) {
      return $str;
    }

    if (
        $encoding === 'UTF-8'
        &&
        (
            $force === true
            || $encodingDetected === 'UTF-8'
            || $encodingDetected === 'WINDOWS-1252'
            || $encodingDetected === 'ISO-8859-1'
        )
    ) {
      return self::to_utf8($str);
    }

    if (
        $encoding === 'ISO-8859-1'
        &&
        (
            $force === true
            || $encodingDetected === 'ISO-8859-1'
            || $encodingDetected === 'UTF-8'
        )
    ) {
      return self::to_iso8859($str);
    }

    $strEncoded = \mb_convert_encoding(
        $str,
        $encoding,
        $encodingDetected
    );

    // Accept every non-empty conversion result. A plain truthiness test
    // would throw away the valid result "0".
    if ($strEncoded !== false && $strEncoded !== '') {
      return $strEncoded;
    }

    return $str;
  }
1279
1280
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>Name of the file to read. The value is passed through
   *                                     filter_var(..., FILTER_SANITIZE_STRING) before it is used.</p>
   * @param int|null      $flags         [optional] <p>Forwarded as the second argument of the native
   *                                     file_get_contents() (use_include_path); pass FILE_USE_INCLUDE_PATH
   *                                     to search for the file in the include path.</p>
   * @param resource|null $context       [optional] <p>A valid context resource created with
   *                                     stream_context_create(). If null and $timeout is non-zero, a context
   *                                     with the 'http' timeout option is created.</p>
   * @param int|null      $offset        [optional] <p>The offset where the reading starts.</p>
   * @param int|null      $maxlen        [optional] <p>Maximum length of data read; only forwarded when it is
   *                                     an integer, otherwise the file is read until its end.</p>
   * @param int           $timeout       <p>The time in seconds for the timeout.</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The function returns the read data or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;
    // NOTE(review): FILTER_SANITIZE_STRING is deprecated as of PHP 8.1 and
    // rewrites names that contain quotes or tags - confirm it is still wanted.
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // The native function declares these parameters as non-nullable bool / int.
    // Cast the null defaults explicitly (same values the implicit coercion
    // produced) so that no null is handed to the internal function.
    $useIncludePath = (bool)$flags;
    $offset = (int)$offset;

    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      // without $force, encode() relies on the detected source encoding
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1397
1398
  /**
   * Checks if a file starts with BOM (Byte Order Mark) character.
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise
   *              (also when the file could not be read).</p>
   */
  public static function file_has_bom($file_path)
  {
    $content = file_get_contents($file_path);

    // An unreadable file cannot be inspected: report "no BOM" instead of
    // handing the boolean false to the string check.
    if ($content === false) {
      return false;
    }

    return self::string_has_bom($content);
  }
1409
1410
  /**
   * Normalizes strings to the given normalization form (NFC by default), recursing into arrays and objects.
   *
   * Arrays and objects are walked element by element / property by property; object
   * properties are overwritten on the object that was passed in. Values of any other
   * non-string type are returned untouched. For strings "\r\n" and "\r" become "\n";
   * non-ASCII strings are normalized, and when normalization yields no result the
   * string is passed through self::encode('UTF-8', ...) instead.
   *
   * @param mixed  $var                <p>The value to filter.</p>
   * @param int    $normalization_form <p>Form passed to \Normalizer, 4 (NFC) by default.</p>
   * @param string $leading_combining  <p>Prepended when the result starts with a combining mark (\p{Mn}).</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    $type = gettype($var);

    if ($type === 'array') {
      foreach ($var as $key => $value) {
        /** @noinspection AlterInForeachInspection */
        $var[$key] = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type === 'object') {
      foreach ($var as $key => $value) {
        $var->{$key} = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type !== 'string') {
      return $var;
    }

    if (false !== strpos($var, "\r")) {
      // Workaround https://bugs.php.net/65732
      $var = str_replace(array("\r\n", "\r"), "\n", $var);
    }

    // pure ASCII needs no normalization
    if (self::is_ascii($var) !== false) {
      return $var;
    }

    if (\Normalizer::isNormalized($var, $normalization_form)) {
      // placeholder: marks "already normalized" for the isset() check below
      $normalized = '-';
    } else {
      $normalized = \Normalizer::normalize($var, $normalization_form);

      $var = isset($normalized[0]) ? $normalized : self::encode('UTF-8', $var);
    }

    // Prevent leading combining chars
    // for NFC-safe concatenations.
    if (
        $var[0] >= "\x80" && isset($normalized[0], $leading_combining[0])
        &&
        preg_match('/^\p{Mn}/u', $var)
    ) {
      $var = $leading_combining . $var;
    }

    return $var;
  }
1469
1470
  /**
   * "filter_input()"-wrapper whose result is passed through UTF8::filter().
   *
   * @param int    $type   <p>Forwarded to the native filter_input().</p>
   * @param string $var    <p>Forwarded to the native filter_input().</p>
   * @param int    $filter <p>Forwarded to the native filter_input().</p>
   * @param mixed  $option <p>Only forwarded when the caller supplied it.</p>
   *
   * @return mixed
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    // Leave $option out of the native call unless four arguments were given.
    $value = func_num_args() < 4
        ? filter_input($type, $var, $filter)
        : filter_input($type, $var, $filter, $option);

    return self::filter($value);
  }
1490
1491
  /**
   * "filter_input_array()"-wrapper whose result is passed through UTF8::filter().
   *
   * @param int   $type       <p>Forwarded to the native filter_input_array().</p>
   * @param mixed $definition <p>Only forwarded when at least two arguments were given.</p>
   * @param bool  $add_empty  <p>Only forwarded when at least two arguments were given.</p>
   *
   * @return mixed
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    // With a single argument the native defaults are used.
    $values = func_num_args() < 2
        ? filter_input_array($type)
        : filter_input_array($type, $definition, $add_empty);

    return self::filter($values);
  }
1510
1511
  /**
   * "filter_var()"-wrapper whose result is passed through UTF8::filter().
   *
   * @param mixed $var    <p>Forwarded to the native filter_var().</p>
   * @param int   $filter <p>Forwarded to the native filter_var().</p>
   * @param mixed $option <p>Only forwarded when the caller supplied it.</p>
   *
   * @return mixed
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    // Leave $option out of the native call unless three arguments were given.
    $value = func_num_args() < 3
        ? filter_var($var, $filter)
        : filter_var($var, $filter, $option);

    return self::filter($value);
  }
1530
1531
  /**
   * "filter_var_array()"-wrapper whose result is passed through UTF8::filter().
   *
   * @param array $data       <p>Forwarded to the native filter_var_array().</p>
   * @param mixed $definition <p>Only forwarded when at least two arguments were given.</p>
   * @param bool  $add_empty  <p>Only forwarded when at least two arguments were given.</p>
   *
   * @return mixed
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    // With a single argument the native defaults are used.
    $values = func_num_args() < 2
        ? filter_var_array($data)
        : filter_var_array($data, $definition, $add_empty);

    return self::filter($values);
  }
1550
1551
  /**
   * Check if the number of unicode characters are not more than the specified integer.
   *
   * @param string $str      The original string to be checked.
   * @param int    $box_size The size in number of chars to be checked against string.
   *
   * @return bool true if string is less than or equal to $box_size, false otherwise.
   */
  public static function fits_inside($str, $box_size)
  {
    // length as reported by self::strlen()
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1563
1564
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The string with every key of self::$brokenUtf8ToUtf8 replaced by its value.</p>
   */
  public static function fix_simple_utf8($str)
  {
    // search / replace lists, built once on first use and kept between calls
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($search === null) {
      $search = array_keys(self::$brokenUtf8ToUtf8);
      $replace = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($search, $replace, $str);
  }
1595
1596
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * Arrays are processed element by element (keys are kept). A single value is
   * run through utf8_decode() + to_utf8() repeatedly until it no longer changes.
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed
   */
  public static function fix_utf8($str)
  {
    if (is_array($str)) {

      /** @noinspection ForeachSourceInspection */
      foreach ($str as $key => $value) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$key] = self::fix_utf8($value);
      }

      return $str;
    }

    // an empty string has nothing to repair
    if ($str === '') {
      return $str;
    }

    // repeat the decode / re-encode round trip until a pass changes nothing
    do {
      $previous = $str;
      $str = self::to_utf8(self::utf8_decode($str));
    } while ($previous !== $str);

    return $str;
  }
1625
1626
  /**
1627
   * Get the text direction of a specific character.
1628
   *
1629
   * @param string $char
1630
   *
1631
   * @return string <p>'RTL' or 'LTR'</p>
1632 1
   */
1633
  public static function getCharDirection($char)
  {
    // make sure the support flags have been collected before they are read
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      // from "IntlChar"-Class
      $intlDirections = array(
          'LTR' => array(0, 11, 12, 20),
          'RTL' => array(1, 13, 14, 15, 21),
      );

      $intlCode = \IntlChar::charDirection($char);
      foreach ($intlDirections as $direction => $codes) {
        if (in_array($intlCode, $codes, true)) {
          return $direction;
        }
      }
      // a code that is in neither list is resolved by the tables below
    }

    $codePoint = static::chr_to_decimal($char);

    // every right-to-left entry below lies within 0x5be..0x10b7f
    if ($codePoint < 0x5be || $codePoint > 0x10b7f) {
      return 'LTR';
    }

    // single code points that are right-to-left (compared strictly)
    $rtlSingles = array(
        0x5be, 0x5c0, 0x5c3, 0x5c6,
        0x608, 0x60b, 0x60d, 0x61b,
        0x710, 0x7b1, 0x7fa,
        0x81a, 0x824, 0x828, 0x85e,
        0x200f,
        0xfb1d, 0xfb3e,
        0x10808, 0x1083c, 0x1093f, 0x10a00,
    );

    if (in_array($codePoint, $rtlSingles, true)) {
      return 'RTL';
    }

    // inclusive [first, last] ranges that are right-to-left
    $rtlRanges = array(
        array(0x5d0, 0x5ea),
        array(0x5f0, 0x5f4),
        array(0x61e, 0x64a),
        array(0x66d, 0x66f),
        array(0x671, 0x6d5),
        array(0x6e5, 0x6e6),
        array(0x6ee, 0x6ef),
        array(0x6fa, 0x70d),
        array(0x712, 0x72f),
        array(0x74d, 0x7a5),
        array(0x7c0, 0x7ea),
        array(0x7f4, 0x7f5),
        array(0x800, 0x815),
        array(0x830, 0x83e),
        array(0x840, 0x858),
        array(0xfb1f, 0xfb28),
        array(0xfb2a, 0xfb36),
        array(0xfb38, 0xfb3c),
        array(0xfb40, 0xfb41),
        array(0xfb43, 0xfb44),
        array(0xfb46, 0xfbc1),
        array(0xfbd3, 0xfd3d),
        array(0xfd50, 0xfd8f),
        array(0xfd92, 0xfdc7),
        array(0xfdf0, 0xfdfc),
        array(0xfe70, 0xfe74),
        array(0xfe76, 0xfefc),
        array(0x10800, 0x10805),
        array(0x1080a, 0x10835),
        array(0x10837, 0x10838),
        array(0x1083f, 0x10855),
        array(0x10857, 0x1085f),
        array(0x10900, 0x1091b),
        array(0x10920, 0x10939),
        array(0x10a10, 0x10a13),
        array(0x10a15, 0x10a17),
        array(0x10a19, 0x10a33),
        array(0x10a40, 0x10a47),
        array(0x10a50, 0x10a58),
        array(0x10a60, 0x10a7f),
        array(0x10b00, 0x10b35),
        array(0x10b40, 0x10b55),
        array(0x10b58, 0x10b72),
        array(0x10b78, 0x10b7f),
    );

    foreach ($rtlRanges as $range) {
      if ($range[0] <= $codePoint && $range[1] >= $codePoint) {
        return 'RTL';
      }
    }

    return 'LTR';
  }
1745
1746
  /**
1747
   * Get data from "/data/*.php".
1748
   *
1749
   * @param string $file
1750
   *
1751
   * @return bool|string|array|int <p>Will return false on error.</p>
1752 1
   */
1753
  private static function getData($file)
  {
    // data files live next to this class in "data/<name>.php"
    $file = __DIR__ . '/data/' . $file . '.php';

    if (!file_exists($file)) {
      return false;
    }

    // the data file hands back its content via "return"
    /** @noinspection PhpIncludeInspection */
    return require $file;
  }
1763
1764
  /**
1765
   * alias for "UTF8::string_has_bom()"
1766
   *
1767
   * @see UTF8::string_has_bom()
1768
   *
1769
   * @param string $str
1770
   *
1771
   * @return bool
1772 1
   */
1773
  public static function hasBom($str)
  {
    // plain alias: the work is done by string_has_bom()
    $result = self::string_has_bom($str);

    return $result;
  }
1777
1778
  /**
1779
   * Converts hexadecimal U+xxxx code point representation to integer.
1780
   *
1781
   * INFO: opposite to UTF8::int_to_hex()
1782
   *
1783
   * @param string $str <p>The hexadecimal code point representation.</p>
1784
   *
1785
   * @return int|false <p>The code point, or false on failure.</p>
1786 1
   */
1787
  public static function hex_to_int($str)
  {
    // empty input (and anything else that is falsy) cannot be a code point
    if (!$str) {
      return false;
    }

    // Optional "\u" or "U+" prefix, followed by 4 to 6 hexadecimal digits.
    // Only [0-9a-f] (case-insensitive) is accepted: other letters are not hex
    // digits and intval() would silently turn e.g. "zzzz" into 0 instead of
    // reporting the failure.
    if (preg_match('/^(?:\\\u|U\+|)([a-f0-9]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
1799
1800
  /**
1801
   * alias for "UTF8::html_entity_decode()"
1802
   *
1803
   * @see UTF8::html_entity_decode()
1804
   *
1805
   * @param string $str
1806
   * @param int    $flags
1807
   * @param string $encoding
1808
   *
1809
   * @return string
1810 1
   */
1811
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    // plain alias: all arguments are handed to html_entity_decode() as they are
    $decoded = self::html_entity_decode($str, $flags, $encoding);

    return $decoded;
  }
1815
1816
  /**
1817
   * Converts a UTF-8 string to a series of HTML numbered entities.
1818
   *
1819
   * INFO: opposite to UTF8::html_decode()
1820
   *
1821
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
1822
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
1823
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
1824
   *
1825
   * @return string <p>HTML numbered entities.</p>
1826 2
   */
1827
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // first code point that gets encoded: starting at 0x80 keeps 7 bit ASCII as it is
      $startCode = ($keepAsciiChars === true) ? 0x80 : 0x00;

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = array(first code point, last code point, offset, mask)
      //
      // The range has to reach U+10FFFF: with an upper bound of 0xffff every
      // character outside the Basic Multilingual Plane (e.g. emoji) was left
      // unencoded. The mask must keep all 21 bits of such code points.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // without "mb_encode_numericentity": encode the string character by character
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1857
1858
  /**
1859
   * UTF-8 version of html_entity_decode()
1860
   *
1861
   * The reason we are not using html_entity_decode() by itself is because
1862
   * while it is not technically correct to leave out the semicolon
1863
   * at the end of an entity most browsers will still interpret the entity
1864
   * correctly. html_entity_decode() does not convert entities without
1865
   * semicolons, so we are left with our own little solution here. Bummer.
1866
   *
1867
   * Convert all HTML entities to their applicable characters
1868
   *
1869
   * INFO: opposite to UTF8::html_encode()
1870
   *
1871
   * @link http://php.net/manual/en/function.html-entity-decode.php
1872
   *
1873
   * @param string $str      <p>
1874
   *                         The input string.
1875
   *                         </p>
1876
   * @param int    $flags    [optional] <p>
1877
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1878
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1879
   *                         <table>
1880
   *                         Available <i>flags</i> constants
1881
   *                         <tr valign="top">
1882
   *                         <td>Constant Name</td>
1883
   *                         <td>Description</td>
1884
   *                         </tr>
1885
   *                         <tr valign="top">
1886
   *                         <td><b>ENT_COMPAT</b></td>
1887
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1888
   *                         </tr>
1889
   *                         <tr valign="top">
1890
   *                         <td><b>ENT_QUOTES</b></td>
1891
   *                         <td>Will convert both double and single quotes.</td>
1892
   *                         </tr>
1893
   *                         <tr valign="top">
1894
   *                         <td><b>ENT_NOQUOTES</b></td>
1895
   *                         <td>Will leave both double and single quotes unconverted.</td>
1896
   *                         </tr>
1897
   *                         <tr valign="top">
1898
   *                         <td><b>ENT_HTML401</b></td>
1899
   *                         <td>
1900
   *                         Handle code as HTML 4.01.
1901
   *                         </td>
1902
   *                         </tr>
1903
   *                         <tr valign="top">
1904
   *                         <td><b>ENT_XML1</b></td>
1905
   *                         <td>
1906
   *                         Handle code as XML 1.
1907
   *                         </td>
1908
   *                         </tr>
1909
   *                         <tr valign="top">
1910
   *                         <td><b>ENT_XHTML</b></td>
1911
   *                         <td>
1912
   *                         Handle code as XHTML.
1913
   *                         </td>
1914
   *                         </tr>
1915
   *                         <tr valign="top">
1916
   *                         <td><b>ENT_HTML5</b></td>
1917
   *                         <td>
1918
   *                         Handle code as HTML 5.
1919
   *                         </td>
1920
   *                         </tr>
1921
   *                         </table>
1922
   *                         </p>
1923
   * @param string $encoding [optional] <p>Encoding to use.</p>
1924
   *
1925
   * @return string <p>The decoded string.</p>
1926 9
   */
1927
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // strings of up to three bytes (e.g. "&;" or "&x;") are returned as they are
    if (!isset($str[3])) {
      return $str;
    }

    // cheap pre-check: go on only if there is an "&" together with a "&#" or a ";"
    $hasAmpersand = strpos($str, '&') !== false;
    $hasNumericMarker = strpos($str, '&#') !== false;
    $hasSemicolon = strpos($str, ';') !== false;
    if (!$hasAmpersand || (!$hasNumericMarker && !$hasSemicolon)) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // default flags: ENT_HTML5 is only used when the running PHP version is reported as >= 5.4
    if ($flags === null) {
      $flags = (Bootup::is_php('5.4') === true) ? (ENT_COMPAT | ENT_HTML5) : ENT_COMPAT;
    }

    // converts one decimal entity via mbstring, but keeps encoded quotes untouched
    $decodeDecimalEntity = function ($matches) use ($encoding) {
      $decoded = \mb_convert_encoding($matches[0], $encoding, 'HTML-ENTITIES');

      if ($decoded === '"' || $decoded === "'") {
        return $matches[0];
      }

      return $decoded;
    };

    // repeat until a complete pass leaves the string unchanged
    do {
      $previous = $str;

      $str = preg_replace_callback("/&#\d{2,5};/", $decodeDecimalEntity, $str);

      // decode numeric & UTF16 two byte entities
      // (numeric entities without a trailing semicolon get one appended first)
      $withSemicolons = preg_replace('/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS', '$1;', $str);
      $str = html_entity_decode($withSemicolons, $flags, $encoding);

    } while ($previous !== $str);

    return $str;
  }
1991
1992
  /**
1993
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1994
   *
1995
   * @link http://php.net/manual/en/function.htmlentities.php
1996
   *
1997
   * @param string $str           <p>
1998
   *                              The input string.
1999
   *                              </p>
2000
   * @param int    $flags         [optional] <p>
2001
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2002
   *                              invalid code unit sequences and the used document type. The default is
2003
   *                              ENT_COMPAT | ENT_HTML401.
2004
   *                              <table>
2005
   *                              Available <i>flags</i> constants
2006
   *                              <tr valign="top">
2007
   *                              <td>Constant Name</td>
2008
   *                              <td>Description</td>
2009
   *                              </tr>
2010
   *                              <tr valign="top">
2011
   *                              <td><b>ENT_COMPAT</b></td>
2012
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2013
   *                              </tr>
2014
   *                              <tr valign="top">
2015
   *                              <td><b>ENT_QUOTES</b></td>
2016
   *                              <td>Will convert both double and single quotes.</td>
2017
   *                              </tr>
2018
   *                              <tr valign="top">
2019
   *                              <td><b>ENT_NOQUOTES</b></td>
2020
   *                              <td>Will leave both double and single quotes unconverted.</td>
2021
   *                              </tr>
2022
   *                              <tr valign="top">
2023
   *                              <td><b>ENT_IGNORE</b></td>
2024
   *                              <td>
2025
   *                              Silently discard invalid code unit sequences instead of returning
2026
   *                              an empty string. Using this flag is discouraged as it
2027
   *                              may have security implications.
2028
   *                              </td>
2029
   *                              </tr>
2030
   *                              <tr valign="top">
2031
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2032
   *                              <td>
2033
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2034
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2035
   *                              </td>
2036
   *                              </tr>
2037
   *                              <tr valign="top">
2038
   *                              <td><b>ENT_DISALLOWED</b></td>
2039
   *                              <td>
2040
   *                              Replace invalid code points for the given document type with a
2041
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2042
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2043
   *                              instance, to ensure the well-formedness of XML documents with
2044
   *                              embedded external content.
2045
   *                              </td>
2046
   *                              </tr>
2047
   *                              <tr valign="top">
2048
   *                              <td><b>ENT_HTML401</b></td>
2049
   *                              <td>
2050
   *                              Handle code as HTML 4.01.
2051
   *                              </td>
2052
   *                              </tr>
2053
   *                              <tr valign="top">
2054
   *                              <td><b>ENT_XML1</b></td>
2055
   *                              <td>
2056
   *                              Handle code as XML 1.
2057
   *                              </td>
2058
   *                              </tr>
2059
   *                              <tr valign="top">
2060
   *                              <td><b>ENT_XHTML</b></td>
2061
   *                              <td>
2062
   *                              Handle code as XHTML.
2063
   *                              </td>
2064
   *                              </tr>
2065
   *                              <tr valign="top">
2066
   *                              <td><b>ENT_HTML5</b></td>
2067
   *                              <td>
2068
   *                              Handle code as HTML 5.
2069
   *                              </td>
2070
   *                              </tr>
2071
   *                              </table>
2072
   *                              </p>
2073
   * @param string $encoding      [optional] <p>
2074
   *                              Like <b>htmlspecialchars</b>,
2075
   *                              <b>htmlentities</b> takes an optional third argument
2076
   *                              <i>encoding</i> which defines encoding used in
2077
   *                              conversion.
2078
   *                              Although this argument is technically optional, you are highly
2079
   *                              encouraged to specify the correct value for your code.
2080
   *                              </p>
2081
   * @param bool   $double_encode [optional] <p>
2082
   *                              When <i>double_encode</i> is turned off PHP will not
2083
   *                              encode existing html entities. The default is to convert everything.
2084
   *                              </p>
2085
   *
2086
   *
2087
   * @return string the encoded string.
2088
   * </p>
2089
   * <p>
2090
   * If the input <i>string</i> contains an invalid code unit
2091
   * sequence within the given <i>encoding</i> an empty string
2092
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2093
   * <b>ENT_SUBSTITUTE</b> flags are set.
2094 2
   */
2095
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // the extra pass below is done for UTF-8 only (checked after normalizing the name)
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // collect every distinct character of 3 or more bytes with its numbered entity
    $search = array();
    $replacements = array();
    foreach (self::chr_size_list($str) as $position => $size) {
      if ($size < 3) {
        continue;
      }

      $char = self::access($str, $position);
      if (isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $str);
  }
2123
2124
  /**
2125
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2126
   *
2127
   * INFO: Take a look at "UTF8::htmlentities()"
2128
   *
2129
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2130
   *
2131
   * @param string $str           <p>
2132
   *                              The string being converted.
2133
   *                              </p>
2134
   * @param int    $flags         [optional] <p>
2135
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2136
   *                              invalid code unit sequences and the used document type. The default is
2137
   *                              ENT_COMPAT | ENT_HTML401.
2138
   *                              <table>
2139
   *                              Available <i>flags</i> constants
2140
   *                              <tr valign="top">
2141
   *                              <td>Constant Name</td>
2142
   *                              <td>Description</td>
2143
   *                              </tr>
2144
   *                              <tr valign="top">
2145
   *                              <td><b>ENT_COMPAT</b></td>
2146
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2147
   *                              </tr>
2148
   *                              <tr valign="top">
2149
   *                              <td><b>ENT_QUOTES</b></td>
2150
   *                              <td>Will convert both double and single quotes.</td>
2151
   *                              </tr>
2152
   *                              <tr valign="top">
2153
   *                              <td><b>ENT_NOQUOTES</b></td>
2154
   *                              <td>Will leave both double and single quotes unconverted.</td>
2155
   *                              </tr>
2156
   *                              <tr valign="top">
2157
   *                              <td><b>ENT_IGNORE</b></td>
2158
   *                              <td>
2159
   *                              Silently discard invalid code unit sequences instead of returning
2160
   *                              an empty string. Using this flag is discouraged as it
2161
   *                              may have security implications.
2162
   *                              </td>
2163
   *                              </tr>
2164
   *                              <tr valign="top">
2165
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2166
   *                              <td>
2167
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2168
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2169
   *                              </td>
2170
   *                              </tr>
2171
   *                              <tr valign="top">
2172
   *                              <td><b>ENT_DISALLOWED</b></td>
2173
   *                              <td>
2174
   *                              Replace invalid code points for the given document type with a
2175
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2176
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2177
   *                              instance, to ensure the well-formedness of XML documents with
2178
   *                              embedded external content.
2179
   *                              </td>
2180
   *                              </tr>
2181
   *                              <tr valign="top">
2182
   *                              <td><b>ENT_HTML401</b></td>
2183
   *                              <td>
2184
   *                              Handle code as HTML 4.01.
2185
   *                              </td>
2186
   *                              </tr>
2187
   *                              <tr valign="top">
2188
   *                              <td><b>ENT_XML1</b></td>
2189
   *                              <td>
2190
   *                              Handle code as XML 1.
2191
   *                              </td>
2192
   *                              </tr>
2193
   *                              <tr valign="top">
2194
   *                              <td><b>ENT_XHTML</b></td>
2195
   *                              <td>
2196
   *                              Handle code as XHTML.
2197
   *                              </td>
2198
   *                              </tr>
2199
   *                              <tr valign="top">
2200
   *                              <td><b>ENT_HTML5</b></td>
2201
   *                              <td>
2202
   *                              Handle code as HTML 5.
2203
   *                              </td>
2204
   *                              </tr>
2205
   *                              </table>
2206
   *                              </p>
2207
   * @param string $encoding      [optional] <p>
2208
   *                              Defines encoding used in conversion.
2209
   *                              </p>
2210
   *                              <p>
2211
   *                              For the purposes of this function, the encodings
2212
   *                              ISO-8859-1, ISO-8859-15,
2213
   *                              UTF-8, cp866,
2214
   *                              cp1251, cp1252, and
2215
   *                              KOI8-R are effectively equivalent, provided the
2216
   *                              <i>string</i> itself is valid for the encoding, as
2217
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2218
   *                              the same positions in all of these encodings.
2219
   *                              </p>
2220
   * @param bool   $double_encode [optional] <p>
2221
   *                              When <i>double_encode</i> is turned off PHP will not
2222
   *                              encode existing html entities, the default is to convert everything.
2223
   *                              </p>
2224
   *
2225
   * @return string The converted string.
2226
   * </p>
2227
   * <p>
2228
   * If the input <i>string</i> contains an invalid code unit
2229
   * sequence within the given <i>encoding</i> an empty string
2230
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2231
   * <b>ENT_SUBSTITUTE</b> flags are set.
2232 1
   */
2233
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // 'UTF-8' is passed on as it is, any other encoding name is normalized first
    $charset = ($encoding !== 'UTF-8') ? self::normalize_encoding($encoding) : $encoding;

    return htmlspecialchars($str, $flags, $charset, $double_encode);
  }
2241
2242
  /**
2243
   * Checks whether iconv is available on the server.
2244
   *
2245
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2246 1
   */
2247
  public static function iconv_loaded()
  {
    // ask PHP whether the "iconv" extension is loaded
    return (bool)extension_loaded('iconv');
  }
2251
2252
  /**
2253
   * Converts Integer to hexadecimal U+xxxx code point representation.
2254
   *
2255
   * INFO: opposite to UTF8::hex_to_int()
2256
   *
2257
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
2258
   * @param string $pfix [optional]
2259
   *
2260
   * @return string <p>The code point, or empty string on failure.</p>
2261 3
   */
2262
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // only non-negative whole numbers (digits only) can be converted
    if (!ctype_digit((string)$int)) {
      return '';
    }

    // left-pad with zeros to at least four hex digits, longer values stay as they are
    $hex = str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);

    return $pfix . $hex;
  }
2274
2275
  /**
2276
   * Checks whether intl-char is available on the server.
2277
   *
2278
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2279 1
   */
2280
  public static function intlChar_loaded()
  {
    // the class lookup is only done when Bootup reports PHP 7.0 or newer
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar') === true;
  }
2284
2285
  /**
2286
   * Checks whether intl is available on the server.
2287
   *
2288
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2289 2
   */
2290
  public static function intl_loaded()
  {
    // ask PHP whether the "intl" extension is loaded
    return (bool)extension_loaded('intl');
  }
2294
2295
  /**
2296
   * alias for "UTF8::is_ascii()"
2297
   *
2298
   * @see UTF8::is_ascii()
2299
   *
2300
   * @param string $str
2301
   *
2302
   * @return boolean
2303 2
   */
2304
  public static function isAscii($str)
  {
    // plain alias: the check itself lives in is_ascii()
    $result = self::is_ascii($str);

    return $result;
  }
2308
2309
  /**
2310
   * alias for "UTF8::is_base64()"
2311
   *
2312
   * @see UTF8::is_base64()
2313
   *
2314
   * @param string $str
2315
   *
2316
   * @return bool
2317 1
   */
2318
  public static function isBase64($str)
  {
    // plain alias: the check itself lives in is_base64()
    $result = self::is_base64($str);

    return $result;
  }
2322
2323
  /**
2324
   * alias for "UTF8::is_binary()"
2325
   *
2326
   * @see UTF8::is_binary()
2327
   *
2328
   * @param string $str
2329
   *
2330
   * @return bool
2331
   */
2332
  public static function isBinary($str)
  {
    // plain alias: the check itself lives in is_binary()
    $result = self::is_binary($str);

    return $result;
  }
2336
2337
  /**
2338
   * alias for "UTF8::is_bom()"
2339
   *
2340
   * @see UTF8::is_bom()
2341
   *
2342
   * @param string $utf8_chr
2343
   *
2344
   * @return boolean
2345
   */
2346
  public static function isBom($utf8_chr)
  {
    // plain alias: the check itself lives in is_bom()
    $result = self::is_bom($utf8_chr);

    return $result;
  }
2350
2351
  /**
2352
   * alias for "UTF8::is_html()"
2353
   *
2354
   * @see UTF8::is_html()
2355
   *
2356
   * @param string $str
2357
   *
2358
   * @return boolean
2359 1
   */
2360
  public static function isHtml($str)
  {
    // plain alias: the check itself lives in is_html()
    $result = self::is_html($str);

    return $result;
  }
2364
2365
  /**
   * Alias of "UTF8::is_json()": forwards the argument unchanged.
   *
   * @see UTF8::is_json()
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool <p>Whatever "UTF8::is_json()" returns for $str.</p>
   */
  public static function isJson($str)
  {
    $result = self::is_json($str);

    return $result;
  }
2378
2379
  /**
   * Alias of "UTF8::is_utf16()": forwards the argument unchanged.
   *
   * @see UTF8::is_utf16()
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return int|false <p>false if it is not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.</p>
   */
  public static function isUtf16($str)
  {
    $result = self::is_utf16($str);

    return $result;
  }
2392
2393
  /**
   * Alias of "UTF8::is_utf32()": forwards the argument unchanged.
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return int|false <p>false if it is not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.</p>
   */
  public static function isUtf32($str)
  {
    $result = self::is_utf32($str);

    return $result;
  }
2406
2407
  /**
   * Alias of "UTF8::is_utf8()": forwards both arguments unchanged.
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    <p>The string to check.</p>
   * @param bool   $strict <p>Passed through to "UTF8::is_utf8()".</p>
   *
   * @return bool <p>Whatever "UTF8::is_utf8()" returns.</p>
   */
  public static function isUtf8($str, $strict = false)
  {
    $result = self::is_utf8($str, $strict);

    return $result;
  }
2421
2422
  /**
   * Tells whether a string consists of 7 bit ASCII bytes only.
   *
   * The input is cast to string first; an empty string counts as ASCII.
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool <p>
   *              <strong>true</strong> if no byte in the range 0x80-0xFF is found<br />
   *              <strong>false</strong> otherwise
   *              </p>
   */
  public static function is_ascii($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return true;
    }

    // any byte with the high bit set disqualifies the string
    return preg_match('/[\x80-\xFF]/', $str) ? false : true;
  }
2442
2443
  /**
   * Tells whether a string is base64 encoded.
   *
   * The string is strictly decoded and re-encoded; only an exact round trip
   * counts as base64. An empty string is never considered base64.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p>Whether or not $str is base64 encoded.</p>
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $decoded = base64_decode($str, true);

    return base64_encode($decoded) === $str;
  }
2464
2465
  /**
   * Heuristically decides whether the input looks like binary data.
   *
   * The input is reported as binary when it consists solely of the
   * characters "0" and "1", or when it contains at least one NUL byte.
   *
   * @param mixed $input <p>The value to inspect; it is passed to string functions as-is.</p>
   *
   * @return bool <p><strong>true</strong> if one of the heuristics matches, <strong>false</strong> otherwise.</p>
   */
  public static function is_binary($input)
  {
    $testLength = strlen($input);

    // a string made only of ones and zeros
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    // any NUL byte
    if (substr_count($input, "\x00") > 0) {
      return true;
    }

    // NOTE(review): substr_count() looks for the literal four-byte text
    // below, not for a character class. Non-overlapping matches of a
    // four-byte needle can cover at most a quarter of the input, so this
    // ratio never exceeds 0.3 and the check cannot succeed. It is kept
    // unchanged to preserve behaviour - confirm the intended heuristic.
    if ($testLength && substr_count($input, '^ -~') / $testLength > 0.3) {
      return true;
    }

    return false;
  }
2489
2490
  /**
   * Checks whether a file looks binary, judged by its first 512 bytes.
   *
   * The leading block of the file is handed to "UTF8::is_binary()". If the
   * file cannot be opened or read, an empty block is checked instead.
   *
   * @param string $file <p>Path of the file to inspect.</p>
   *
   * @return bool <p>The result of "UTF8::is_binary()" for the leading block.</p>
   */
  public static function is_binary_file($file)
  {
    $block = '';
    $fp = false;

    try {
      // fopen() signals failure by returning false, not by throwing, so the
      // handle must be checked before it is read from.
      $fp = fopen($file, 'r');
      if ($fp !== false) {
        $chunk = fread($fp, 512);
        if ($chunk !== false) {
          $block = $chunk;
        }
      }
    } catch (\Exception $e) {
      // only reachable when an error handler converts warnings into exceptions
      $block = '';
    }

    // release the handle even when reading failed
    if (is_resource($fp)) {
      fclose($fp);
    }

    return self::is_binary($block);
  }
2509
2510
  /**
   * Checks whether the given string is exactly one of the known "Byte Order Marks".
   *
   * The string is compared strictly against every key of the class-level
   * BOM table; a BOM followed by further content does not match.
   *
   * WARNING: Use "UTF8::string_has_bom()" if you want to check for a BOM inside a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is a Byte Order Mark, <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    $knownBoms = array_keys(self::$bom);

    return in_array($str, $knownBoms, true);
  }
2529
2530
  /**
   * Checks whether the string contains at least one HTML-like tag, e.g. "<lall>".
   *
   * The input is cast to string first; an empty string never matches.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if a tag pattern is found, <strong>false</strong> otherwise.</p>
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    // opening or closing tag, optional attributes (quoted or bare values),
    // optional self-closing slash
    $found = preg_match("/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/", $str);

    return $found === 1;
  }
2556
2557
  /**
   * Tries to check whether "$str" is a JSON string.
   *
   * The string is decoded via "UTF8::json_decode()". It counts as JSON when
   * decoding reports no error and the result is an object or an array, so
   * JSON documents with a top-level array (e.g. "[1,2]") are accepted too.
   * Bare scalars such as "1" or "true" are still rejected.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str decodes to an object or array, <strong>false</strong> otherwise.</p>
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    // json_last_error() refers to the decode call right above
    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
2582
2583
  /**
   * Checks whether the string is UTF-16 and, if so, which byte order it uses.
   *
   * After BOM removal the string must look binary (see "UTF8::is_binary()").
   * It is then converted to UTF-8 and back for each byte order. A byte order
   * earns one point for every entry of the converted string's character
   * statistic that is also found in the statistic of the input. The byte
   * order with more points wins; a tie means "not UTF-16".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-16,<br />
   *                   <strong>1</strong> for UTF-16LE,<br />
   *                   <strong>2</strong> for UTF-16BE.
   *                   </p>
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // index 0 => little endian, index 1 => big endian (evaluated in this order)
    $encodings = array('UTF-16LE', 'UTF-16BE');
    $scores = array(0, 0);

    foreach ($encodings as $idx => $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // the conversion only counts if it survives a round trip
      $backAgain = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backAgain, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $scores[$idx]++;
        }
      }
    }

    list($maybeUTF16LE, $maybeUTF16BE) = $scores;

    if ($maybeUTF16BE === $maybeUTF16LE) {
      return false;
    }

    return ($maybeUTF16LE > $maybeUTF16BE) ? 1 : 2;
  }
2642
2643
  /**
   * Checks whether the string is UTF-32 and, if so, which byte order it uses.
   *
   * After BOM removal the string must look binary (see "UTF8::is_binary()").
   * It is then converted to UTF-8 and back for each byte order. A byte order
   * earns one point for every entry of the converted string's character
   * statistic that is also found in the statistic of the input. The byte
   * order with more points wins; a tie means "not UTF-32".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32,<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // index 0 => little endian, index 1 => big endian (evaluated in this order)
    $encodings = array('UTF-32LE', 'UTF-32BE');
    $scores = array(0, 0);

    foreach ($encodings as $idx => $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // the conversion only counts if it survives a round trip
      $backAgain = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backAgain, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $scores[$idx]++;
        }
      }
    }

    list($maybeUTF32LE, $maybeUTF32BE) = $scores;

    if ($maybeUTF32BE === $maybeUTF32LE) {
      return false;
    }

    return ($maybeUTF32LE > $maybeUTF32BE) ? 1 : 2;
  }
2702
2703
  /**
   * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
   *
   * The string is walked byte by byte with a small state machine. Rejected
   * are: stray continuation bytes, invalid lead bytes, non-shortest forms,
   * sequences longer than four octets, surrogates (U+D800..U+DFFF), code
   * points above U+10FFFF and sequences that are cut off at the end of the
   * string. An empty string is considered valid.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
   *
   * @return bool <p><strong>true</strong> if $str is valid UTF-8, <strong>false</strong> otherwise.</p>
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    // The previous "/u" regex shortcut was only taken when PCRE UTF-8 support
    // was reported as missing, i.e. exactly when such a pattern cannot work.
    // The byte-wise check below needs no PCRE support, so it is always used.

    $mState = 0; // number of continuation octets still expected
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // total number of octets in the current sequence
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);

      if ($mState === 0) {
        // Expect either a US-ASCII character or the first octet of a
        // multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          /* First octet of 5 octet sequence.
           *
           * This is illegal because the encoded code point must be either
           * (a) not the shortest form or
           * (b) outside the Unicode range of 0-0x10FFFF.
           * Rather than trying to resynchronize, carry on until the end of
           * the sequence and let the later error handling code catch it.
           */
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
          return false;
        }

        continue;
      }

      // Inside a multi-octet sequence: only a continuation octet is legal.
      if (0x80 !== (0xC0 & $in)) {
        return false;
      }

      $shift = ($mState - 1) * 6;
      $mUcs4 |= ($in & 0x0000003F) << $shift;

      if (0 === --$mState) {
        // End of the sequence: $mUcs4 holds the complete code point.
        if (
            // From Unicode 3.1, non-shortest form is illegal.
            (2 === $mBytes && $mUcs4 < 0x0080) ||
            (3 === $mBytes && $mUcs4 < 0x0800) ||
            (4 === $mBytes && $mUcs4 < 0x10000) ||
            (4 < $mBytes) ||
            // From Unicode 3.2, surrogate characters are illegal.
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Code points outside the Unicode range are illegal.
            ($mUcs4 > 0x10FFFF)
        ) {
          return false;
        }

        // reset for the next character
        $mState = 0;
        $mUcs4 = 0;
        $mBytes = 1;
      }
    }

    // A non-zero state means the string ended in the middle of a
    // multi-octet sequence, which is not valid UTF-8.
    return $mState === 0;
  }
2847
2848
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Decodes a JSON string after passing it through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>
   *                        The <i>json</i> string being decoded.
   *                        </p>
   *                        <p>
   *                        The native decoder only works with UTF-8 encoded strings.
   *                        </p>
   *                        <p>PHP implements a superset of
   *                        JSON - it will also encode and decode scalar types and <b>NULL</b>. The JSON standard
   *                        only supports these values when they are nested inside an array or an object.
   *                        </p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into
   *                        associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>
   *                        User specified recursion depth.
   *                        </p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options. It is only forwarded to the native
   *                        function when the runtime is reported as PHP 5.4 or newer.
   *                        </p>
   *
   * @return mixed the value encoded in <i>json</i> in appropriate
   * PHP type. Values true, false and
   * null (case-insensitive) are returned as <b>TRUE</b>, <b>FALSE</b>
   * and <b>NULL</b> respectively. <b>NULL</b> is returned if the
   * <i>json</i> cannot be decoded or if the encoded
   * data is deeper than the recursion limit.
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    // the fourth parameter is only passed on PHP 5.4+
    if (Bootup::is_php('5.4') !== true) {
      return json_decode($filtered, $assoc, $depth);
    }

    return json_decode($filtered, $assoc, $depth, $options);
  }
2896
2897
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Returns the JSON representation of a value after passing it through "UTF8::filter()".
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The <i>value</i> being encoded. Can be any type except
   *                       a resource.
   *                       </p>
   *                       <p>
   *                       All string data must be UTF-8 encoded.
   *                       </p>
   *                       <p>PHP implements a superset of
   *                       JSON - it will also encode and decode scalar types and <b>NULL</b>. The JSON standard
   *                       only supports these values when they are nested inside an array or an object.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask consisting of <b>JSON_HEX_QUOT</b>,
   *                       <b>JSON_HEX_TAG</b>,
   *                       <b>JSON_HEX_AMP</b>,
   *                       <b>JSON_HEX_APOS</b>,
   *                       <b>JSON_NUMERIC_CHECK</b>,
   *                       <b>JSON_PRETTY_PRINT</b>,
   *                       <b>JSON_UNESCAPED_SLASHES</b>,
   *                       <b>JSON_FORCE_OBJECT</b>,
   *                       <b>JSON_UNESCAPED_UNICODE</b>. The behaviour of these
   *                       constants is described on
   *                       the JSON constants page.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Set the maximum depth. Must be greater than zero. It is only
   *                       forwarded to the native function when the runtime is reported
   *                       as PHP 5.5 or newer.
   *                       </p>
   *
   * @return string a JSON encoded string on success or <b>FALSE</b> on failure.
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    // the third parameter is only passed on PHP 5.5+
    if (!Bootup::is_php('5.5')) {
      return json_encode($filtered, $options);
    }

    return json_encode($filtered, $options, $depth);
  }
2945
2946
  /**
   * Makes a string's first character lowercase.
   *
   * The input is cast to string; an empty string is returned unchanged.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function lcfirst($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // "UTF8::substr()" may hand back false instead of a string; cast both
    // parts so that "UTF8::strtolower()" and the concatenation always get strings
    $first = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtolower($first) . $rest;
  }
2957
2958
  /**
   * Strips whitespace or other characters from the beginning of a UTF-8 string.
   *
   * Without $chars (INF or any empty value), leading characters of the
   * Unicode categories "separator" (\pZ) and "other" (\pC) are removed.
   * Otherwise $chars is turned into a character class via "UTF8::rxClass()"
   * and leading matches of that class are removed.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars <p>Optional characters to be stripped</p>
   *
   * @return string <p>The string with unwanted characters stripped from the left.</p>
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    if ($chars === INF || !$chars) {
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
    }

    // $chars cannot be INF any more at this point, so it is always
    // converted into a character class
    $charClass = self::rxClass($chars);

    return preg_replace("/^{$charClass}+/u", '', $str);
  }
2983
2984
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * An array argument is concatenated into one string before its code
   * points are collected via "UTF8::codepoints()".
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the highest code point.</p>
   */
  public static function max($arg)
  {
    $haystack = is_array($arg) ? implode($arg) : $arg;
    $codePoints = self::codepoints($haystack);

    return self::chr(max($codePoints));
  }
2999
3000
  /**
   * Calculates the maximum number of bytes taken by any
   * UTF-8 encoded character in the given string.
   *
   * The per-character sizes come from "UTF8::chr_size_list()"; an empty
   * list yields 0.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>Max byte length of the given chars.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    if (count($sizes) === 0) {
      return 0;
    }

    return (int)max($sizes);
  }
3017
3018
  /**
   * Checks whether the "mbstring" extension is loaded.
   *
   * Side effect: when the extension is loaded, the mbstring internal
   * encoding is set to UTF-8.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function mbstring_loaded()
  {
    if (extension_loaded('mbstring') !== true) {
      return false;
    }

    \mb_internal_encoding('UTF-8');

    return true;
  }
3033
3034
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * An array argument is concatenated into one string before its code
   * points are collected via "UTF8::codepoints()".
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
   *
   * @return string <p>The character with the lowest code point.</p>
   */
  public static function min($arg)
  {
    $haystack = is_array($arg) ? implode($arg) : $arg;
    $codePoints = self::codepoints($haystack);

    return self::chr(min($codePoints));
  }
3049
3050
  /**
   * Alias of "UTF8::normalize_encoding()": forwards the argument unchanged.
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>The encoding name to normalize.</p>
   *
   * @return string <p>Whatever "UTF8::normalize_encoding()" returns for $encoding.</p>
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
3063
3064
  /**
3065
   * Normalize the encoding-"name" input.
3066
   *
3067
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
3068
   *
3069
   * @return string <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.</p>
3070 16
   */
3071
  public static function normalize_encoding($encoding)
3072 16
  {
3073
    static $staticNormalizeEncodingCache = array();
3074 16
3075 2
    if (!$encoding) {
3076
      return false;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return false; (false) is incompatible with the return type documented by voku\helper\UTF8::normalize_encoding of type string.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
3077
    }
3078 16
3079 1
    if ('UTF-8' === $encoding) {
3080
      return $encoding;
3081
    }
3082 16
3083 4
    if (in_array($encoding, self::$iconvEncoding, true)) {
3084
      return $encoding;
3085
    }
3086 15
3087 14
    if (isset($staticNormalizeEncodingCache[$encoding])) {
3088
      return $staticNormalizeEncodingCache[$encoding];
3089
    }
3090 4
3091 4
    $encodingOrig = $encoding;
3092 4
    $encoding = strtoupper($encoding);
3093
    $encodingUpperHelper = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
3094
3095 4
    $equivalences = array(
3096 4
        'ISO88591'    => 'ISO-8859-1',
3097 4
        'ISO8859'     => 'ISO-8859-1',
3098 4
        'ISO'         => 'ISO-8859-1',
3099 4
        'LATIN1'      => 'ISO-8859-1',
3100 4
        'LATIN'       => 'ISO-8859-1',
3101 4
        'WIN1252'     => 'ISO-8859-1',
3102 4
        'WINDOWS1252' => 'ISO-8859-1',
3103 4
        'UTF16'       => 'UTF-16',
3104 4
        'UTF32'       => 'UTF-32',
3105 4
        'UTF8'        => 'UTF-8',
3106 4
        'UTF'         => 'UTF-8',
3107 4
        'UTF7'        => 'UTF-7',
3108 4
        '8BIT'        => 'CP850',
3109 4
        'BINARY'      => 'CP850',
3110
    );
3111 4
3112 4
    if (!empty($equivalences[$encodingUpperHelper])) {
3113 4
      $encoding = $equivalences[$encodingUpperHelper];
3114
    }
3115 4
3116
    $staticNormalizeEncodingCache[$encodingOrig] = $encoding;
3117 4
3118
    return $encoding;
3119
  }
3120
3121
  /**
   * Normalize some MS Word special characters.
   *
   * Every key of the class-level "$utf8MSWord" table found in the string is
   * replaced by the corresponding value of that table.
   *
   * @param string $str <p>The string to be normalized.</p>
   *
   * @return string <p>The string with the replacements applied.</p>
   */
  public static function normalize_msword($str)
  {
    // search / replace lists are derived from the table once and then re-used
    static $msWordMap = null;

    if (null === $msWordMap) {
      $msWordMap = array(
          'search'  => array_keys(self::$utf8MSWord),
          'replace' => array_values(self::$utf8MSWord),
      );
    }

    return str_replace($msWordMap['search'], $msWordMap['replace'], $str);
  }
3140
3141
  /**
   * Normalize the whitespace.
   *
   * Every entry of the class-level whitespace table is replaced by a plain space;
   * unless told otherwise, the bidirectional control characters are removed first.
   *
   * @param string $str                     <p>The string to be normalized.</p>
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
   *                                        bidirectional text chars.</p>
   *
   * @return string
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // Only a strict boolean "true" keeps the non-breaking space. The cache key is derived from
    // that very same test, so a truthy non-boolean value (e.g. 1) can no longer store the full
    // table under the "keep" key and break later calls that pass a real "true".
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = $keepNbsp ? 1 : 0;

    if (!isset($whitespaces[$cacheKey])) {

      $table = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3180
3181
  /**
   * Format a number with grouped thousands.
   *
   * @param float  $number
   * @param int    $decimals
   * @param string $dec_point
   * @param string $thousands_sep
   *
   * @return string
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    // both separators are longer than one byte: format with the plain ASCII
    // separators first and swap them afterwards
    if (
        isset($thousands_sep[1], $dec_point[1])
        &&
        Bootup::is_php('5.4') === true
    ) {
      // strtr() never re-scans text it has just inserted, so a decimal point that
      // contains "," is not hit again by the thousands-separator replacement
      // (a sequential str_replace() would corrupt it).
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3218
3219
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * @param string $chr <p>The character of which to calculate code point.</p>
   *
   * @return int <p>
   *             Unicode code point of the given character,<br />
   *             0 for empty input.
   *             </p>
   */
  public static function ord($chr)
  {
    // empty input, but keep the character "0"
    if (!$chr && $chr !== '0') {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // prefer "IntlChar" whenever it delivers a usable (non-zero) result
    if (self::$support['intlChar'] === true) {
      $codePoint = \IntlChar::ord($chr);
      if ($codePoint) {
        return $codePoint;
      }
    }

    // manual decoding, memoized per input
    static $cache = array();
    if (isset($cache[$chr]) === true) {
      return $cache[$chr];
    }

    // only the first four bytes can belong to the first character
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
      // four byte sequence
      $codePoint = (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
      // three byte sequence
      $codePoint = (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
      // two byte sequence
      $codePoint = (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    } else {
      // single byte (continuation bytes are not validated here)
      $codePoint = $lead;
    }

    $cache[$chr] = $codePoint;

    return $codePoint;
  }
3272
3273
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike "parse_str()", this method never creates variables in the current scope;
   *          the result is only available via the second parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str    <p>The input string.</p>
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
   *
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string or the $result is empty.</p>
   */
  public static function parse_str($str, &$result)
  {
    $cleanedStr = self::clean($str);

    $parsed = \mb_parse_str($cleanedStr, $result);

    return $parsed !== false && !empty($result);
  }
3298
3299
  /**
   * Checks if the "u" modifier is available, which enables Unicode support in PCRE.
   *
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
   */
  public static function pcre_utf8_support()
  {
    // an empty pattern with the "u" modifier fails (silenced) when PCRE lacks UTF-8 support
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $probe = @preg_match('//u', '');

    return (bool)$probe;
  }
3309
3310
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
   *
   * @return array <p>The characters from start to end; empty if a boundary is empty or resolves to 0.</p>
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    $start = self::rangeBoundaryToCodePoint($var1);
    if (!$start) {
      return array();
    }

    $end = self::rangeBoundaryToCodePoint($var2);
    if (!$end) {
      return array();
    }

    return array_map(
        array(
            '\\voku\\helper\\UTF8',
            'chr',
        ),
        range($start, $end)
    );
  }

  /**
   * Resolve one boundary of UTF8::range() to a code point.
   *
   * @param mixed $boundary <p>Decimal digits, hexadecimal digits or a UTF-8 character.</p>
   *
   * @return int <p>The code point for the given boundary.</p>
   */
  private static function rangeBoundaryToCodePoint($boundary)
  {
    $boundaryAsString = (string)$boundary;

    // decimal code point
    if (ctype_digit($boundaryAsString)) {
      return (int)$boundary;
    }

    // hexadecimal code point; the string form is tested because ctype functions
    // interpret small integers as ASCII codes and reject other non-strings
    if (ctype_xdigit($boundaryAsString)) {
      return (int)self::hex_to_int($boundary);
    }

    // anything else is treated as a character
    return self::ord($boundary);
  }
3356
3357
  /**
   * Alias of "UTF8::remove_bom()".
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever UTF8::remove_bom() returns for the given string.</p>
   */
  public static function removeBOM($str)
  {
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
3370
3371
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of the class-level BOM table (BOM bytes => byte length) that the
   * string starts with is cut off.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>String without UTF-BOM</p>
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $bomString => $bomByteLength) {
      if (0 === strpos($str, $bomString)) {
        // Before PHP 7 substr() returns false when nothing is left after the BOM;
        // the cast keeps the documented string return type for a "BOM only" input.
        $str = (string)substr($str, $bomByteLength);
      }
    }

    return $str;
  }
3388
3389
  /**
   * Removes duplicate occurrences of a string in another string.
   *
   * Each run of directly repeated occurrences of a search string is collapsed
   * into a single occurrence.
   *
   * @param string          $str  <p>The base string.</p>
   * @param string|string[] $what <p>String to search for in the base string.</p>
   *
   * @return string <p>The result string with removed duplicates.</p>
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    if (is_string($what)) {
      $what = array($what);
    }

    if (is_array($what)) {
      /** @noinspection ForeachSourceInspection */
      foreach ($what as $item) {
        $item = (string)$item;

        // an empty needle can not be duplicated
        if ($item === '') {
          continue;
        }

        // The needle is literal text: escape "\" and "$" so that preg_replace() does not
        // read e.g. "$0" or "\0" inside the replacement as a back-reference.
        $str = preg_replace(
            '/(' . preg_quote($item, '/') . ')+/',
            addcslashes($item, '\\$'),
            $str
        );
      }
    }

    return $str;
  }
3412
3413
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * @param string $str         <p>The input string.</p>
   * @param bool   $url_encoded <p>Also remove the url-encoded form (e.g. "%0b") of these characters.</p>
   * @param string $replacement <p>The text that is inserted instead of the removed characters.</p>
   *
   * @return string
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // every control character except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09)
    if ($url_encoded) {
      // hex digits in percent-encoding are case-insensitive ("%0B" === "%0b"),
      // therefore the patterns need the "i" modifier
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // repeat until nothing is left, a removal can form a new sequence (e.g. "%%0b0b")
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3446
3447
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown <p>The text that is inserted instead of each diamond question mark.</p>
   *
   * @return string
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // the replacement character, once as escaped byte sequence and once as literal
    $diamonds = array(
        "\xEF\xBF\xBD",
        '�',
    );

    // one replacement entry per search entry
    $replacements = array_fill(0, count($diamonds), $unknown);

    return str_replace($diamonds, $replacements, $str);
  }
3469
3470
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * @param string $str   <p>The string to be trimmed.</p>
   * @param string $chars <p>Optional characters to be stripped.</p>
   *
   * @return string <p>The string with unwanted characters stripped from the right.</p>
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    //
    // No characters given: strip separators (\pZ) and control characters (\pC).
    // The character "0" is a valid character list, so it must not count as "empty".
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    $charClass = self::rxClass($chars);

    return preg_replace("/{$charClass}+$/u", '', $str);
  }
3495
3496
  /**
   * Build a regular-expression fragment that matches any of the given characters.
   *
   * The result is memoized per combination of both arguments.
   *
   * @param string $s     <p>The characters that the fragment has to match.</p>
   * @param string $class <p>Initial content of the character class.</p>
   *
   * @return string <p>A character class, or a non-capturing group if alternatives are needed.</p>
   */
  private static function rxClass($s, $class = '')
  {
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // index 0 collects the content of the character class, further entries are alternatives
    $parts = array($class);

    foreach (self::str_split($s) as $char) {
      if ('-' === $char) {
        // a hyphen goes to the front, so that it can not form a range
        $parts[0] = '-' . $parts[0];
      } elseif (!isset($char[2])) {
        // at most two bytes: quote it for the use inside the pattern
        $parts[0] .= preg_quote($char, '/');
      } elseif (1 === self::strlen($char)) {
        // one longer (multi-byte) character
        $parts[0] .= $char;
      } else {
        // more than one character: needs its own alternative
        $parts[] = $char;
      }
    }

    if ($parts[0]) {
      $parts[0] = '[' . $parts[0] . ']';
    }

    $return = (1 === count($parts)) ? $parts[0] : '(?:' . implode('|', $parts) . ')';

    $rxClassCache[$cacheKey] = $return;

    return $return;
  }
3543
3544
  /**
   * WARNING: Echoes every entry of the native UTF8-support list, e.g. for debugging.
   */
  public static function showSupport()
  {
    foreach (self::$support as $supportEntry) {
      // one entry per line, followed by a html line break
      echo $supportEntry . "\n<br>";
    }
  }
3553
3554
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, or an empty string for empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // empty input, but the character "0" is a real character (same rule as in UTF8::ord())
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3578
3579
  /**
   * Convert a string to an array of Unicode characters.
   *
   * @param string  $str       <p>The string to split into array.</p>
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string[] <p>An array containing chunks of the string.</p>
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    $chars = array();

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      // "." with the "u" and "s" modifiers matches every single character
      preg_match_all('/./us', $str, $matches);
      if (isset($matches[0])) {
        $chars = $matches[0];
      }
      unset($matches);

    } else {

      // fallback: walk over the bytes and collect complete sequences only

      $byteCount = strlen($str);

      /** @noinspection ForeachInvariantsInspection */
      for ($i = 0; $i < $byteCount; $i++) {
        $lead = $str[$i];

        // single byte character
        if (($lead & "\x80") === "\x00") {
          $chars[] = $lead;
          continue;
        }

        // the lead byte tells how many continuation bytes have to follow
        if (($lead & "\xE0") === "\xC0") {
          $extraBytes = 1;
        } elseif (($lead & "\xF0") === "\xE0") {
          $extraBytes = 2;
        } elseif (($lead & "\xF8") === "\xF0") {
          $extraBytes = 3;
        } else {
          // not a lead byte: skipped
          continue;
        }

        // truncated sequence: skipped
        if (!isset($str[$i + $extraBytes])) {
          continue;
        }

        $isComplete = true;
        for ($offset = 1; $offset <= $extraBytes; $offset++) {
          if (($str[$i + $offset] & "\xC0") !== "\x80") {
            $isComplete = false;
            break;
          }
        }

        if ($isComplete) {
          $chars[] = substr($str, $i, $extraBytes + 1);

          $i += $extraBytes;
        }
      }
    }

    // merge the single characters into chunks of the requested length
    if ($length > 1) {
      $chars = array_map('implode', array_chunk($chars, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($chars[0]) && $chars[0] === '') {
      return array();
    }

    return $chars;
  }
3661
3662
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return false|string <p>
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   *                      </p>
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //
    if (self::is_binary($str)) {
      if (self::is_utf16($str) === 1) {
        return 'UTF-16LE';
      }

      if (self::is_utf16($str) === 2) {
        return 'UTF-16BE';
      }

      if (self::is_utf32($str) === 1) {
        return 'UTF-32LE';
      }

      if (self::is_utf32($str) === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //
    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //
    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"
    $candidates = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $detected = \mb_detect_encoding($str, $candidates, true);
    if ($detected) {
      return $detected;
    }

    //
    // 5.) check via "iconv()": an encoding fits, if a conversion onto itself leaves the string untouched
    //
    $originalHash = md5($str);
    foreach (self::$iconvEncoding as $iconvName) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      $roundTripHash = md5(@iconv($iconvName, $iconvName, $str));
      if ($roundTripHash === $originalHash) {
        return $iconvName;
      }
    }

    return false;
  }
3754
3755
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       Every replacement with search array is
   *                       performed on the result of previous replacement.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement text (or texts), always used literally.
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed <p>A string or an array of replacements.</p>
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // every needle becomes a case-insensitive unicode pattern for its literal text
    $patterns = array();
    foreach ((array)$search as $needle) {
      $needle = (string)$needle;

      if ('' === $needle) {
        // an empty needle must never match
        $patterns[] = '/^(?<=.)$/';
      } else {
        $patterns[] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // The replacements are literal text like in str_ireplace(): escape "\" and "$",
    // otherwise preg_replace() reads e.g. "$1" or "\0" as a back-reference.
    if (is_array($replace)) {
      $replacements = array();
      foreach ($replace as $replaceItem) {
        $replacements[] = addcslashes((string)$replaceItem, '\\$');
      }
    } else {
      $replacements = addcslashes((string)$replace, '\\$');
    }

    $subject = preg_replace($patterns, $replacements, $subject, -1, $replacedCount);
    $count = $replacedCount; // used as reference parameter

    return $subject;
  }
3798
3799
  /**
   * Limit the number of characters in a string without cutting a word in the middle.
   *
   * A string that is longer than the limit is cut at the limit, the last (possibly
   * incomplete) word is dropped and the add-on is appended.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $length   <p>The maximum number of characters.</p>
   * @param string $strAddOn <p>The text that is appended to a shortened string.</p>
   *
   * @return string
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    // short enough: nothing to do
    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit hits a space: cut right in front of it
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    // cut at the limit and drop the last word
    $truncated = self::substr($str, 0, $length);
    $words = explode(' ', $truncated);
    array_pop($words);
    $withoutLastWord = implode(' ', $words);

    // there was only one word: keep it, shortened by one more character
    if ($withoutLastWord === '') {
      return self::substr($truncated, 0, $length - 1) . $strAddOn;
    }

    return $withoutLastWord . $strAddOn;
  }
3839
3840
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * @param string $str        <p>The input string.</p>
   * @param int    $pad_length <p>The length of return string.</p>
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
   * @param int    $pad_type   [optional] <p>
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
   *                           </p>
   *
   * @return string <p>
   *                The padded string; the input string is returned as it is, if the pad length
   *                is not a positive integer, is shorter than the string or the pad string is empty.
   *                </p>
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (
        is_int($pad_length) === true
        &&
        $pad_length > 0
        &&
        $pad_length >= $str_length
    ) {
      $ps_length = self::strlen($pad_string);

      // nothing to pad with, this also prevents the division by zero below
      if (!$ps_length) {
        return $str;
      }

      $diff = $pad_length - $str_length;

      switch ($pad_type) {
        case STR_PAD_LEFT:
          $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
          $pre = self::substr($pre, 0, $diff);
          $post = '';
          break;

        case STR_PAD_BOTH:
          // the left side gets the smaller half, the right side the bigger one;
          // the cast has to cover the whole division to pass an integer length
          $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
          $pre = self::substr($pre, 0, (int)floor($diff / 2));
          $post = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
          $post = self::substr($post, 0, (int)ceil($diff / 2));
          break;

        case STR_PAD_RIGHT:
        default:
          $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
          $post = self::substr($post, 0, $diff);
          $pre = '';
      }

      return $pre . $str . $post;
    }

    return $str;
  }
3894
3895
  /**
   * Repeat a string a given number of times.
   *
   * The input is passed through UTF8::filter() before it is handed to the native "str_repeat()".
   *
   * @param string $str        <p>The string to be repeated.</p>
   * @param int    $multiplier <p>
   *                           How often the input string should be repeated. It has to be
   *                           greater than or equal to 0; a multiplier of 0 yields an empty string.
   *                           </p>
   *
   * @return string <p>The repeated string.</p>
   */
  public static function str_repeat($str, $multiplier)
  {
    return str_repeat(self::filter($str), $multiplier);
  }
3919
3920
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * INFO: This is only a thin wrapper around the native "str_replace()", which is already UTF-8 safe.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>The needle; an array may be used to designate multiple needles.</p>
   * @param mixed $replace <p>The replacement for found needles; an array may be used for multiple replacements.</p>
   * @param mixed $subject <p>
   *                       The haystack, either a string or an array. If it is an array, every entry
   *                       is processed and an array is returned.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed <p>A string or an array with the replaced values.</p>
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    $replaced = str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
3953
3954
  /**
   * Randomly reorder the characters of a string.
   *
   * The string is broken up with UTF8::split(), the resulting pieces are shuffled
   * with the native "shuffle()" and glued back together.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The shuffled string.</p>
   */
  public static function str_shuffle($str)
  {
    $chars = self::split($str);
    shuffle($chars);

    return implode($chars);
  }
3969
3970
  /**
   * Sort all characters of a string by their code points.
   *
   * @param string $str    <p>A UTF-8 string.</p>
   * @param bool   $unique <p>If <strong>true</strong>, repeated characters are only kept once.</p>
   * @param bool   $desc   <p>If <strong>true</strong>, the characters are sorted in reverse code point order.</p>
   *
   * @return string <p>String of sorted characters.</p>
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    if ($unique) {
      // flipping twice collapses repeated values, because values temporarily become array keys
      $codePoints = array_flip(array_flip($codePoints));
    }

    if (!$desc) {
      asort($codePoints);
    } else {
      arsort($codePoints);
    }

    return self::string($codePoints);
  }
3995
3996
  /**
   * Split a string into an array of chunks.
   *
   * The string is first cut into the pieces matched by Grapheme::GRAPHEME_CLUSTER_RX,
   * then every $len consecutive pieces are joined into one chunk.
   *
   * @param string $str <p>The input string.</p>
   * @param int    $len <p>Number of pieces per chunk; values below 1 are delegated to the native "str_split()".</p>
   *
   * @return array <p>The chunks; an empty array for an empty input string.</p>
   */
  public static function str_split($str, $len = 1)
  {
    $chunkSize = (int)$len;
    $input = (string)$str;

    if ($input === '') {
      return array();
    }

    if ($chunkSize < 1) {
      // let the native function deal with an invalid length, so its error behaviour applies
      return str_split($input, $chunkSize);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $input, $matches);
    $pieces = $matches[0];

    if ($chunkSize === 1) {
      return $pieces;
    }

    $chunks = array();
    foreach (array_chunk($pieces, $chunkSize) as $group) {
      $chunks[] = implode('', $group);
    }

    return $chunks;
  }
4039
4040
  /**
   * Get a binary representation of a specific string.
   *
   * The bytes of the string are read as one big-endian number and written out in
   * base 2. Every byte is converted on its own, so the result stays exact for
   * strings of any length ("base_convert()" loses precision on large numbers).
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>
   *                The binary digits without leading zeros, or "0" if the input
   *                is empty or consists of NUL-bytes only.
   *                </p>
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    $bits = '';
    $byteLength = strlen($str);
    for ($i = 0; $i < $byteLength; $i++) {
      // always 8 digits per byte, otherwise inner zeros would get lost
      $bits .= sprintf('%08b', ord($str[$i]));
    }

    // a number has no leading zeros (same output format as "base_convert()")
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
4055
4056
  /**
   * Alias for "UTF8::to_ascii()": all arguments are forwarded unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str
   * @param string $unknown
   * @param bool   $strict
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?', $strict = false)
  {
    $ascii = self::to_ascii($str, $unknown, $strict);

    return $ascii;
  }
4071
4072
  /**
   * Counts number of words in the UTF-8 string.
   *
   * The string is split on "words" (runs of letters / $charlist characters, optionally
   * joined by dashes or apostrophes); the captured delimiters of that split are the words.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $format   [optional] <p>
   *                         <strong>0</strong> => return a number of words (default)<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with word-offset as key
   *                         </p>
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word.</p>
   *
   * @return array|int <p>
   *                   The number of words or the words themselves, depending on $format.
   *                   If the string can not be split (e.g. malformed UTF-8), 0 or an empty array is returned.
   *                   </p>
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $charlist = self::rxClass($charlist, '\pL');
    $strParts = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    if ($strParts === false) {
      // the split failed, so there is nothing to count: never hand "false" to count()
      return ($format === 1 || $format === 2) ? array() : 0;
    }

    // layout of $strParts: [text, word, text, word, ..., text] -> words sit on the odd indexes
    $len = count($strParts);

    if ($format === 1) {
      $words = array();
      for ($i = 1; $i < $len; $i += 2) {
        $words[] = $strParts[$i];
      }

      return $words;
    }

    if ($format === 2) {
      $words = array();
      $offset = self::strlen($strParts[0]);
      for ($i = 1; $i < $len; $i += 2) {
        $words[$offset] = $strParts[$i];
        // skip the word itself and the text that follows it
        $offset += self::strlen($strParts[$i]) + self::strlen($strParts[$i + 1]);
      }

      return $words;
    }

    // $len is always odd here, the cast only pins the return type
    return (int)(($len - 1) / 2);
  }
4116
4117
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Case-insensitive version of UTF8::strcmp(); both strings are passed
   *       through UTF8::strtocasefold() before they are compared.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <p>
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   *             </p>
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4135
4136
  /**
   * Alias for "UTF8::strstr()": all arguments are forwarded unchanged.
   *
   * @see UTF8::strstr()
   *
   * @param string  $haystack
   * @param string  $needle
   * @param bool    $before_needle
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $result = self::strstr($haystack, $needle, $before_needle, $encoding, $cleanUtf8);

    return $result;
  }
4153
4154
  /**
   * Case-sensitive string comparison.
   *
   * Strings that are identical after a string cast are reported as equal right away;
   * otherwise both are normalized to NFD and compared with the native "strcmp()".
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int  <p>
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   *              </p>
   */
  public static function strcmp($str1, $str2)
  {
    // fast path: no normalization needed for identical input
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    $normalized1 = \Normalizer::normalize($str1, \Normalizer::NFD);
    $normalized2 = \Normalizer::normalize($str2, \Normalizer::NFD);

    return strcmp($normalized1, $normalized2);
  }
4173
4174
  /**
   * Find length of initial segment not matching mask.
   *
   * @param string $str      <p>The string to examine.</p>
   * @param string $charList <p>The characters that end the initial segment.</p>
   * @param int    $offset   <p>Start position; together with $length it selects the part of $str to examine.</p>
   * @param int    $length   <p>Length of the part to examine; the default means "up to the end".</p>
   *
   * @return int|null <p>
   *                  Number of characters before the first character from $charList, or the length of the
   *                  examined string if none is found. <strong>null</strong> if $charList or the examined
   *                  string is empty.
   *                  </p>
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList = (string)$charList;
    if ($charList === '') {
      return null;
    }

    // only cut the string if the caller asked for something else than "everything"
    $sliceRequested = $offset || $length !== 2147483647;
    if ($sliceRequested) {
      $str = self::substr($str, $offset, $length);
    }

    $str = (string)$str;
    if ($str === '') {
      return null;
    }

    // lazily capture everything in front of the first character of the mask
    $pattern = '/^(.*?)' . self::rxClass($charList) . '/us';
    if (preg_match($pattern, $str, $matches)) {
      return self::strlen($matches[1]);
    }

    return self::strlen($str);
  }
4206
4207
  /**
   * Alias for "UTF8::stristr()": all arguments are forwarded unchanged.
   *
   * @see UTF8::stristr()
   *
   * @param string  $haystack
   * @param string  $needle
   * @param bool    $before_needle
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $result = self::stristr($haystack, $needle, $before_needle, $encoding, $cleanUtf8);

    return $result;
  }
4224
4225
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * Every entry is converted with UTF8::chr() and the results are concatenated
   * in the iteration order of the array.
   *
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
   *
   * @return string <p>UTF-8 encoded string.</p>
   */
  public static function string(array $array)
  {
    $chars = array();
    foreach ($array as $key => $codePoint) {
      $chars[$key] = self::chr($codePoint);
    }

    return implode($chars);
  }
4246
4247
  /**
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
   *
   * The candidates are the keys of the static $bom list; the first one found
   * at byte position 0 ends the search.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function string_has_bom($str)
  {
    foreach (array_keys(self::$bom) as $bomString) {
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4264
4265
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * The string is run through UTF8::clean() first and then handed to the native "strip_tags()".
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>The input string.</p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped. HTML comments and PHP tags are
   *                               always stripped; this is hardcoded and can not be changed with
   *                               allowable_tags.
   *                               </p>
   *
   * @return string <p>The stripped string.</p>
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    // clean broken utf8 before the tags are removed
    $cleaned = self::clean($str);

    return strip_tags($cleaned, $allowable_tags);
  }
4291
4292
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>The string from which to get the position of the first occurrence of needle.</p>
   * @param string  $needle    <p>The string to find in haystack.</p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack to start searching;
   *                           <strong>null</strong> (the default) is treated as 0.
   *                           </p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found or if haystack / needle is empty.
   *                   </p>
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // "\mb_stripos" expects an integer offset: map the "null" default to 0 explicitly,
    // passing null to it is deprecated in newer PHP versions
    return \mb_stripos($haystack, $needle, (int)$offset, $encoding);
  }
4342
4343
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (the search is done by "\mb_stristr()").
   *
   * @param string  $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string  $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool    $before_needle [optional] <p>
   *                               If <b>TRUE</b>, the part of the haystack before the first
   *                               occurrence of the needle (excluding the needle) is returned.
   *                               </p>
   * @param string  $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is empty or not found.
   */
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $needle = (string)$needle;
    if ($needle === '') {
      return false;
    }

    $encoding = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    return \mb_stristr($haystack, $needle, $before_needle, $encoding);
  }
4376
4377
  /**
   * Get the string length, not the byte-length!
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       <p>The string being checked for length.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (only applied for UTF-8).</p>
   *
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)</p>
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;
    if ($str === '') {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $wantsUtf8 = $encoding === 'UTF-8' || $encoding === true || $encoding === false;
    if ($wantsUtf8) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // single-byte encodings: the byte-length is the string length
    // (the loose comparison is intentional, it mirrors a "switch" statement)
    if ($encoding == 'ASCII' || $encoding == 'CP850') {
      return strlen($str);
    }

    if ($cleanUtf8 === true && $encoding === 'UTF-8') {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4419
4420
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: natural order version of UTF8::strcasecmp(); both strings are passed
   *       through UTF8::strtocasefold() and then compared by UTF8::strnatcmp()
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4436
4437
  /**
   * String comparisons using a "natural order" algorithm
   *
   * INFO: natural order version of UTF8::strcmp()
   *
   * Strings that are identical after a string cast are reported as equal right away; otherwise both
   * are passed through UTF8::strtonatfold() and compared with the native "strnatcmp()".
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    return strnatcmp(self::strtonatfold($str1), self::strtonatfold($str2));
  }
4455
4456
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * Both strings are passed through UTF8::strtocasefold() and then compared by UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4473
4474
  /**
   * String comparison of the first n characters.
   *
   * Both strings are cut to their first $len characters with UTF8::substr()
   * and the results are compared by UTF8::strcmp().
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // UTF8::substr() may return false, but UTF8::strcmp() works on strings only:
    // treat a failed cut as an empty string instead of forwarding the boolean
    $str1 = (string)self::substr($str1, 0, $len);
    $str2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($str1, $str2);
  }
4494
4495
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>The string where char_list is looked for.</p>
   * @param string $char_list <p>The characters to look for; matching is case sensitive.</p>
   *
   * @return string|false <p>
   *                      The part of haystack starting at the first matching character,<br />
   *                      or false if an argument is empty or no character matches.
   *                      </p>
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    // nothing to search in, or nothing to search for
    if (!isset($haystack[0]) || !isset($char_list[0])) {
      return false;
    }

    $pattern = '/' . self::rxClass($char_list) . '/us';
    if (!preg_match($pattern, $haystack, $m)) {
      return false;
    }

    // byte-based functions are sufficient here: the offset is computed on,
    // and applied to, the very same byte string
    $firstHit = strpos($haystack, $m[0]);

    return substr($haystack, $firstHit);
  }
4520
4521
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string     $haystack  <p>The string being checked.</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found, or haystack / needle is empty, it returns false.
   *                   </p>
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // iconv and mbstring do not support integer $needle, so a non-negative
    // integer is converted into its character first. This must happen before
    // the string cast below, otherwise the strict comparison can never match
    // (same order as in strrpos()).
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    if (self::$support['iconv'] === true) {
      // a negative offset is replaced by 0 here, to keep the behaviour
      // stable across the php releases before 5.5.35 / 5.6.21 / 7.0.6
      return \iconv_strpos($haystack, $needle, $offset > 0 ? $offset : 0, $encoding);
    }

    // fallback: byte-wise search, then translate the byte offset into a
    // character offset by measuring everything left of the match

    if ($offset > 0) {
      // self::substr() returns false if $offset lies behind the end of the string
      $haystack = (string)self::substr($haystack, $offset);
    }

    if (($pos = strpos($haystack, $needle)) !== false) {
      $left = substr($haystack, 0, $pos);

      // negative offset not supported in PHP strpos(), ignoring
      return ($offset > 0 ? $offset : 0) + self::strlen($left);
    }

    return false;
  }
4599
4600
  /**
   * Finds the last occurrence of a character in a string within another.
   *
   * @link http://php.net/manual/en/function.mb-strrchr.php
   *
   * @param string  $haystack      <p>The string from which to get the last occurrence of needle.</p>
   * @param string  $needle        <p>The string to find in haystack.</p>
   * @param bool    $before_needle [optional] <p>
   *                               <b>true</b>: return everything from the beginning of haystack
   *                               up to the last occurrence of needle.<br />
   *                               <b>false</b>: return everything from the last occurrence of needle
   *                               to the end of haystack.
   *                               </p>
   * @param string  $encoding      [optional] <p>Character encoding name to use, "UTF-8" by default.</p>
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrchr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    if ($cleanUtf8 === true) {
      // clean both inputs before they are handed over to "\mb_strrchr"
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    // "UTF-8" is passed through as-is, every other name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrchr($haystack, $needle, $before_needle, $charset);
  }
4637
4638
  /**
   * Reverses characters order in the string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The string with its characters in reverse order; "" for empty input.</p>
   */
  public static function strrev($str)
  {
    $str = (string)$str;

    if (isset($str[0])) {
      // split into characters, flip the list and glue it back together
      $chars = self::split($str);

      return implode(array_reverse($chars));
    }

    return '';
  }
4655
4656
  /**
   * Finds the last occurrence of a character in a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string  $haystack      <p>The string from which to get the last occurrence of needle.</p>
   * @param string  $needle        <p>The string to find in haystack.</p>
   * @param bool    $before_needle [optional] <p>
   *                               <b>true</b>: return everything from the beginning of haystack
   *                               up to the last occurrence of needle.<br />
   *                               <b>false</b>: return everything from the last occurrence of needle
   *                               to the end of haystack.
   *                               </p>
   * @param string  $encoding      [optional] <p>Character encoding name to use, "UTF-8" by default.</p>
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrichr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    if ($cleanUtf8 === true) {
      // clean both inputs before they are handed over to "\mb_strrichr"
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    // "UTF-8" is passed through as-is, every other name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strrichr($haystack, $needle, $before_needle, $charset);
  }
4694
4695
  /**
   * Find position of last occurrence of a case-insensitive string.
   *
   * @param string  $haystack  <p>The string to look in.</p>
   * @param string  $needle    <p>The string to look for.</p>
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   If needle is not found, it returns false.
   *                   </p>
   */
  public static function strripos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // case-insensitivity is achieved by lower-casing both sides and then
    // delegating to the case-sensitive search
    $haystackLower = self::strtolower($haystack);
    $needleLower = self::strtolower($needle);

    return self::strrpos($haystackLower, $needleLower, $offset, $encoding, $cleanUtf8);
  }
4713
4714
  /**
   * Find position of last occurrence of a string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strrpos.php
   *
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle.</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>
   *                              May be specified to begin searching an arbitrary number of characters
   *                              into the string. Negative values will stop searching at an arbitrary
   *                              point prior to the end of the string.
   *                              </p>
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   If needle is not found, or haystack / needle is empty, it returns false.
   *                   </p>
   */
  public static function strrpos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // a non-negative integer needle is converted into its character
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    // INFO: the "bool"-check on $encoding is only a fallback for old versions
    if ($cleanUtf8 === true || $encoding === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $useDefaultEncoding = ($encoding === 'UTF-8' || $encoding === true || $encoding === false);
    $encoding = $useDefaultEncoding ? 'UTF-8' : self::normalize_encoding($encoding);

    // INFO: use "mb_"-function (with polyfill) also if we need another encoding
    if ($encoding !== 'UTF-8' || self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, $encoding);
    }

    // NOTE(review): this branch is guarded by the "iconv" flag although it
    // calls a grapheme_* function - confirm that this is intended
    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback: cut the haystack according to $offset, search byte-wise and
    // translate the byte offset into a character offset

    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $pos = strrpos($haystack, $needle);
    if ($pos === false) {
      return false;
    }

    $left = substr($haystack, 0, $pos);

    // negative offset not supported in PHP strpos(), ignoring
    return ($offset > 0 ? $offset : 0) + self::strlen($left);
  }
4802
4803
  /**
   * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
   * mask.
   *
   * @param string $str    <p>The input string.</p>
   * @param string $mask   <p>The mask of chars.</p>
   * @param int    $offset [optional] <p>Start position of the examined part of the string.</p>
   * @param int    $length [optional] <p>Length of the examined part of the string.</p>
   *
   * @return int <p>Number of leading characters that are part of the mask; 0 for empty input or mask.</p>
   */
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // init
    $offset = (int)$offset;
    $length = (int)$length;

    // only cut the string if something other than "the whole string" is requested
    if ($offset || $length !== 2147483647) {
      $str = self::substr($str, $offset, $length);
    }

    $str = (string)$str;
    if (!isset($str[0], $mask[0])) {
      return 0;
    }

    // anchor at the start and grab the longest run of mask characters
    if (preg_match('/^' . self::rxClass($mask) . '+/u', $str, $matches)) {
      return self::strlen($matches[0]);
    }

    return 0;
  }
4831
4832
  /**
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
   *
   * @param string  $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string  $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool    $before_needle [optional] <p>
   *                               If <b>TRUE</b>, strstr() returns the part of the
   *                               haystack before the first occurrence of the needle (excluding the needle).
   *                               </p>
   * @param string  $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
   */
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    if ($cleanUtf8 === true) {
      // clean both inputs before they are handed over to the search functions
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // the support flags are read below, so make sure they have been
    // populated - the same guard strpos(), strrpos() and substr() use
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4869
4870
  /**
   * Unicode transformation for case-less matching.
   *
   * @link http://unicode.org/reports/tr21/tr21-5.html
   *
   * @param string $str  <p>The input string.</p>
   * @param bool   $full <p>
   *                     <b>true</b> === apply $commonCaseFold, the full case folding table and strtolower
   *                     (default)<br />
   *                     <b>false</b> === apply only $commonCaseFold and strtolower
   *                     </p>
   *
   * @return string
   */
  public static function strtocasefold($str, $full = true)
  {
    // lookup tables are built once and then kept for later calls
    static $foldFrom = null;
    static $foldTo = null;
    static $fullFoldTable = null;

    if ($foldFrom === null) {
      $foldFrom = array_keys(self::$commonCaseFold);
      $foldTo = array_values(self::$commonCaseFold);
    }

    $str = str_replace($foldFrom, $foldTo, $str);

    if ($full) {
      if ($fullFoldTable === null) {
        $fullFoldTable = self::getData('caseFolding_full');
      }

      // index 0 is used as the search list, index 1 as the replacement list
      /** @noinspection OffsetOperationsInspection */
      $str = str_replace($fullFoldTable[0], $fullFoldTable[1], $str);
    }

    return self::strtolower(self::clean($str));
  }
4910
4911
  /**
   * Make a string lowercase.
   *
   * @link http://php.net/manual/en/function.mb-strtolower.php
   *
   * @param string  $str       <p>The string being lowercased.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>The result of "\mb_strtolower" for str; "" for empty input.</p>
   */
  public static function strtolower($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // init
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // clean the input before it is handed over to "\mb_strtolower"
      $str = self::clean($str);
    }

    // "UTF-8" is passed through as-is, every other name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtolower($str, $charset);
  }
4943
4944
  /**
   * Generic case sensitive transformation for collation matching.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The NFD-normalized string with all runs of "\p{Mn}" characters removed.</p>
   */
  private static function strtonatfold($str)
  {
    // normalize to form NFD first, then drop every run of "\p{Mn}" characters
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4955
4956
  /**
   * Make a string uppercase.
   *
   * @link http://php.net/manual/en/function.mb-strtoupper.php
   *
   * @param string  $str       <p>The string being uppercased.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>The result of "\mb_strtoupper" for str; "" for empty input.</p>
   */
  public static function strtoupper($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // clean the input before it is handed over to "\mb_strtoupper"
      $str = self::clean($str);
    }

    // "UTF-8" is passed through as-is, every other name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strtoupper($str, $charset);
  }
4987
4988
  /**
   * Translate characters or replace sub-strings.
   *
   * @link  http://php.net/manual/en/function.strtr.php
   *
   * @param string          $str  <p>The string being translated.</p>
   * @param string|string[] $from <p>
   *                              The characters to replace, or - if $to is omitted - the
   *                              replacement map that is handed to PHP's strtr() unchanged.
   *                              </p>
   * @param string|string[] $to   <p>The characters to translate to.</p>
   *
   * @return string <p>
   *                A copy of str in which each character of from is translated to the corresponding
   *                character in to; surplus characters of the longer argument are ignored.
   *                </p>
   */
  public static function strtr($str, $from, $to = INF)
  {
    // INF is the "no third argument" marker
    if (INF !== $to) {
      // NOTE(review): both values go through self::str_split() even though the
      // docblock allows arrays - confirm how str_split() treats array input
      $from = self::str_split($from);
      $to = self::str_split($to);

      $fromCount = count($from);
      $toCount = count($to);

      // cut the longer list down, so that both sides pair up one-to-one
      if ($fromCount !== $toCount) {
        $shared = ($fromCount < $toCount) ? $fromCount : $toCount;

        if ($fromCount > $shared) {
          $from = array_slice($from, 0, $shared);
        } else {
          $to = array_slice($to, 0, $shared);
        }
      }

      $from = array_combine($from, $to);
    }

    return strtr($str, $from);
  }
5021
5022
  /**
   * Return the width of a string.
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int <p>The result of "\mb_strwidth" for str.</p>
   */
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    // "UTF-8" is passed through as-is, every other name gets normalized
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return \mb_strwidth($str, $charset);
  }
5046
5047
  /**
   * Get part of a string.
   *
   * @link http://php.net/manual/en/function.mb-substr.php
   *
   * @param string  $str       <p>The string being checked.</p>
   * @param int     $start     <p>The first position used in str.</p>
   * @param int     $length    [optional] <p>The maximum length of the returned string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false <p>
   *                      The sub-string specified by the start and length parameters;<br />
   *                      "" for an empty input string;<br />
   *                      false if start is greater than the length of the string.
   *                      </p>
   */
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    // the character count is only needed for the bounds check below and as
    // the default for $length, so it is skipped when neither applies
    $str_length = 0;
    if ($start || $length === null) {
      $str_length = (int)self::strlen($str);
    }

    // a start position behind the end of the string is reported as false
    if ($start && $start > $str_length) {
      return false;
    }

    $length = ($length === null) ? $str_length : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $useDefaultEncoding = ($encoding === 'UTF-8' || $encoding === true || $encoding === false);
    $encoding = $useDefaultEncoding ? 'UTF-8' : self::normalize_encoding($encoding);

    if (self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return \iconv_substr($str, $start, $length, $encoding);
    }

    // fallback: split into single characters, take the requested slice of
    // that list and join it back into a string
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
5121
5122
  /**
5123
   * Binary safe comparison of two strings from an offset, up to length characters.
5124
   *
5125
   * @param string  $main_str           <p>The main string being compared.</p>
5126
   * @param string  $str                <p>The secondary string being compared.</p>
5127
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts counting from
5128
   *                                    the end of the string.</p>
5129
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the largest of
5130
   *                                    the length of the str compared to the length of main_str less the offset.</p>
5131
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
5132
   *                                    insensitive.</p>
5133
   *
5134
   * @return int
5135 1
   */
5136
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
5137 1
  {
5138 1
    $main_str = self::substr($main_str, $offset, $length);
5139
    $str = self::substr($str, 0, self::strlen($main_str));
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5138 can also be of type false; however, voku\helper\UTF8::strlen() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
5140 1
5141
    return $case_insensitivity === true ? self::strcasecmp($main_str, $str) : self::strcmp($main_str, $str);
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5138 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5139 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5138 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5139 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
5142
  }
5143
5144
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string  $haystack  <p>The string to search in.</p>
   * @param string  $needle    <p>The substring to search for.</p>
   * @param int     $offset    [optional] <p>The offset where to start counting.</p>
   * @param int     $length    [optional] <p>
   *                           The maximum length after the specified offset to search for the
   *                           substring. NULL (default) means "until the end of the haystack".
   *                           </p>
   * @param string  $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>This functions returns an integer or false if there isn't a string.</p>
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // compare $length strictly against null: "0" is a valid (explicit) length
    if ($offset || $length !== null) {
      $offset = (int)$offset;

      // before PHP 7.1 a non-positive "offset + length" is rejected
      if (
          Bootup::is_php('7.1') === false
          &&
          (int)$length + $offset <= 0
      ) {
        return false;
      }

      // keep NULL as "until the end of the string": casting it to 0 would cut the
      // haystack down to an empty string whenever only an offset was given
      if ($length !== null) {
        $length = (int)$length;
      }

      $haystack = self::substr($haystack, $offset, $length, $encoding);

      // "UTF8::substr()" can return false, nothing can be counted in that case
      if ($haystack === false) {
        $haystack = '';
      }
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($cleanUtf8 === true) {
      // invalid byte sequences are removed from both strings before counting
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5199
5200
  /**
   * Replace text within a portion of a string.
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * If $str is an array, the replacement is done for every element and the other
   * parameters are expanded (or cut) to the number of elements in $str. When $length is
   * omitted in that case, a length of 0 is used for every element, so the replacement is
   * inserted at $start.
   *
   * @param string|string[] $str         <p>The input string or an array of strings.</p>
   * @param string|string[] $replacement <p>The replacement string or an array of strings.</p>
   * @param int|int[]       $start
   * @param int|int[]|void  $length      [optional]
   *
   * @return string|string[]
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // $replacement
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start: non-integer values are replaced by 0
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length: missing values become 0, non-integer values become $num
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // a single input string can only take a single replacement: use the first element
    if (is_array($replacement)) {
      $replacement = count($replacement) > 0 ? reset($replacement) : '';
    }

    // split both strings into arrays of (multi-byte) characters
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    if ($length === null) {
      // replace everything up to the end of the string
      $length = count($smatches[0]);
    }

    array_splice($smatches[0], $start, $length, $rmatches[0]);

    // INFO: the separator has to be the first argument, the legacy order is not supported by PHP >= 8.0
    return implode('', $smatches[0]);
  }
5275
5276
  /**
   * Swap the case of every non-whitespace character of the string.
   *
   * A character that equals its own upper case variant is converted to lower case,
   * every other character is converted to upper case.
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>Each character's case swapped.</p>
   */
  public static function swapCase($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $input = (string)$str;
    if ($input === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($cleanUtf8 === true) {
      // remove invalid byte sequences before the string is processed
      $input = self::clean($input);
    }

    return preg_replace_callback(
        '/[\S]/u',
        function ($hit) use ($encoding) {
          $char = $hit[0];
          $upper = UTF8::strtoupper($char, $encoding);

          // unchanged by "strtoupper" => use the lower case variant instead
          return $char === $upper ? UTF8::strtolower($char, $encoding) : $upper;
        },
        $input
    );
  }
5319
5320
  /**
   * Alias of "UTF8::to_ascii()": all arguments are passed through unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         <p>The input string.</p>
   * @param string $subst_chr <p>Passed as the "$unknown" argument of "UTF8::to_ascii()".</p>
   * @param bool   $strict    <p>Passed as the "$strict" argument of "UTF8::to_ascii()".</p>
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?', $strict = false)
  {
    $ascii = self::to_ascii($s, $subst_chr, $strict);

    return $ascii;
  }
5335
5336
  /**
   * Alias of "UTF8::to_iso8859()": the argument is passed through unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string $str <p>The input, forwarded as it is.</p>
   *
   * @return string|string[]
   */
  public static function toIso8859($str)
  {
    $converted = self::to_iso8859($str);

    return $converted;
  }
5349
5350
  /**
   * Alias of "UTF8::to_latin1()": the argument is passed through unchanged.
   *
   * @see UTF8::to_latin1()
   *
   * @param string $str <p>The input, forwarded as it is.</p>
   *
   * @return string
   */
  public static function toLatin1($str)
  {
    $converted = self::to_latin1($str);

    return $converted;
  }
5363
5364
  /**
   * Alias of "UTF8::to_utf8()": the argument is passed through unchanged.
   *
   * @see UTF8::to_utf8()
   *
   * @param string $str <p>The input, forwarded as it is.</p>
   *
   * @return string
   */
  public static function toUTF8($str)
  {
    $converted = self::to_utf8($str);

    return $converted;
  }
5377
5378
  /**
   * Convert a string into ASCII.
   *
   * The string is cleaned first. If it still contains non-ASCII characters, every such
   * character is decoded into its code point and looked up in a per-"bank" table
   * (code point >> 8) that is loaded on demand from the "data" directory next to this file.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
   * @param bool   $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad
   *                        performance</p>
   *
   * @return string
   *
   * @throws \Exception <p>If $strict is true, but "intl" is not supported or PHP is older than 5.4.</p>
   */
  public static function to_ascii($str, $unknown = '?', $strict = false)
  {
    // lookup tables, cached between the calls: array(bank => array(low byte => replacement))
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str, false, true, true);

    // check if we only have ASCII
    if (self::is_ascii($str) === true) {
      return $str;
    }

    if ($strict === true) {
      if (!isset(self::$support['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
      }

      if (self::$support['intl'] === true && Bootup::is_php('5.4')) {
        $str = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

        // check again, if we only have ASCII, now ...
        if (self::is_ascii($str) === true) {
          return $str;
        }

      } else {
        throw new \Exception('Intl is not supported or you use PHP < 5.4!');
      }
    }

    // split the string into single (multi-byte) characters
    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = $ar[0];
    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // decode the code point from the lead byte + the continuation bytes,
      // "$ord" is reset for every character so that a value from a previous
      // character can never be reused by accident
      $ord = null;
      if ($ordC0 >= 192 && $ordC0 <= 223) {
        $ord = ($ordC0 - 192) * 64 + (ord($c[1]) - 128);
      } elseif ($ordC0 >= 224 && $ordC0 <= 239) {
        $ord = ($ordC0 - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
      } elseif ($ordC0 >= 240 && $ordC0 <= 247) {
        $ord = ($ordC0 - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
      } elseif ($ordC0 >= 248 && $ordC0 <= 251) {
        $ord = ($ordC0 - 248) * 16777216 + (ord($c[1]) - 128) * 262144 + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64 + (ord($c[4]) - 128);
      } elseif ($ordC0 >= 252 && $ordC0 <= 253) {
        $ord = ($ordC0 - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216 + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096 + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128);
      }

      // no valid lead byte (128-191 or 254-255)
      if ($ord === null) {
        $c = $unknown;
        continue;
      }

      $bank = $ord >> 8;
      if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          // NOTE(review): the included file is expected to fill "$UTF8_TO_ASCII[$bank]" - confirm
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        }

        // no bank file, or the file did not define the table for this bank
        if (!isset($UTF8_TO_ASCII[$bank])) {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    // break the reference to the last element
    unset($c);

    return implode('', $chars);
  }
5506
5507
  /**
   * Convert a string into "ISO-8859"-encoding (Latin-1).
   *
   * Arrays are converted element by element (recursively), the keys are kept.
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function to_iso8859($str)
  {
    if (is_array($str)) {
      $converted = array();

      /** @noinspection ForeachSourceInspection */
      foreach ($str as $key => $value) {
        $converted[$key] = self::to_iso8859($value);
      }

      return $converted;
    }

    $input = (string)$str;

    // nothing to decode in an empty string
    return $input === '' ? '' : self::utf8_decode($input);
  }
5536
5537
  /**
   * Alias of "UTF8::to_iso8859()": the argument is passed through unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function to_latin1($str)
  {
    $converted = self::to_iso8859($str);

    return $converted;
  }
5550
5551
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It decodes UTF-8 codepoints and unicode escape sequences.
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   * is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * @param string|string[] $str <p>Any string or array.</p>
   *
   * @return string|string[] <p>The UTF-8 encoded string.</p>
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      /** @noinspection ForeachSourceInspection */
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];
      $ordC1 = ord($c1);

      // 7-bit ASCII: it doesn't need conversion
      if ($ordC1 < 0x80) {
        $buf .= $c1;
        continue;
      }

      // 0x80 - 0xBF: a continuation byte without a lead byte, needs conversion
      if ($ordC1 < 0xC0) {
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
        } else {
          $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
        }
        continue;
      }

      // >= 0xC0: should be converted to UTF8, if it's not UTF8 already
      if ($ordC1 <= 0xDF) {
        $tail = 1; // looks like 2 bytes UTF8
      } elseif ($ordC1 <= 0xEF) {
        $tail = 2; // looks like 3 bytes UTF8
      } elseif ($ordC1 <= 0xF7) {
        $tail = 3; // looks like 4 bytes UTF8
      } else {
        $tail = 0; // doesn't look like UTF8, but should be converted
      }

      // the byte is only kept if it is followed by the expected number of continuation bytes (0x80 - 0xBF)
      $sequence = $c1;
      $isUtf8 = $tail > 0 && $i + $tail < $max;
      for ($k = 1; $isUtf8 && $k <= $tail; $k++) {
        $cN = $str[$i + $k];
        $ordCN = ord($cN);
        $isUtf8 = $ordCN >= 0x80 && $ordCN <= 0xBF;
        $sequence .= $cN;
      }

      if ($isUtf8) { // yeah, almost sure it's UTF8 already
        $buf .= $sequence;
        $i += $tail;
      } else { // not valid UTF8 - convert only the current byte
        // INFO: integer bit-shift instead of "ord($c1) / 64", so that no float is passed to "chr()"
        $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
      }
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode UTF-8 codepoints
    // NOTE(review): the "HTML-ENTITIES" encoding of mbstring is deprecated in newer PHP versions - confirm before upgrading
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5680
5681
  /**
   * Strip whitespace or other characters from beginning or end of a UTF-8 string.
   *
   * INFO: This is slower than "trim()"
   *
   * We can only use the original-function, if we use <= 7-Bit in the string / chars
   * but the check for ASCII (7-Bit) costs more time than we can save here.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars [optional] <p>Optional characters to be stripped. If not given (or empty), characters of
   *                      the unicode categories "Z" (separators) and "C" (control chars etc.) are stripped.</p>
   *
   * @return string <p>The trimmed string.</p>
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    // INFO: the string "0" is a valid character list, even if PHP treats it as "empty"
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
    }

    return self::rtrim(self::ltrim($str, $chars), $chars);
  }
5709
5710
  /**
   * Convert the first character of the string to upper case.
   *
   * The string is split into its first character and the rest, only the first
   * character is passed to "UTF8::strtoupper()".
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function ucfirst($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $firstChar = self::substr($str, 0, 1, $encoding, $cleanUtf8);
    $remainder = self::substr($str, 1, null, $encoding, $cleanUtf8);

    return self::strtoupper($firstChar, $encoding, $cleanUtf8) . $remainder;
  }
5723
5724
  /**
   * Alias of "UTF8::ucfirst()": all arguments are passed through unchanged.
   *
   * @see UTF8::ucfirst()
   *
   * @param string  $word
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string
   */
  public static function ucword($word, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $result = self::ucfirst($word, $encoding, $cleanUtf8);

    return $result;
  }
5739
5740
  /**
   * Uppercase for all words in the string.
   *
   * The string is split into words and the parts between them, every part that is not listed
   * in $exceptions is passed to "UTF8::ucfirst()" and all parts are joined again.
   *
   * @param string   $str        <p>The input string.</p>
   * @param string[] $exceptions [optional] <p>Exclusion for some words.</p>
   * @param string   $charlist   [optional] <p>Additional chars that contains to words and do not start a new word.</p>
   * @param string   $encoding   [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean  $cleanUtf8  [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string
   */
  public static function ucwords($str, $exceptions = array(), $charlist = '', $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // INFO: check for an empty string explicitly, "0" is a valid input
    if (!isset($str[0])) {
      return '';
    }

    $charlist = self::rxClass($charlist, '\pL');
    $words = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    $newwords = array();

    $useExceptions = count($exceptions) > 0;

    foreach ($words as $word) {

      // skip only really empty parts: a part like "0" must not be removed from the result
      if ($word === '') {
        continue;
      }

      if (
          $useExceptions === false
          ||
          !in_array($word, $exceptions, true)
      ) {
        $word = self::ucfirst($word, $encoding, $cleanUtf8);
      }

      $newwords[] = $word;
    }

    return implode('', $newwords);
  }
5790
5791
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'Düsseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str          <p>The input string.</p>
   * @param bool   $multi_decode <p>Decode as often as possible.</p>
   *
   * @return string
   */
  public static function urldecode($str, $multi_decode = true)
  {
    $decoded = (string)$str;
    if ($decoded === '') {
      return '';
    }

    // "%uXXXX" sequences are rewritten into hexadecimal html entities
    $decoded = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', urldecode($decoded));

    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    } else {
      $flags = ENT_QUOTES;
    }

    // decode again and again, until the string does not change anymore (or only once, without "$multi_decode")
    do {
      $before = $decoded;

      $asUtf8 = self::to_utf8($decoded);
      $withoutEntities = self::html_entity_decode($asUtf8, $flags);
      $decoded = self::fix_simple_utf8(rawurldecode($withoutEntities));

    } while ($multi_decode === true && $before !== $decoded);

    return (string)$decoded;
  }
5837
5838
  /**
   * Returns an array that maps "urlencoded"-win1252 sequences ("%20" - "%FF") to UTF-8 strings.
   *
   * The keys are upper-case percent-escapes in ascending byte order, the values are
   * the replacement strings. Bytes without a replacement are mapped to an empty string.
   *
   * NOTE(review): "%7F" (DEL), "%A0" (NO-BREAK SPACE) and "%AD" (SOFT HYPHEN) are empty
   * strings in this table; confirm that this is intended and that no invisible
   * characters were lost from the literals.
   *
   * @deprecated use the "UTF8::urldecode()" function to decode a string
   *
   * @return array
   */
  public static function urldecode_fix_win1252_chars()
  {
    static $array = array(
        '%20' => ' ',
        '%21' => '!',
        '%22' => '"',
        '%23' => '#',
        '%24' => '$',
        '%25' => '%',
        '%26' => '&',
        '%27' => "'",
        '%28' => '(',
        '%29' => ')',
        '%2A' => '*',
        '%2B' => '+',
        '%2C' => ',',
        '%2D' => '-',
        '%2E' => '.',
        '%2F' => '/',
        '%30' => '0',
        '%31' => '1',
        '%32' => '2',
        '%33' => '3',
        '%34' => '4',
        '%35' => '5',
        '%36' => '6',
        '%37' => '7',
        '%38' => '8',
        '%39' => '9',
        '%3A' => ':',
        '%3B' => ';',
        '%3C' => '<',
        '%3D' => '=',
        '%3E' => '>',
        '%3F' => '?',
        '%40' => '@',
        '%41' => 'A',
        '%42' => 'B',
        '%43' => 'C',
        '%44' => 'D',
        '%45' => 'E',
        '%46' => 'F',
        '%47' => 'G',
        '%48' => 'H',
        '%49' => 'I',
        '%4A' => 'J',
        '%4B' => 'K',
        '%4C' => 'L',
        '%4D' => 'M',
        '%4E' => 'N',
        '%4F' => 'O',
        '%50' => 'P',
        '%51' => 'Q',
        '%52' => 'R',
        '%53' => 'S',
        '%54' => 'T',
        '%55' => 'U',
        '%56' => 'V',
        '%57' => 'W',
        '%58' => 'X',
        '%59' => 'Y',
        '%5A' => 'Z',
        '%5B' => '[',
        '%5C' => '\\',
        '%5D' => ']',
        '%5E' => '^',
        '%5F' => '_',
        '%60' => '`',
        '%61' => 'a',
        '%62' => 'b',
        '%63' => 'c',
        '%64' => 'd',
        '%65' => 'e',
        '%66' => 'f',
        '%67' => 'g',
        '%68' => 'h',
        '%69' => 'i',
        '%6A' => 'j',
        '%6B' => 'k',
        '%6C' => 'l',
        '%6D' => 'm',
        '%6E' => 'n',
        '%6F' => 'o',
        '%70' => 'p',
        '%71' => 'q',
        '%72' => 'r',
        '%73' => 's',
        '%74' => 't',
        '%75' => 'u',
        '%76' => 'v',
        '%77' => 'w',
        '%78' => 'x',
        '%79' => 'y',
        '%7A' => 'z',
        '%7B' => '{',
        '%7C' => '|',
        '%7D' => '}',
        '%7E' => '~',
        '%7F' => '',
        // 0x80 is the EURO SIGN in win1252 (it used to be mapped to a grave accent,
        // which duplicated "%60" and contradicted self::$win1252ToUtf8).
        '%80' => '€',
        '%81' => '',
        '%82' => '‚',
        '%83' => 'ƒ',
        '%84' => '„',
        '%85' => '…',
        '%86' => '†',
        '%87' => '‡',
        '%88' => 'ˆ',
        '%89' => '‰',
        '%8A' => 'Š',
        '%8B' => '‹',
        '%8C' => 'Œ',
        '%8D' => '',
        '%8E' => 'Ž',
        '%8F' => '',
        '%90' => '',
        '%91' => '‘',
        '%92' => '’',
        '%93' => '“',
        '%94' => '”',
        '%95' => '•',
        '%96' => '–',
        '%97' => '—',
        '%98' => '˜',
        '%99' => '™',
        '%9A' => 'š',
        '%9B' => '›',
        '%9C' => 'œ',
        '%9D' => '',
        '%9E' => 'ž',
        '%9F' => 'Ÿ',
        '%A0' => '',
        '%A1' => '¡',
        '%A2' => '¢',
        '%A3' => '£',
        '%A4' => '¤',
        '%A5' => '¥',
        '%A6' => '¦',
        '%A7' => '§',
        '%A8' => '¨',
        '%A9' => '©',
        '%AA' => 'ª',
        '%AB' => '«',
        '%AC' => '¬',
        '%AD' => '',
        '%AE' => '®',
        '%AF' => '¯',
        '%B0' => '°',
        '%B1' => '±',
        '%B2' => '²',
        '%B3' => '³',
        '%B4' => '´',
        '%B5' => 'µ',
        '%B6' => '¶',
        '%B7' => '·',
        '%B8' => '¸',
        '%B9' => '¹',
        '%BA' => 'º',
        '%BB' => '»',
        '%BC' => '¼',
        '%BD' => '½',
        '%BE' => '¾',
        '%BF' => '¿',
        '%C0' => 'À',
        '%C1' => 'Á',
        '%C2' => 'Â',
        '%C3' => 'Ã',
        '%C4' => 'Ä',
        '%C5' => 'Å',
        '%C6' => 'Æ',
        '%C7' => 'Ç',
        '%C8' => 'È',
        '%C9' => 'É',
        '%CA' => 'Ê',
        '%CB' => 'Ë',
        '%CC' => 'Ì',
        '%CD' => 'Í',
        '%CE' => 'Î',
        '%CF' => 'Ï',
        '%D0' => 'Ð',
        '%D1' => 'Ñ',
        '%D2' => 'Ò',
        '%D3' => 'Ó',
        '%D4' => 'Ô',
        '%D5' => 'Õ',
        '%D6' => 'Ö',
        '%D7' => '×',
        '%D8' => 'Ø',
        '%D9' => 'Ù',
        '%DA' => 'Ú',
        '%DB' => 'Û',
        '%DC' => 'Ü',
        '%DD' => 'Ý',
        '%DE' => 'Þ',
        '%DF' => 'ß',
        '%E0' => 'à',
        '%E1' => 'á',
        '%E2' => 'â',
        '%E3' => 'ã',
        '%E4' => 'ä',
        '%E5' => 'å',
        '%E6' => 'æ',
        '%E7' => 'ç',
        '%E8' => 'è',
        '%E9' => 'é',
        '%EA' => 'ê',
        '%EB' => 'ë',
        '%EC' => 'ì',
        '%ED' => 'í',
        '%EE' => 'î',
        '%EF' => 'ï',
        '%F0' => 'ð',
        '%F1' => 'ñ',
        '%F2' => 'ò',
        '%F3' => 'ó',
        '%F4' => 'ô',
        '%F5' => 'õ',
        '%F6' => 'ö',
        '%F7' => '÷',
        '%F8' => 'ø',
        '%F9' => 'ù',
        '%FA' => 'ú',
        '%FB' => 'û',
        '%FC' => 'ü',
        '%FD' => 'ý',
        '%FE' => 'þ',
        '%FF' => 'ÿ',
    );

    return $array;
  }
6076 6
6077 3
  /**
   * Decodes an UTF-8 string to ISO-8859-1.
   *
   * The input is normalized via "UTF8::to_utf8()", the keys of self::$utf8ToWin1252
   * are replaced by their values and the result is handed to "Xml::utf8_decode()".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The decoded string, or an empty string for empty input.</p>
   */
  public static function utf8_decode($str)
  {
    // Search / replace lists are derived from the mapping table once per request.
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    if ($search === null) {
      $search = array_keys(self::$utf8ToWin1252);
      $replace = array_values(self::$utf8ToWin1252);
    }

    $normalized = self::to_utf8($str);
    $mapped = str_replace($search, $replace, $normalized);

    return Xml::utf8_decode($mapped);
  }
6105
6106 5
  /**
   * Encodes an ISO-8859-1 string to UTF-8.
   *
   * After the native conversion, the keys of self::$cp1252ToUtf8 are replaced by
   * their values (presumably the win1252-only characters; verify against the table).
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function utf8_encode($str)
  {
    // Search / replace lists are derived from the mapping table once per request.
    static $search = null;
    static $replace = null;

    $encoded = \utf8_encode($str);

    // Without a "\xC2" byte the replacement step is skipped entirely.
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if ($search === null) {
      $search = array_keys(self::$cp1252ToUtf8);
      $replace = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($search, $replace, $encoded);
  }
6132
6133
  /**
   * Fix utf8-win1252 chars; a thin alias that forwards to "UTF8::fix_simple_utf8()".
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever "UTF8::fix_simple_utf8()" returns for the input.</p>
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   */
  public static function utf8_fix_win1252_chars($str)
  {
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
6146 1
6147
  /**
   * Returns the class-internal table of utf8 whitespace characters.
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E. [email protected]
   *
   * @return array <p>
   *               An array with all known whitespace characters as values and the type of whitespace as keys,
   *               as defined in the URL above.
   *               </p>
   */
  public static function whitespace_table()
  {
    $table = self::$whitespaceTable;

    return $table;
  }
6163 1
6164
  /**
   * Limit the number of words in a string.
   *
   * A "word" is a run of non-whitespace characters. If the input holds more words
   * than allowed, it is cut after the last allowed word, right-trimmed and $strAddOn is appended.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $words    <p>The limit of words as integer.</p>
   * @param string $strAddOn <p>Replacement for the stripped part of the string.</p>
   *
   * @return string <p>
   *                An empty string for empty input or a limit below 1, the unchanged input
   *                if nothing had to be cut, otherwise the shortened string plus $strAddOn.
   *                </p>
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if ('' === $str) {
      return '';
    }

    $limit = (int)$words;

    if ($limit < 1) {
      return '';
    }

    // Optional leading whitespace, then 1..$limit words with their trailing whitespace.
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $limit . '}/u';
    preg_match($pattern, $str, $matches);

    // Only shorten when the pattern matched and did not already cover the whole string.
    if (isset($matches[0]) && self::strlen($str) !== self::strlen($matches[0])) {
      return self::rtrim($matches[0]) . $strAddOn;
    }

    return $str;
  }
6199
6200
  /**
   * Wraps a string to a given number of characters
   *
   * The string is turned into a single-byte "mask" (one placeholder per character),
   * the native wordwrap() decides on that mask where to break, and the result is
   * rebuilt from the real characters according to the break positions.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>The input string.</p>
   * @param int    $width [optional] <p>The column width.</p>
   * @param string $break [optional] <p>The line is broken using the optional break parameter.</p>
   * @param bool   $cut   [optional] <p>
   *                      If the cut is set to true, the string is
   *                      always wrapped at or before the specified width. So if you have
   *                      a word that is larger than the given width, it is broken apart.
   *                      </p>
   *
   * @return string <p>
   *                The given string wrapped at the specified column,
   *                or an empty string if $str or $break is empty.
   *                </p>
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    // Both the text and the break sequence must be non-empty.
    if (!isset($str[0], $break[0])) {
      return '';
    }

    // $glyphs holds the real characters (and existing breaks) in order;
    // $mask mirrors it: "#" = existing break, " " = space, "?" = any other character.
    $glyphs = array();
    $mask = '';

    foreach (explode($break, $str) as $segmentIndex => $segment) {
      if ($segmentIndex > 0) {
        $glyphs[] = $break;
        $mask .= '#';
      }

      foreach (self::split($segment) as $glyph) {
        $glyphs[] = $glyph;
        $mask .= (' ' === $glyph) ? ' ' : '?';
      }
    }

    $wrappedMask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $glyphPos = 0;
    $maskPos = -1;
    $breakPos = -1;

    while (false !== ($breakPos = self::strpos($wrappedMask, '#', $breakPos + 1))) {
      // Copy every character in front of the current break position.
      for (++$maskPos; $maskPos < $breakPos; ++$maskPos) {
        $wrapped .= $glyphs[$glyphPos];
        unset($glyphs[$glyphPos]);
        ++$glyphPos;
      }

      // The break stands for an original break or a space: drop that character.
      // Otherwise it was inserted in the middle of a word and nothing is dropped.
      if ($break === $glyphs[$glyphPos] || ' ' === $glyphs[$glyphPos]) {
        unset($glyphs[$glyphPos]);
        ++$glyphPos;
      }

      $wrapped .= $break;
    }

    // Whatever is left belongs to the last line.
    return $wrapped . implode('', $glyphs);
  }
6267
6268
  /**
   * Returns the class-internal array of Unicode White Space characters.
   *
   * @return array <p>An array with numeric code point as key and White Space Character as value.</p>
   */
  public static function ws()
  {
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6277
6278
}
6279