Completed
Push — master ( 986a34...f09666 )
by Lars
17:54 queued 03:04
created

UTF8::html_entity_decode()   C

Complexity

Conditions 12
Paths 9

Size

Total Lines 64
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 12.0025

Importance

Changes 9
Bugs 3 Features 3
Metric Value
c 9
b 3
f 3
dl 0
loc 64
ccs 37
cts 38
cp 0.9737
rs 6.0561
cc 12
eloc 37
nc 9
nop 3
crap 12.0025

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
   * Bom => Byte-Length
   *
   * Lookup table of byte-order-mark byte sequences and their length in bytes.
   * Besides the raw BOMs it also lists BOMs as they look after having been
   * mis-read as "WINDOWS-1252" and stored as UTF-8 again; in the entries below
   * each original BOM byte then occupies two bytes.
   *
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
   *
   * @var array
   */
  private static $bom = array(
      "\xef\xbb\xbf"             => 3, // UTF-8 BOM
      // written with escapes so the three mis-decoded characters cannot get lost in editors / tooling
      "\xc3\xaf\xc2\xbb\xc2\xbf" => 6, // UTF-8 BOM as "WINDOWS-1252" (one char has [maybe] more than one byte ...)
      "\x00\x00\xfe\xff"         => 4, // UTF-32 (BE) BOM
      "\xff\xfe\x00\x00"         => 4, // UTF-32 (LE) BOM
      "\xfe\xff"                 => 2, // UTF-16 (BE) BOM
      'þÿ'                       => 4, // UTF-16 (BE) BOM as "WINDOWS-1252"
      "\xff\xfe"                 => 2, // UTF-16 (LE) BOM
      'ÿþ'                       => 4, // UTF-16 (LE) BOM as "WINDOWS-1252"
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    // HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
   * Constructor.
   *
   * Creating an instance triggers the environment detection
   * performed by UTF8::checkForSupport().
   */
  public function __construct()
  {
    self::checkForSupport();
  }
811
812
  /**
   * Fetch the single character located at a given position, similar to $str[1]
   * but delegated to UTF8::substr() with a length of one.
   *
   * @param string $str <p>A UTF-8 string.</p>
   * @param int    $pos <p>The position of character to return.</p>
   *
   * @return string <p>Single Multi-Byte character.</p>
   */
  public static function access($str, $pos)
  {
    $length = 1;
    $character = self::substr($str, $pos, $length);

    return $character;
  }
824
825
  /**
   * Returns the string with the UTF-8 BOM in front of it.
   *
   * INFO: If UTF8::string_has_bom() does not report false for the input,
   *       the input string is returned unchanged.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The output string that contains BOM.</p>
   */
  public static function add_bom_to_string($str)
  {
    // anything other than a strict "false" counts as "BOM already present"
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    return self::bom() . $str;
  }
842
843
  /**
   * Convert a string of binary digits ("0" / "1") into the bytes it represents.
   *
   * The digits are read as one big-endian number: leading zeros are dropped and
   * the remaining digits are left-padded to whole bytes, so "1010", "00001010"
   * and "0000000000001010" all yield "\n". An input without any "1" digit
   * yields a single NUL byte. Characters other than "0" and "1" are ignored.
   *
   * @param mixed $bin 1|0
   *
   * @return string
   */
  public static function binary_to_str($bin)
  {
    // keep only binary digits and strip leading zeros, like a numeric conversion would
    $bits = ltrim(preg_replace('/[^01]/', '', (string)$bin), '0');
    if ($bits === '') {
      $bits = '0';
    }

    // left-pad to a multiple of 8 so that every chunk below is one complete byte
    $padding = (8 - strlen($bits) % 8) % 8;
    $bits = str_repeat('0', $padding) . $bits;

    // decode byte by byte: no float precision limit and no dangling half-byte
    $str = '';
    foreach (str_split($bits, 8) as $byte) {
      $str .= chr(bindec($byte));
    }

    return $str;
  }
854
855
  /**
   * Provides the three bytes (EF BB BF) of the UTF-8 Byte Order Mark.
   *
   * @return string UTF-8 Byte Order Mark
   */
  public static function bom()
  {
    $utf8Bom = "\xEF\xBB\xBF";

    return $utf8Bom;
  }
864
865
  /**
   * Forwards its arguments unchanged to UTF8::chr_map() and returns that result.
   *
   * @alias of UTF8::chr_map()
   * @see   UTF8::chr_map()
   *
   * @param string|array $callback
   * @param string       $str
   *
   * @return array
   */
  public static function callback($callback, $str)
  {
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
   * Detects which UTF-8 related features are available and stores the
   * findings in self::$support (mbstring, iconv, intl, intlChar, pcre_utf8).
   *
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
   *       The detection runs only once; later calls return immediately.
   */
  public static function checkForSupport()
  {
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      // detection already done
      return;
    }

    // mark as checked before the individual probes are run
    self::$support['already_checked_via_portable_utf8'] = true;

    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
   * Generates a UTF-8 encoded character from the given code point.
   *
   * INFO: opposite to UTF8::ord()
   *
   * If the "intlChar" support flag is set, the call is delegated to \IntlChar::chr().
   * Otherwise the bytes are assembled by hand and memoized in a static cache; in that
   * path a value above U+10FFFF yields U+FFFD (REPLACEMENT CHARACTER).
   *
   * @param int $code_point <p>The code point for which to generate a character.</p>
   *
   * @return string|null <p>Multi-Byte character, returns null on failure to encode
   *                     (e.g. $code_point is not an integer or is negative).</p>
   */
  public static function chr($code_point)
  {
    // only genuine integers are accepted (no numeric strings, floats, ...)
    if ((int)$code_point !== $code_point) {
      return null;
    }

    // a negative number is not a code point; the native chr() would silently
    // wrap it into the 0..255 range and hand back an unrelated single byte
    if ($code_point < 0) {
      return null;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      return \IntlChar::chr($code_point);
    }

    // use static cache, if there is no support for "IntlChar"
    static $cache = array();
    if (isset($cache[$code_point]) === true) {
      return $cache[$code_point];
    }

    // 1 byte: 0xxxxxxx
    if ($code_point <= 0x7f) {
      return $cache[$code_point] = chr($code_point);
    }

    // 2 bytes: 110xxxxx 10xxxxxx
    if ($code_point <= 0x7ff) {
      return $cache[$code_point] = chr(0xc0 | ($code_point >> 6)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0xffff) {
      return $cache[$code_point] = chr(0xe0 | ($code_point >> 12)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0x10ffff) {
      return $cache[$code_point] = chr(0xf0 | ($code_point >> 18)) .
                                   chr(0x80 | (($code_point >> 12) & 0x3f)) .
                                   chr(0x80 | (($code_point >> 6) & 0x3f)) .
                                   chr(0x80 | ($code_point & 0x3f));
    }

    # U+FFFD REPLACEMENT CHARACTER
    return $cache[$code_point] = "\xEF\xBF\xBD";
  }
954
955
  /**
   * Runs a callback on every character of a string: the string is cut into
   * characters via UTF8::split() and the callback is mapped over the pieces.
   *
   * @param string|array $callback <p>The callback function.</p>
   * @param string       $str      <p>UTF-8 string to run callback on.</p>
   *
   * @return array <p>The outcome of callback.</p>
   */
  public static function chr_map($callback, $str)
  {
    return array_map($callback, self::split($str));
  }
969
970
  /**
   * Generates an array of byte length of each character of a Unicode string.
   *
   * 1 byte => U+0000  - U+007F
   * 2 byte => U+0080  - U+07FF
   * 3 byte => U+0800  - U+FFFF
   * 4 byte => U+10000 - U+10FFFF
   *
   * The string is cut into characters via UTF8::split() and strlen() is applied
   * to every piece. Empty input ('', null, false, ...) yields an empty array.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return array <p>An array of byte lengths of each character.</p>
   */
  public static function chr_size_list($str)
  {
    // "0" (and 0) are falsy in PHP but still consist of one character,
    // so they must not be mistaken for empty input
    $isZero = ($str === '0' || $str === 0);
    if (!$str && !$isZero) {
      return array();
    }

    return array_map('strlen', self::split($str));
  }
990
991
  /**
   * Get a decimal code representation of a specific character.
   *
   * The first byte decides how many bytes are read (1 to 4); the payload bits
   * of the lead byte and of the following continuation bytes are then combined.
   * If the first byte matches none of the multi-byte lead patterns, the value
   * obtained for that byte is returned as is. An empty input yields 0.
   *
   * @param string $char <p>The input character.</p>
   *
   * @return int
   */
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;

    // nothing to decode; also avoids reading the non-existing offset 0
    if ($char === '') {
      return 0;
    }

    $code = self::ord($char[0]);
    $bytes = 1;

    if (!($code & 0x80)) {
      // 0xxxxxxx
      return $code;
    }

    if (($code & 0xe0) === 0xc0) {
      // 110xxxxx
      $bytes = 2;
      $code &= ~0xc0;
    } elseif (($code & 0xf0) === 0xe0) {
      // 1110xxxx
      $bytes = 3;
      $code &= ~0xe0;
    } elseif (($code & 0xf8) === 0xf0) {
      // 11110xxx
      $bytes = 4;
      $code &= ~0xf0;
    }

    // append the low bits of every continuation byte
    for ($i = 2; $i <= $bytes; $i++) {
      // 10xxxxxx
      $code = ($code << 6) + (self::ord($char[$i - 1]) & ~0x80);
    }

    return $code;
  }
1030
1031
  /**
   * Get hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * The character is converted to its integer value with self::ord() and
   * then formatted by self::int_to_hex().
   *
   * @param string $char <p>The input character</p>
   * @param string $pfix [optional] <p>Prefix handed on to self::int_to_hex().</p>
   *
   * @return string <p>The code point encoded as U+xxxx</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1043
1044
  /**
   * Splits a string into smaller chunks and joins them with the given line ending.
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) placed between the chunks.</p>
   *
   * @return string <p>The chunked string</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    return implode($end, $chunks);
  }
1057
1058
  /**
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
   *
   * Processing order: strip byte sequences that are not well-formed, drop the
   * replacement character, remove invisible characters, then apply the
   * optional normalization steps selected by the flags.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings, hence the bounded {1,100} repetition

    $pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';

    // Only the first capture group (well-formed sequences) survives the replacement.
    $str = preg_replace($pattern, '$1', $str);

    $str = self::remove_invisible_characters(
        self::replace_diamond_question_mark($str, '')
    );

    if (true === $normalize_whitespace) {
      $str = self::normalize_whitespace($str, $keep_non_breaking_space);
    }

    if (true === $normalize_msword) {
      $str = self::normalize_msword($str);
    }

    if (true === $remove_bom) {
      $str = self::removeBOM($str);
    }

    return $str;
  }
1106
1107
  /**
   * Clean-up a string and show only printable UTF-8 chars at the end + fix UTF-8 encoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The cleaned string; an empty string for empty input.</p>
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // step 1: fixed ISO <-> UTF-8 Errors
    $fixed = self::fix_simple_utf8($str);

    // step 2, via clean():
    // - remove all none UTF-8 symbols
    // - remove diamond question mark (�)
    // - remove invisible characters (e.g. "\0")
    // - remove BOM
    // - normalize whitespace chars (but keep non-breaking-spaces)
    return (string)self::clean($fixed, true, true, false, true);
  }
1134
1135
  /**
   * Accepts a string or a array of strings and returns an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * A string argument is first split into characters; every element is then
   * mapped through UTF8::ord() and, on request, through UTF8::int_to_hex().
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If True, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    $elements = is_string($arg) ? self::split($arg) : $arg;

    $codePoints = array_map(array(__CLASS__, 'ord'), $elements);

    if (!$u_style) {
      return $codePoints;
    }

    return array_map(array(__CLASS__, 'int_to_hex'), $codePoints);
  }
1172
1173
  /**
   * Returns count of characters used in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array of Character as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    $characters = self::split($str, 1, $cleanUtf8);

    return array_count_values($characters);
  }
1186
1187
  /**
   * Get a UTF-8 character from its decimal code representation.
   *
   * The code is written as a hexadecimal HTML entity which mbstring then
   * converts into the UTF-8 character.
   *
   * @param int $code
   *
   * @return string
   */
  public static function decimal_to_chr($code)
  {
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1202
1203
  /**
   * Encode a string with a new charset-encoding.
   *
   * INFO:  The different to "UTF8::utf8_encode()" is that this function, try to fix also broken / double encoding,
   *        so you can call this function also on a UTF-8 String and you don't mess the string.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>Force the new encoding (we try to fix broken / double encoding for UTF-8)<br
   *                         /> otherwise we auto-detect the current string-encoding</p>
   *
   * @return string <p>The converted string, or the input string when nothing was converted.</p>
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    // nothing to do without a string and a target encoding
    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $detected = self::str_detect_encoding($str);

    // detection failed
    if (!$detected) {
      return $str;
    }

    // not forced and the string is already in the requested encoding
    if ($force !== true && $detected === $encoding) {
      return $str;
    }

    if ($encoding === 'UTF-8') {
      $useToUtf8 = (
          $force === true
          || $detected === 'UTF-8'
          || $detected === 'WINDOWS-1252'
          || $detected === 'ISO-8859-1'
      );

      if ($useToUtf8) {
        return self::to_utf8($str);
      }
    }

    if ($encoding === 'ISO-8859-1') {
      $useToIso = (
          $force === true
          || $detected === 'ISO-8859-1'
          || $detected === 'UTF-8'
      );

      if ($useToIso) {
        return self::to_iso8859($str);
      }
    }

    // generic conversion for every other combination
    $converted = \mb_convert_encoding($str, $encoding, $detected);

    return $converted ? $converted : $str;
  }
1279
1280
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>
   *                                     Name of the file to read. The value is passed through
   *                                     FILTER_SANITIZE_STRING before it is used.
   *                                     </p>
   * @param int|null      $flags         [optional] <p>
   *                                     Forwarded as the second argument of the native function, e.g.
   *                                     FILE_USE_INCLUDE_PATH to search for the file in the include path.
   *                                     null is forwarded as false.
   *                                     </p>
   * @param resource|null $context       [optional] <p>
   *                                     A valid context resource created with stream_context_create.
   *                                     If it is null and $timeout is non-zero, a context carrying
   *                                     that HTTP timeout is created.
   *                                     </p>
   * @param int|null      $offset        [optional] <p>
   *                                     The offset where the reading starts; null is forwarded as 0.
   *                                     </p>
   * @param int|null      $maxlen        [optional] <p>
   *                                     Maximum length of data read; only forwarded when it is an integer.
   *                                     The default is to read until end of file is reached.
   *                                     </p>
   * @param int           $timeout       <p>The time in seconds for the timeout.</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The function returns the read data or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    // The filter yields false when it cannot handle the input, and an empty
    // path can never be read: report the failure instead of handing an
    // invalid path to the native function.
    if ($filename === false || $filename === '') {
      return false;
    }

    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // Explicit casts: the native parameters are not nullable, so the null
    // defaults must not be forwarded as they are.
    $useIncludePath = (bool)$flags;
    $offset = (int)$offset;

    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1397
1398
  /**
   * Checks if a file starts with BOM (Byte Order Mark) character.
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise
   *              (including the case that the file could not be read).</p>
   */
  public static function file_has_bom($file_path)
  {
    $content = file_get_contents($file_path);

    // An unreadable file has no BOM; do not pass the boolean failure value
    // on to the string check.
    if ($content === false) {
      return false;
    }

    return self::string_has_bom($content);
  }
1409
1410
  /**
   * Normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * Arrays and objects are processed recursively, element by element.
   * Strings get their line endings unified to "\n" and, when they are not
   * pure ASCII, are normalized; every other type is returned untouched.
   *
   * @param mixed  $var
   * @param int    $normalization_form
   * @param string $leading_combining <p>Prepended when the result starts with a non-spacing mark.</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    $type = gettype($var);

    if ($type === 'array') {
      foreach ($var as $key => $value) {
        /** @noinspection AlterInForeachInspection */
        $var[$key] = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type === 'object') {
      foreach ($var as $key => $value) {
        $var->{$key} = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type !== 'string') {
      return $var;
    }

    if (strpos($var, "\r") !== false) {
      // Workaround https://bugs.php.net/65732
      $var = str_replace(array("\r\n", "\r"), "\n", $var);
    }

    // pure ASCII needs no normalization
    if (self::is_ascii($var) !== false) {
      return $var;
    }

    if (\Normalizer::isNormalized($var, $normalization_form)) {
      // marker value: the string was already in the requested form
      $normalized = '-';
    } else {
      $normalized = \Normalizer::normalize($var, $normalization_form);

      // fall back to a re-encoding when the normalizer produced nothing usable
      $var = isset($normalized[0]) ? $normalized : self::encode('UTF-8', $var);
    }

    $needsLeadingChar = (
        $var[0] >= "\x80" && isset($normalized[0], $leading_combining[0])
        &&
        preg_match('/^\p{Mn}/u', $var)
    );

    if ($needsLeadingChar) {
      // Prevent leading combining chars
      // for NFC-safe concatenations.
      $var = $leading_combining . $var;
    }

    return $var;
  }
1469
1470
  /**
   * "filter_input()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * The $option argument is only forwarded to filter_input() when the caller
   * actually supplied it; the result is then passed through UTF8::filter().
   *
   * @param int    $type
   * @param string $var
   * @param int    $filter
   * @param mixed  $option
   *
   * @return mixed
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    $withOption = func_num_args() >= 4;

    $var = $withOption
        ? filter_input($type, $var, $filter, $option)
        : filter_input($type, $var, $filter);

    return self::filter($var);
  }
1490
1491
  /**
   * "filter_input_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * $definition and $add_empty are only forwarded when the caller supplied
   * at least a definition; the result is then passed through UTF8::filter().
   *
   * @param int   $type
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    $withDefinition = func_num_args() >= 2;

    $values = $withDefinition
        ? filter_input_array($type, $definition, $add_empty)
        : filter_input_array($type);

    return self::filter($values);
  }
1510
1511
  /**
   * "filter_var()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * The $option argument is only forwarded to filter_var() when the caller
   * actually supplied it; the result is then passed through UTF8::filter().
   *
   * @param mixed $var
   * @param int   $filter
   * @param mixed $option
   *
   * @return mixed
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    $withOption = func_num_args() >= 3;

    $var = $withOption
        ? filter_var($var, $filter, $option)
        : filter_var($var, $filter);

    return self::filter($var);
  }
1530
1531
  /**
   * "filter_var_array()"-wrapper with normalizes to UTF-8 NFC, converting from WINDOWS-1252 when needed.
   *
   * $definition and $add_empty are only forwarded when the caller supplied
   * at least a definition; the result is then passed through UTF8::filter().
   *
   * @param array $data
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    $withDefinition = func_num_args() >= 2;

    $values = $withDefinition
        ? filter_var_array($data, $definition, $add_empty)
        : filter_var_array($data);

    return self::filter($values);
  }
1550
1551
  /**
   * Check if the number of unicode characters are not more than the specified integer.
   *
   * @param string $str      The original string to be checked.
   * @param int    $box_size The size in number of chars to be checked against string.
   *
   * @return bool true if string is less than or equal to $box_size, false otherwise.
   */
  public static function fits_inside($str, $box_size)
  {
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1563
1564
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The repaired string; an empty string for empty input.</p>
   */
  public static function fix_simple_utf8($str)
  {
    // search / replace lists, built once from self::$brokenUtf8ToUtf8 and kept between calls
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if (null === $search) {
      $search = array_keys(self::$brokenUtf8ToUtf8);
      $replace = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($search, $replace, $str);
  }
1595
1596
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * A string is decoded and re-encoded repeatedly until a pass no longer
   * changes it; an array is processed element by element (recursively).
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed
   */
  public static function fix_utf8($str)
  {
    if (!is_array($str)) {
      $previous = '';

      // repeat until the result is stable
      while ($str !== $previous) {
        $previous = $str;
        $str = self::to_utf8(self::utf8_decode($str));
      }

      return $str;
    }

    foreach ($str as $key => $value) {
      /** @noinspection AlterInForeachInspection */
      /** @noinspection OffsetOperationsInspection */
      $str[$key] = self::fix_utf8($value);
    }

    return $str;
  }
1624
1625
  /**
1626
   * Get the text direction ('RTL' or 'LTR') of a specific character.
1627
   *
1628
   * @param string $char
1629
   *
1630
   * @return string <p>'RTL' or 'LTR'</p>
1631
   */
1632 1
  public static function getCharDirection($char)
  {
    // Make sure the feature-detection table has been filled.
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // Preferred path: ask "IntlChar" and map its numeric direction codes.
    if (self::$support['intlChar'] === true) {
      $icuDirection = \IntlChar::charDirection($char);

      // direction codes from the "IntlChar"-Class
      if (in_array($icuDirection, array(0, 11, 12, 20), true)) {
        return 'LTR';
      }

      if (in_array($icuDirection, array(1, 13, 14, 15, 21), true)) {
        return 'RTL';
      }

      // every other code falls through to the code point tables below
    }

    $c = self::chr_to_decimal($char);

    // Everything outside of 0x5be..0x10b7f is treated as left-to-right.
    if (!(0x5be <= $c && 0x10b7f >= $c)) {
      return 'LTR';
    }

    if (0x85e >= $c) {

      // lower block: 0x5be..0x85e
      $rtlPoints = array(
          0x5be, 0x5c0, 0x5c3, 0x5c6,
          0x608, 0x60b, 0x60d, 0x61b,
          0x710, 0x7b1, 0x7fa,
          0x81a, 0x824, 0x828, 0x85e,
      );
      $rtlRanges = array(
          array(0x5d0, 0x5ea),
          array(0x5f0, 0x5f4),
          array(0x61e, 0x64a),
          array(0x66d, 0x66f),
          array(0x671, 0x6d5),
          array(0x6e5, 0x6e6),
          array(0x6ee, 0x6ef),
          array(0x6fa, 0x70d),
          array(0x712, 0x72f),
          array(0x74d, 0x7a5),
          array(0x7c0, 0x7ea),
          array(0x7f4, 0x7f5),
          array(0x800, 0x815),
          array(0x830, 0x83e),
          array(0x840, 0x858),
      );

    } elseif (0x200f === $c) {

      return 'RTL';

    } elseif (0xfb1d <= $c) {

      // upper block: 0xfb1d..0x10b7f
      $rtlPoints = array(
          0xfb1d, 0xfb3e,
          0x10808, 0x1083c, 0x1093f, 0x10a00,
      );
      $rtlRanges = array(
          array(0xfb1f, 0xfb28),
          array(0xfb2a, 0xfb36),
          array(0xfb38, 0xfb3c),
          array(0xfb40, 0xfb41),
          array(0xfb43, 0xfb44),
          array(0xfb46, 0xfbc1),
          array(0xfbd3, 0xfd3d),
          array(0xfd50, 0xfd8f),
          array(0xfd92, 0xfdc7),
          array(0xfdf0, 0xfdfc),
          array(0xfe70, 0xfe74),
          array(0xfe76, 0xfefc),
          array(0x10800, 0x10805),
          array(0x1080a, 0x10835),
          array(0x10837, 0x10838),
          array(0x1083f, 0x10855),
          array(0x10857, 0x1085f),
          array(0x10900, 0x1091b),
          array(0x10920, 0x10939),
          array(0x10a10, 0x10a13),
          array(0x10a15, 0x10a17),
          array(0x10a19, 0x10a33),
          array(0x10a40, 0x10a47),
          array(0x10a50, 0x10a58),
          array(0x10a60, 0x10a7f),
          array(0x10b00, 0x10b35),
          array(0x10b40, 0x10b55),
          array(0x10b58, 0x10b72),
          array(0x10b78, 0x10b7f),
      );

    } else {

      // between the two blocks (and not 0x200f)
      return 'LTR';

    }

    // single code points are matched strictly, ranges inclusively
    if (in_array($c, $rtlPoints, true)) {
      return 'RTL';
    }

    foreach ($rtlRanges as $range) {
      if ($range[0] <= $c && $range[1] >= $c) {
        return 'RTL';
      }
    }

    return 'LTR';
  }
1744
1745
  /**
1746
   * Get data from "/data/*.php".
1747
   *
1748
   * @param string $file
1749
   *
1750
   * @return bool|string|array|int <p>Will return false on error.</p>
1751
   */
1752 1
  private static function getData($file)
  {
    // Resolve the name to the PHP data file shipped next to this class.
    $file = __DIR__ . '/data/' . $file . '.php';

    // The data file returns its payload; a missing file is reported as false.
    /** @noinspection PhpIncludeInspection */
    return file_exists($file) ? require $file : false;
  }
1762
1763
  /**
1764
   * alias for "UTF8::string_has_bom()"
1765
   *
1766
   * @see UTF8::string_has_bom()
1767
   *
1768
   * @param string $str
1769
   *
1770
   * @return bool
1771
   */
1772 1
  public static function hasBom($str)
  {
    // camelCase alias: the check itself lives in string_has_bom().
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
1776
1777
  /**
1778
   * Converts hexadecimal U+xxxx code point representation to integer.
1779
   *
1780
   * INFO: opposite to UTF8::int_to_hex()
1781
   *
1782
   * @param string $str <p>The hexadecimal code point representation.</p>
1783
   *
1784
   * @return int|false <p>The code point, or false on failure.</p>
1785
   */
1786 1
  public static function hex_to_int($str)
  {
    // Empty (or otherwise falsy) input can never be a code point.
    if (!$str) {
      return false;
    }

    // Optional "\u" or "U+" prefix followed by 4 to 6 hexadecimal digits.
    // Only [0-9a-f] is accepted: letters beyond "f" are no hex digits and
    // intval(..., 16) would silently turn them into 0 or a truncated value
    // instead of signalling the failure.
    if (preg_match('/^(?:\\\u|U\+|)([0-9a-f]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
1798
1799
  /**
1800
   * alias for "UTF8::html_entity_decode()"
1801
   *
1802
   * @see UTF8::html_entity_decode()
1803
   *
1804
   * @param string $str
1805
   * @param int    $flags
1806
   * @param string $encoding
1807
   *
1808
   * @return string
1809
   */
1810 1
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    // Shorter alias: all arguments are handed on to html_entity_decode().
    $decoded = self::html_entity_decode($str, $flags, $encoding);

    return $decoded;
  }
1814
1815
  /**
1816
   * Converts a UTF-8 string to a series of HTML numbered entities.
1817
   *
1818
   * INFO: opposite to UTF8::html_decode()
1819
   *
1820
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
1821
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
1822
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
1823
   *
1824
   * @return string <p>HTML numbered entities.</p>
1825
   */
1826 2
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // Starting at 0x80 leaves the 7-bit ASCII range untouched.
      $startCode = ($keepAsciiChars === true) ? 0x80 : 0x00;

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = (first code point, last code point, offset, bitmask).
      // The range has to reach U+10FFFF, otherwise characters outside of the
      // Basic Multilingual Plane (e.g. emoji) are passed through unencoded.
      // The mask keeps 21 bits so those code points are not truncated.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // Fallback without mbstring: encode the string character by character.
    return implode(
        '',
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1856
1857
  /**
1858
   * UTF-8 version of html_entity_decode()
1859
   *
1860
   * The reason we are not using html_entity_decode() by itself is because
1861
   * while it is not technically correct to leave out the semicolon
1862
   * at the end of an entity most browsers will still interpret the entity
1863
   * correctly. html_entity_decode() does not convert entities without
1864
   * semicolons, so we are left with our own little solution here. Bummer.
1865
   *
1866
   * Convert all HTML entities to their applicable characters
1867
   *
1868
   * INFO: opposite to UTF8::html_encode()
1869
   *
1870
   * @link http://php.net/manual/en/function.html-entity-decode.php
1871
   *
1872
   * @param string $str      <p>
1873
   *                         The input string.
1874
   *                         </p>
1875
   * @param int    $flags    [optional] <p>
1876
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1877
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1878
   *                         <table>
1879
   *                         Available <i>flags</i> constants
1880
   *                         <tr valign="top">
1881
   *                         <td>Constant Name</td>
1882
   *                         <td>Description</td>
1883
   *                         </tr>
1884
   *                         <tr valign="top">
1885
   *                         <td><b>ENT_COMPAT</b></td>
1886
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1887
   *                         </tr>
1888
   *                         <tr valign="top">
1889
   *                         <td><b>ENT_QUOTES</b></td>
1890
   *                         <td>Will convert both double and single quotes.</td>
1891
   *                         </tr>
1892
   *                         <tr valign="top">
1893
   *                         <td><b>ENT_NOQUOTES</b></td>
1894
   *                         <td>Will leave both double and single quotes unconverted.</td>
1895
   *                         </tr>
1896
   *                         <tr valign="top">
1897
   *                         <td><b>ENT_HTML401</b></td>
1898
   *                         <td>
1899
   *                         Handle code as HTML 4.01.
1900
   *                         </td>
1901
   *                         </tr>
1902
   *                         <tr valign="top">
1903
   *                         <td><b>ENT_XML1</b></td>
1904
   *                         <td>
1905
   *                         Handle code as XML 1.
1906
   *                         </td>
1907
   *                         </tr>
1908
   *                         <tr valign="top">
1909
   *                         <td><b>ENT_XHTML</b></td>
1910
   *                         <td>
1911
   *                         Handle code as XHTML.
1912
   *                         </td>
1913
   *                         </tr>
1914
   *                         <tr valign="top">
1915
   *                         <td><b>ENT_HTML5</b></td>
1916
   *                         <td>
1917
   *                         Handle code as HTML 5.
1918
   *                         </td>
1919
   *                         </tr>
1920
   *                         </table>
1921
   *                         </p>
1922
   * @param string $encoding [optional] <p>Encoding to use.</p>
1923
   *
1924
   * @return string <p>The decoded string.</p>
1925
   */
1926 9
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // Strings shorter than four bytes are returned untouched,
    // for example "&;" or "&x;".
    if (!isset($str[3])) {
      return $str;
    }

    // Cheap pre-check: decoding is only attempted when there is an ampersand
    // together with either a numeric entity prefix or a semicolon.
    $hasAmpersand = strpos($str, '&') !== false;
    $hasNumericPrefix = strpos($str, '&#') !== false;
    $hasSemicolon = strpos($str, ';') !== false;

    if (!($hasAmpersand && ($hasNumericPrefix || $hasSemicolon))) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // ENT_HTML5 is only referenced when Bootup reports PHP >= 5.4.
    if ($flags === null) {
      $flags = (Bootup::is_php('5.4') === true) ? (ENT_COMPAT | ENT_HTML5) : ENT_COMPAT;
    }

    // Repeat until a full pass changes nothing, so that nested
    // (multiple times encoded) entities are resolved as well.
    do {
      $previous = $str;

      // Step 1: decimal entities with a trailing semicolon. Quotes are kept
      // as entities here, so that $flags decides about them in step 2.
      $str = preg_replace_callback(
          "/&#\d{2,5};/",
          function ($hit) use ($encoding) {
            $decoded = \mb_convert_encoding($hit[0], $encoding, 'HTML-ENTITIES');

            return ($decoded === '"' || $decoded === "'") ? $hit[0] : $decoded;
          },
          $str
      );

      // Step 2: append the missing semicolon to numeric entities, then
      // decode numeric & UTF16 two byte entities via the native function.
      $withSemicolons = preg_replace(
          '/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS',
          '$1;',
          $str
      );
      $str = html_entity_decode($withSemicolons, $flags, $encoding);

    } while ($previous !== $str);

    return $str;
  }
1990
1991
  /**
1992
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1993
   *
1994
   * @link http://php.net/manual/en/function.htmlentities.php
1995
   *
1996
   * @param string $str           <p>
1997
   *                              The input string.
1998
   *                              </p>
1999
   * @param int    $flags         [optional] <p>
2000
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2001
   *                              invalid code unit sequences and the used document type. The default is
2002
   *                              ENT_COMPAT | ENT_HTML401.
2003
   *                              <table>
2004
   *                              Available <i>flags</i> constants
2005
   *                              <tr valign="top">
2006
   *                              <td>Constant Name</td>
2007
   *                              <td>Description</td>
2008
   *                              </tr>
2009
   *                              <tr valign="top">
2010
   *                              <td><b>ENT_COMPAT</b></td>
2011
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2012
   *                              </tr>
2013
   *                              <tr valign="top">
2014
   *                              <td><b>ENT_QUOTES</b></td>
2015
   *                              <td>Will convert both double and single quotes.</td>
2016
   *                              </tr>
2017
   *                              <tr valign="top">
2018
   *                              <td><b>ENT_NOQUOTES</b></td>
2019
   *                              <td>Will leave both double and single quotes unconverted.</td>
2020
   *                              </tr>
2021
   *                              <tr valign="top">
2022
   *                              <td><b>ENT_IGNORE</b></td>
2023
   *                              <td>
2024
   *                              Silently discard invalid code unit sequences instead of returning
2025
   *                              an empty string. Using this flag is discouraged as it
2026
   *                              may have security implications.
2027
   *                              </td>
2028
   *                              </tr>
2029
   *                              <tr valign="top">
2030
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2031
   *                              <td>
2032
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2033
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2034
   *                              </td>
2035
   *                              </tr>
2036
   *                              <tr valign="top">
2037
   *                              <td><b>ENT_DISALLOWED</b></td>
2038
   *                              <td>
2039
   *                              Replace invalid code points for the given document type with a
2040
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2041
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2042
   *                              instance, to ensure the well-formedness of XML documents with
2043
   *                              embedded external content.
2044
   *                              </td>
2045
   *                              </tr>
2046
   *                              <tr valign="top">
2047
   *                              <td><b>ENT_HTML401</b></td>
2048
   *                              <td>
2049
   *                              Handle code as HTML 4.01.
2050
   *                              </td>
2051
   *                              </tr>
2052
   *                              <tr valign="top">
2053
   *                              <td><b>ENT_XML1</b></td>
2054
   *                              <td>
2055
   *                              Handle code as XML 1.
2056
   *                              </td>
2057
   *                              </tr>
2058
   *                              <tr valign="top">
2059
   *                              <td><b>ENT_XHTML</b></td>
2060
   *                              <td>
2061
   *                              Handle code as XHTML.
2062
   *                              </td>
2063
   *                              </tr>
2064
   *                              <tr valign="top">
2065
   *                              <td><b>ENT_HTML5</b></td>
2066
   *                              <td>
2067
   *                              Handle code as HTML 5.
2068
   *                              </td>
2069
   *                              </tr>
2070
   *                              </table>
2071
   *                              </p>
2072
   * @param string $encoding      [optional] <p>
2073
   *                              Like <b>htmlspecialchars</b>,
2074
   *                              <b>htmlentities</b> takes an optional third argument
2075
   *                              <i>encoding</i> which defines encoding used in
2076
   *                              conversion.
2077
   *                              Although this argument is technically optional, you are highly
2078
   *                              encouraged to specify the correct value for your code.
2079
   *                              </p>
2080
   * @param bool   $double_encode [optional] <p>
2081
   *                              When <i>double_encode</i> is turned off PHP will not
2082
   *                              encode existing html entities. The default is to convert everything.
2083
   *                              </p>
2084
   *
2085
   *
2086
   * @return string the encoded string.
2087
   * </p>
2088
   * <p>
2089
   * If the input <i>string</i> contains an invalid code unit
2090
   * sequence within the given <i>encoding</i> an empty string
2091
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2092
   * <b>ENT_SUBSTITUTE</b> flags are set.
2093
   */
2094 2
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // For every other encoding the native result is returned as it is.
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // For UTF-8, every remaining character of three or more bytes is
    // additionally replaced by its numbered entity.
    $search = array();
    $replacements = array();
    foreach (self::chr_size_list($str) as $counter => $byteLength) {
      if ($byteLength < 3) {
        continue;
      }

      $char = self::access($str, $counter);

      // Static analysis reports that access() can return false. Skip that
      // case instead of using false as an array key and passing it on to
      // html_encode(). Already collected characters are skipped as well.
      if ($char === false || isset($replacements[$char])) {
        continue;
      }

      $search[$char] = $char;
      $replacements[$char] = self::html_encode($char);
    }

    return str_replace($search, $replacements, $str);
  }
2122
2123
  /**
2124
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2125
   *
2126
   * INFO: Take a look at "UTF8::htmlentities()"
2127
   *
2128
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2129
   *
2130
   * @param string $str           <p>
2131
   *                              The string being converted.
2132
   *                              </p>
2133
   * @param int    $flags         [optional] <p>
2134
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2135
   *                              invalid code unit sequences and the used document type. The default is
2136
   *                              ENT_COMPAT | ENT_HTML401.
2137
   *                              <table>
2138
   *                              Available <i>flags</i> constants
2139
   *                              <tr valign="top">
2140
   *                              <td>Constant Name</td>
2141
   *                              <td>Description</td>
2142
   *                              </tr>
2143
   *                              <tr valign="top">
2144
   *                              <td><b>ENT_COMPAT</b></td>
2145
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2146
   *                              </tr>
2147
   *                              <tr valign="top">
2148
   *                              <td><b>ENT_QUOTES</b></td>
2149
   *                              <td>Will convert both double and single quotes.</td>
2150
   *                              </tr>
2151
   *                              <tr valign="top">
2152
   *                              <td><b>ENT_NOQUOTES</b></td>
2153
   *                              <td>Will leave both double and single quotes unconverted.</td>
2154
   *                              </tr>
2155
   *                              <tr valign="top">
2156
   *                              <td><b>ENT_IGNORE</b></td>
2157
   *                              <td>
2158
   *                              Silently discard invalid code unit sequences instead of returning
2159
   *                              an empty string. Using this flag is discouraged as it
2160
   *                              may have security implications.
2161
   *                              </td>
2162
   *                              </tr>
2163
   *                              <tr valign="top">
2164
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2165
   *                              <td>
2166
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2167
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2168
   *                              </td>
2169
   *                              </tr>
2170
   *                              <tr valign="top">
2171
   *                              <td><b>ENT_DISALLOWED</b></td>
2172
   *                              <td>
2173
   *                              Replace invalid code points for the given document type with a
2174
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2175
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2176
   *                              instance, to ensure the well-formedness of XML documents with
2177
   *                              embedded external content.
2178
   *                              </td>
2179
   *                              </tr>
2180
   *                              <tr valign="top">
2181
   *                              <td><b>ENT_HTML401</b></td>
2182
   *                              <td>
2183
   *                              Handle code as HTML 4.01.
2184
   *                              </td>
2185
   *                              </tr>
2186
   *                              <tr valign="top">
2187
   *                              <td><b>ENT_XML1</b></td>
2188
   *                              <td>
2189
   *                              Handle code as XML 1.
2190
   *                              </td>
2191
   *                              </tr>
2192
   *                              <tr valign="top">
2193
   *                              <td><b>ENT_XHTML</b></td>
2194
   *                              <td>
2195
   *                              Handle code as XHTML.
2196
   *                              </td>
2197
   *                              </tr>
2198
   *                              <tr valign="top">
2199
   *                              <td><b>ENT_HTML5</b></td>
2200
   *                              <td>
2201
   *                              Handle code as HTML 5.
2202
   *                              </td>
2203
   *                              </tr>
2204
   *                              </table>
2205
   *                              </p>
2206
   * @param string $encoding      [optional] <p>
2207
   *                              Defines encoding used in conversion.
2208
   *                              </p>
2209
   *                              <p>
2210
   *                              For the purposes of this function, the encodings
2211
   *                              ISO-8859-1, ISO-8859-15,
2212
   *                              UTF-8, cp866,
2213
   *                              cp1251, cp1252, and
2214
   *                              KOI8-R are effectively equivalent, provided the
2215
   *                              <i>string</i> itself is valid for the encoding, as
2216
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2217
   *                              the same positions in all of these encodings.
2218
   *                              </p>
2219
   * @param bool   $double_encode [optional] <p>
2220
   *                              When <i>double_encode</i> is turned off PHP will not
2221
   *                              encode existing html entities, the default is to convert everything.
2222
   *                              </p>
2223
   *
2224
   * @return string The converted string.
2225
   * </p>
2226
   * <p>
2227
   * If the input <i>string</i> contains an invalid code unit
2228
   * sequence within the given <i>encoding</i> an empty string
2229
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2230
   * <b>ENT_SUBSTITUTE</b> flags are set.
2231
   */
2232 1
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // "UTF-8" is passed on as it is, anything else is normalized first.
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return htmlspecialchars($str, $flags, $charset, $double_encode);
  }
2240
2241
  /**
2242
   * Checks whether iconv is available on the server.
2243
   *
2244
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2245
   */
2246 1
  public static function iconv_loaded()
  {
    // extension_loaded() reports whether the "iconv" extension is loaded.
    return (bool)extension_loaded('iconv');
  }
2250
2251
  /**
2252
   * Converts Integer to hexadecimal U+xxxx code point representation.
2253
   *
2254
   * INFO: opposite to UTF8::hex_to_int()
2255
   *
2256
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
2257
   * @param string $pfix [optional]
2258
   *
2259
   * @return string <p>The code point, or empty string on failure.</p>
2260
   */
2261 3
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // Only values consisting purely of digits can be converted.
    if (!ctype_digit((string)$int)) {
      return '';
    }

    // Left-pad with zeros to at least four hex digits, e.g. 167 => "00a7".
    return $pfix . str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);
  }
2273
2274
  /**
2275
   * Checks whether intl-char is available on the server.
2276
   *
2277
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2278
   */
2279 1
  public static function intlChar_loaded()
  {
    // The class lookup is only attempted when Bootup reports PHP >= 7.0.
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar') === true;
  }
2283
2284
  /**
2285
   * Checks whether intl is available on the server.
2286
   *
2287
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
2288
   */
2289 2
  public static function intl_loaded()
  {
    // extension_loaded() reports whether the "intl" extension is loaded.
    return (bool)extension_loaded('intl');
  }
2293
2294
  /**
2295
   * alias for "UTF8::is_ascii()"
2296
   *
2297
   * @see UTF8::is_ascii()
2298
   *
2299
   * @param string $str
2300
   *
2301
   * @return boolean
2302
   */
2303 2
  public static function isAscii($str)
  {
    // camelCase alias: the check itself lives in is_ascii().
    $isAscii = self::is_ascii($str);

    return $isAscii;
  }
2307
2308
  /**
2309
   * alias for "UTF8::is_base64()"
2310
   *
2311
   * @see UTF8::is_base64()
2312
   *
2313
   * @param string $str
2314
   *
2315
   * @return bool
2316
   */
2317 1
  public static function isBase64($str)
  {
    // camelCase alias: the check itself lives in is_base64().
    $isBase64 = self::is_base64($str);

    return $isBase64;
  }
2321
2322
  /**
2323
   * alias for "UTF8::is_binary()"
2324
   *
2325
   * @see UTF8::is_binary()
2326
   *
2327
   * @param string $str
2328
   *
2329
   * @return bool
2330
   */
2331
  public static function isBinary($str)
  {
    // camelCase alias: the check itself lives in is_binary().
    $isBinary = self::is_binary($str);

    return $isBinary;
  }
2335
2336
  /**
2337
   * alias for "UTF8::is_bom()"
2338
   *
2339
   * @see UTF8::is_bom()
2340
   *
2341
   * @param string $utf8_chr
2342
   *
2343
   * @return boolean
2344
   */
2345
  public static function isBom($utf8_chr)
  {
    // camelCase alias: the check itself lives in is_bom().
    $isBom = self::is_bom($utf8_chr);

    return $isBom;
  }
2349
2350
  /**
2351
   * alias for "UTF8::is_html()"
2352
   *
2353
   * @see UTF8::is_html()
2354
   *
2355
   * @param string $str
2356
   *
2357
   * @return boolean
2358
   */
2359 1
  public static function isHtml($str)
  {
    // camelCase alias: the check itself lives in is_html().
    $isHtml = self::is_html($str);

    return $isHtml;
  }
2363
2364
  /**
   * Alias of "UTF8::is_json()".
   *
   * @see UTF8::is_json()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p>Whatever UTF8::is_json() returns for $str.</p>
   */
  public static function isJson($str)
  {
    $result = self::is_json($str);

    return $result;
  }
2377
2378
  /**
   * Alias of "UTF8::is_utf16()".
   *
   * @see UTF8::is_utf16()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>false if it is not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.</p>
   */
  public static function isUtf16($str)
  {
    $result = self::is_utf16($str);

    return $result;
  }
2391
2392
  /**
   * Alias of "UTF8::is_utf32()".
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>false if it is not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.</p>
   */
  public static function isUtf32($str)
  {
    $result = self::is_utf32($str);

    return $result;
  }
2405
2406
  /**
   * Alias of "UTF8::is_utf8()".
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Forwarded unchanged to UTF8::is_utf8().</p>
   *
   * @return bool <p>Whatever UTF8::is_utf8() returns for the given arguments.</p>
   */
  public static function isUtf8($str, $strict = false)
  {
    $result = self::is_utf8($str, $strict);

    return $result;
  }
2420
2421
  /**
   * Tells whether a string consists of 7 bit ASCII bytes only.
   *
   * @param string $str <p>The string to check (cast to string first).</p>
   *
   * @return bool <p>
   *              <strong>true</strong> if the string is empty or contains no byte in the range 0x80-0xFF<br />
   *              <strong>false</strong> otherwise
   *              </p>
   */
  public static function is_ascii($str)
  {
    $text = (string)$str;

    // An empty string trivially contains no high byte.
    if ($text === '') {
      return true;
    }

    // Any byte with the high bit set disqualifies the string.
    $hasHighByte = preg_match('/[\x80-\xFF]/', $text);

    return !$hasHighByte;
  }
2441
2442
  /**
   * Tells whether the string is base64 encoded.
   *
   * The check is a round trip: the input must strictly decode and re-encode to exactly itself.
   *
   * @param string $str <p>The input string (cast to string first).</p>
   *
   * @return bool <p>Whether or not $str is base64 encoded; an empty string is never base64.</p>
   */
  public static function is_base64($str)
  {
    $candidate = (string)$str;

    if ($candidate === '') {
      return false;
    }

    // Strict mode rejects characters outside the base64 alphabet.
    $decoded = base64_decode($candidate, true);
    if ($decoded === false) {
      return false;
    }

    return base64_encode($decoded) === $candidate;
  }
2463
2464
  /**
   * Check if the input is binary... (heuristic, looks like a hack).
   *
   * The input counts as binary when it consists solely of the characters "0" and "1"
   * or when it contains at least one NUL byte.
   *
   * @param mixed $input <p>Value to inspect; it is cast to string first.</p>
   *
   * @return bool <p><strong>true</strong> if the input looks binary, <strong>false</strong> otherwise.</p>
   */
  public static function is_binary($input)
  {
    $input = (string)$input;

    // A pure bit-string such as "0101" is treated as binary data.
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    // NOTE(review): a third check, substr_count($input, '^ -~') / strlen($input) > 0.3, used to follow.
    // It counted the literal four-byte needle "^ -~", so the ratio could never exceed 0.25 and the
    // check never fired; it was removed as dead code. It was presumably meant to measure the share of
    // non-printable bytes, but enabling that would also flag multi-byte UTF-8 text -- confirm before adding.
    return substr_count($input, "\x00") > 0;
  }
2488
2489
  /**
   * Check if the file is binary.
   *
   * Reads at most the first 512 bytes of the file and passes them to UTF8::is_binary().
   * If the file cannot be opened or read, an empty string is inspected instead.
   *
   * @param string $file <p>Path handed to fopen().</p>
   *
   * @return bool <p>Result of UTF8::is_binary() for the leading block of the file.</p>
   */
  public static function is_binary_file($file)
  {
    $block = '';

    try {
      // fopen() signals failure by returning false (plus a warning), not by throwing,
      // so the handle must be checked before it is read from or closed.
      $fp = fopen($file, 'rb');
      if ($fp !== false) {
        $chunk = fread($fp, 512);
        fclose($fp);

        if ($chunk !== false) {
          $block = $chunk;
        }
      }
    } catch (\Exception $e) {
      // Best effort: e.g. an error handler that converts warnings into exceptions.
      $block = '';
    }

    return self::is_binary($block);
  }
2508
2509
  /**
   * Checks if the given string is exactly equal to one of the known "Byte Order Mark" strings.
   *
   * WARNING: Use "UTF8::string_has_bom()" if you want to check for a BOM inside a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is identical to a key of self::$bom, <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    // The BOM byte strings are the keys of self::$bom; the values are not needed here.
    $knownBoms = array_keys(self::$bom);

    return in_array($str, $knownBoms, true);
  }
2528
2529
  /**
   * Check if the string contains any html-tags, e.g. "<lall>".
   *
   * @param string $str <p>The input string (cast to string first).</p>
   *
   * @return bool <p><strong>true</strong> if a tag-like pattern is found, <strong>false</strong> otherwise (also for an empty string).</p>
   */
  public static function is_html($str)
  {
    $html = (string)$str;

    if ($html === '') {
      return false;
    }

    // Opening, closing or self-closing tag, optionally carrying attributes.
    $found = preg_match("/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/", $html);

    return $found === 1;
  }
2555
2556
  /**
   * Try to check if "$str" is a json-string.
   *
   * The string is accepted when it decodes without error to a JSON object or a JSON array.
   * Scalars such as "1" or "true" are not accepted.
   *
   * @param string $str <p>The input string (cast to string first).</p>
   *
   * @return bool <p><strong>true</strong> if $str decodes to an object or array, <strong>false</strong> otherwise.</p>
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return false;
    }

    $decoded = self::json_decode($str);

    // A top-level JSON array (e.g. "[1,2]") decodes to a PHP array, not an object,
    // so both container types have to be accepted.
    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
2581
2582
  /**
   * Check if the string is UTF-16.
   *
   * After removing a BOM, the string is only examined further if UTF8::is_binary() accepts it.
   * Each byte order is then scored via a conversion round trip; the higher score wins.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-16 (or both byte orders score equally),<br />
   *                   <strong>1</strong> for UTF-16LE,<br />
   *                   <strong>2</strong> for UTF-16BE.
   *                   </p>
   */
  public static function is_utf16($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // One score per byte order, evaluated in the order LE, BE.
    $hits = array('UTF-16LE' => 0, 'UTF-16BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // Convert back and forth once more; only a stable round trip is scored.
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    if ($hits['UTF-16LE'] === $hits['UTF-16BE']) {
      return false;
    }

    return $hits['UTF-16LE'] > $hits['UTF-16BE'] ? 1 : 2;
  }
2641
2642
  /**
   * Check if the string is UTF-32.
   *
   * After removing a BOM, the string is only examined further if UTF8::is_binary() accepts it.
   * Each byte order is then scored via a conversion round trip; the higher score wins.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p>
   *                   <strong>false</strong> if it is not UTF-32 (or both byte orders score equally),<br />
   *                   <strong>1</strong> for UTF-32LE,<br />
   *                   <strong>2</strong> for UTF-32BE.
   *                   </p>
   */
  public static function is_utf32($str)
  {
    $str = self::remove_bom($str);

    if (!self::is_binary($str)) {
      return false;
    }

    // One score per byte order, evaluated in the order LE, BE.
    $hits = array('UTF-32LE' => 0, 'UTF-32BE' => 0);

    foreach (array_keys($hits) as $encoding) {
      $asUtf8 = \mb_convert_encoding($str, 'UTF-8', $encoding);
      if (!$asUtf8) {
        continue;
      }

      // Convert back and forth once more; only a stable round trip is scored.
      $backToSource = \mb_convert_encoding($asUtf8, $encoding, 'UTF-8');
      $roundTrip = \mb_convert_encoding($backToSource, 'UTF-8', $encoding);
      if ($roundTrip !== $asUtf8) {
        continue;
      }

      $strChars = self::count_chars($str, true);
      foreach (self::count_chars($roundTrip, true) as $char => $unused) {
        if (in_array($char, $strChars, true) === true) {
          $hits[$encoding]++;
        }
      }
    }

    if ($hits['UTF-32LE'] === $hits['UTF-32BE']) {
      return false;
    }

    return $hits['UTF-32LE'] > $hits['UTF-32BE'] ? 1 : 2;
  }
2701
2702
  /**
   * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
   *
   * @see    http://hsivonen.iki.fi/php-utf8/
   *
   * @param string $str    <p>The string to be checked (cast to string first).</p>
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
   *
   * @return bool <p><strong>true</strong> for an empty string or valid UTF-8, <strong>false</strong> otherwise.</p>
   */
  public static function is_utf8($str, $strict = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return true;
    }

    if ($strict === true) {
      if (self::is_utf16($str) !== false) {
        return false;
      }

      if (self::is_utf32($str) !== false) {
        return false;
      }
    }

    // NOTE(review): assumes pcre_utf8_support() returns true when PCRE can handle the /u modifier -- confirm.
    // The shortcut used to be taken only when that support was reported as missing, which is the one case
    // where the /u pattern cannot work.
    if (self::pcre_utf8_support() === true) {

      // If even just the first character can be matched, when the /u
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
      // invalid, nothing at all will match, even if the string contains
      // some valid sequences
      return (preg_match('/^.{1}/us', $str) === 1);
    }

    // Fallback: validate byte by byte.
    $mState = 0; // cached expected number of octets after the current octet
    // until the beginning of the next UTF8 character sequence
    $mUcs4 = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence
    $len = strlen($str);

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $len; $i++) {
      $in = ord($str[$i]);
      if ($mState === 0) {
        // When mState is zero we expect either a US-ASCII character or a
        // multi-octet sequence.
        if (0 === (0x80 & $in)) {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        } elseif (0xC0 === (0xE0 & $in)) {
          // First octet of 2 octet sequence.
          $mUcs4 = $in;
          $mUcs4 = ($mUcs4 & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        } elseif (0xE0 === (0xF0 & $in)) {
          // First octet of 3 octet sequence.
          $mUcs4 = $in;
          $mUcs4 = ($mUcs4 & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        } elseif (0xF0 === (0xF8 & $in)) {
          // First octet of 4 octet sequence.
          $mUcs4 = $in;
          $mUcs4 = ($mUcs4 & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        } elseif (0xF8 === (0xFC & $in)) {
          /* First octet of 5 octet sequence.
          *
          * This is illegal because the encoded codepoint must be either
          * (a) not the shortest form or
          * (b) outside the Unicode range of 0-0x10FFFF.
          * Rather than trying to resynchronize, we will carry on until the end
          * of the sequence and let the later error handling code catch it.
          */
          $mUcs4 = $in;
          $mUcs4 = ($mUcs4 & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        } elseif (0xFC === (0xFE & $in)) {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = $in;
          $mUcs4 = ($mUcs4 & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        } else {
          /* Current octet is neither in the US-ASCII range nor a legal first
           * octet of a multi-octet sequence.
           */
          return false;
        }
      } else {
        // When mState is non-zero, we expect a continuation of the multi-octet
        // sequence
        if (0x80 === (0xC0 & $in)) {
          // Legal continuation.
          $shift = ($mState - 1) * 6;
          $tmp = $in;
          $tmp = ($tmp & 0x0000003F) << $shift;
          $mUcs4 |= $tmp;
          /**
           * End of the multi-octet sequence. mUcs4 now contains the final
           * Unicode code point to be output
           */
          if (0 === --$mState) {
            /*
            * Check for illegal sequences and code points.
            */
            // From Unicode 3.1, non-shortest form is illegal
            if (
                (2 === $mBytes && $mUcs4 < 0x0080) ||
                (3 === $mBytes && $mUcs4 < 0x0800) ||
                (4 === $mBytes && $mUcs4 < 0x10000) ||
                (4 < $mBytes) ||
                // From Unicode 3.2, surrogate characters are illegal.
                (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                // Code points outside the Unicode range are illegal.
                ($mUcs4 > 0x10FFFF)
            ) {
              return false;
            }
            // initialize UTF8 cache
            $mState = 0;
            $mUcs4 = 0;
            $mBytes = 1;
          }
        } else {
          /**
           *((0xC0 & (*in) != 0x80) && (mState != 0))
           * Incomplete multi-octet sequence.
           */
          return false;
        }
      }
    }

    // A sequence that is still open when the input ends is a truncated character
    // (e.g. a lone "\xC3") and therefore not valid UTF-8.
    return $mState === 0;
  }
2846
2847
  /**
   * Decodes a JSON string after passing it through UTF8::filter().
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>The <i>json</i> string being decoded; it is run through UTF8::filter() first.</p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>User specified recursion depth.</p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options; only forwarded to the native function
   *                        when Bootup::is_php('5.4') is true.
   *                        </p>
   *
   * @return mixed <p>
   *               The value encoded in <i>json</i> in the appropriate PHP type, exactly as returned by the
   *               native json_decode(); <b>NULL</b> if the <i>json</i> cannot be decoded or if the encoded
   *               data is deeper than the recursion limit.
   *               </p>
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    // Without the version check passing, call the three-argument form and drop $options.
    if (Bootup::is_php('5.4') !== true) {
      return json_decode($filtered, $assoc, $depth);
    }

    return json_decode($filtered, $assoc, $depth, $options);
  }
2895
2896
  /**
   * Returns the JSON representation of a value after passing it through UTF8::filter().
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The <i>value</i> being encoded; it is run through UTF8::filter() first.
   *                       Can be any type except a resource.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask of JSON encode constants (<b>JSON_HEX_QUOT</b>, <b>JSON_HEX_TAG</b>,
   *                       <b>JSON_HEX_AMP</b>, <b>JSON_HEX_APOS</b>, <b>JSON_NUMERIC_CHECK</b>,
   *                       <b>JSON_PRETTY_PRINT</b>, <b>JSON_UNESCAPED_SLASHES</b>, <b>JSON_FORCE_OBJECT</b>,
   *                       <b>JSON_UNESCAPED_UNICODE</b>), as described on the JSON constants page.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Maximum depth, must be greater than zero; only forwarded to the native function
   *                       when Bootup::is_php('5.5') is truthy.
   *                       </p>
   *
   * @return string|false <p>A JSON encoded string on success or <b>FALSE</b> on failure.</p>
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    // Without the version check passing, call the two-argument form and drop $depth.
    if (!Bootup::is_php('5.5')) {
      return json_encode($filtered, $options);
    }

    return json_encode($filtered, $options, $depth);
  }
2944
2945
  /**
   * Makes string's first char lowercase.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function lcfirst($str)
  {
    // UTF8::substr() can also return false; normalise to string so that
    // UTF8::strtolower() always receives the string it expects.
    $first = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtolower($first) . $rest;
  }
2956
2957
  /**
   * Strip whitespace or other characters from beginning of a UTF-8 string.
   *
   * @param string $str   <p>The string to be trimmed (cast to string first).</p>
   * @param string $chars <p>
   *                      Optional characters to be stripped. When omitted or empty, leading characters
   *                      of the Unicode categories Z (separators) and C (other/control) are removed.
   *                      </p>
   *
   * @return string <p>The string with unwanted characters stripped from the left.</p>
   */
  public static function ltrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // "0" is falsy in PHP but is a legitimate character to strip,
    // so it must not select the default behaviour.
    $zeroRequested = ($chars === '0' || $chars === 0);

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    if ($zeroRequested === false && ($chars === INF || !$chars)) {
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
    }

    $class = self::rxClass($zeroRequested ? '0' : $chars);

    return preg_replace("/^{$class}+/u", '', $str);
  }
2982
2983
  /**
   * Returns the UTF-8 character with the maximum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings (joined before inspection).</p>
   *
   * @return string <p>
   *                The character with the highest code point, or an empty string
   *                when UTF8::codepoints() yields no code points.
   *                </p>
   */
  public static function max($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // max() must not be called with an empty list (warning on PHP < 8, ValueError on PHP >= 8).
    if (!is_array($codepoints) || count($codepoints) === 0) {
      return '';
    }

    return self::chr(max($codepoints));
  }
2998
2999
  /**
   * Determines the largest per-character byte length in the given string,
   * based on the list returned by UTF8::chr_size_list().
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>Max byte length of the given chars, or 0 if the size list is empty.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    return count($sizes) > 0 ? (int)max($sizes) : 0;
  }
3016
3017
  /**
   * Reports whether the "mbstring" extension is loaded.
   *
   * Side effect: when the extension is loaded, the mbstring internal encoding is set to UTF-8.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function mbstring_loaded()
  {
    $loaded = extension_loaded('mbstring');

    if ($loaded !== true) {
      return $loaded;
    }

    \mb_internal_encoding('UTF-8');

    return $loaded;
  }
3032
3033
  /**
   * Returns the UTF-8 character with the minimum code point in the given data.
   *
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings (joined before inspection).</p>
   *
   * @return string <p>
   *                The character with the lowest code point, or an empty string
   *                when UTF8::codepoints() yields no code points.
   *                </p>
   */
  public static function min($arg)
  {
    if (is_array($arg)) {
      $arg = implode('', $arg);
    }

    $codepoints = self::codepoints($arg);

    // min() must not be called with an empty list (warning on PHP < 8, ValueError on PHP >= 8).
    if (!is_array($codepoints) || count($codepoints) === 0) {
      return '';
    }

    return self::chr(min($codepoints));
  }
3048
3049
  /**
   * Alias of "UTF8::normalize_encoding()".
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>The encoding name to normalize.</p>
   *
   * @return string|false <p>Whatever UTF8::normalize_encoding() returns; that is false if no encoding was given.</p>
   */
  public static function normalizeEncoding($encoding)
  {
    $normalized = self::normalize_encoding($encoding);

    return $normalized;
  }
3062
3063
  /**
3064
   * Normalize the encoding-"name" input.
3065
   *
3066
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
3067
   *
3068
   * @return string <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.</p>
3069
   */
3070 16
  public static function normalize_encoding($encoding)
3071
  {
3072 16
    static $staticNormalizeEncodingCache = array();
3073
3074 16
    if (!$encoding) {
3075 2
      return false;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return false; (false) is incompatible with the return type documented by voku\helper\UTF8::normalize_encoding of type string.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
3076
    }
3077
3078 16
    if ('UTF-8' === $encoding) {
3079 1
      return $encoding;
3080
    }
3081
3082 16
    if (in_array($encoding, self::$iconvEncoding, true)) {
3083 4
      return $encoding;
3084
    }
3085
3086 15
    if (isset($staticNormalizeEncodingCache[$encoding])) {
3087 14
      return $staticNormalizeEncodingCache[$encoding];
3088
    }
3089
3090 4
    $encodingOrig = $encoding;
3091 4
    $encoding = strtoupper($encoding);
3092 4
    $encodingUpperHelper = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
3093
3094
    $equivalences = array(
3095 4
        'ISO88591'    => 'ISO-8859-1',
3096 4
        'ISO8859'     => 'ISO-8859-1',
3097 4
        'ISO'         => 'ISO-8859-1',
3098 4
        'LATIN1'      => 'ISO-8859-1',
3099 4
        'LATIN'       => 'ISO-8859-1',
3100 4
        'WIN1252'     => 'ISO-8859-1',
3101 4
        'WINDOWS1252' => 'ISO-8859-1',
3102 4
        'UTF16'       => 'UTF-16',
3103 4
        'UTF32'       => 'UTF-32',
3104 4
        'UTF8'        => 'UTF-8',
3105 4
        'UTF'         => 'UTF-8',
3106 4
        'UTF7'        => 'UTF-7',
3107 4
        '8BIT'        => 'CP850',
3108 4
        'BINARY'      => 'CP850',
3109 4
    );
3110
3111 4
    if (!empty($equivalences[$encodingUpperHelper])) {
3112 4
      $encoding = $equivalences[$encodingUpperHelper];
3113 4
    }
3114
3115 4
    $staticNormalizeEncodingCache[$encodingOrig] = $encoding;
3116
3117 4
    return $encoding;
3118
  }
3119
3120
  /**
   * Normalize some MS Word special characters.
   *
   * Every key of self::$utf8MSWord that occurs in the string is replaced by the value
   * that is stored for this key.
   *
   * @param string $str <p>The string to be normalized.</p>
   *
   * @return string
   */
  public static function normalize_msword($str)
  {
    // [0] => search strings, [1] => replacements (built on the first call only)
    static $msWordMap = null;

    if ($msWordMap === null) {
      $msWordMap = array(
          array_keys(self::$utf8MSWord),
          array_values(self::$utf8MSWord),
      );
    }

    return str_replace($msWordMap[0], $msWordMap[1], $str);
  }
3139
3140
  /**
   * Normalize the whitespace.
   *
   * Every entry of self::$whitespaceTable that occurs in the string is replaced by a plain space and,
   * unless requested otherwise, every entry of self::$bidiUniCodeControlsTable is removed.
   *
   * @param string $str                     <p>The string to be normalized.</p>
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
   *                                        bidirectional text chars.</p>
   *
   * @return string
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // Only a strict "true" keeps the non-breaking space. The cache key is derived from this very
    // same check, otherwise a merely truthy value (e.g. 1) would store the complete table under
    // the key that is used for "true" and break every later call.
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = (int)$keepNbsp;

    if (!isset($whitespaces[$cacheKey])) {

      $whitespaceTable = self::$whitespaceTable;

      if ($keepNbsp) {
        /** @noinspection OffsetOperationsInspection */
        unset($whitespaceTable['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($whitespaceTable);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3179
3180
  /**
   * Format a number with grouped thousands.
   *
   * @param float  $number        <p>The number to be formatted.</p>
   * @param int    $decimals      <p>The number of decimal digits.</p>
   * @param string $dec_point     <p>The separator for the decimal point.</p>
   * @param string $thousands_sep <p>The thousands separator.</p>
   *
   * @return string
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    if (
        isset($thousands_sep[1], $dec_point[1])
        &&
        Bootup::is_php('5.4') === true
    ) {
      // Both separators are longer than one byte: format with the ASCII separators first and swap
      // them afterwards. "strtr()" swaps both in one single pass, so a separator that contains
      // "." or "," itself is not replaced a second time (as it was the case with "str_replace()").
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3217
3218
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * If "IntlChar" is supported and yields a non-empty result, that result is used. Otherwise the
   * code point is calculated from the lead byte and up to three following bytes; the following
   * bytes are not validated here.
   *
   * @param string $chr <p>The character of which to calculate code point.</p>
   *
   * @return int <p>
   *             Unicode code point of the given character,<br />
   *             0 for an empty input.
   *             </p>
   */
  public static function ord($chr)
  {
    // empty input ("0" is a real character, even if it is "falsy" in PHP)
    if (!$chr && $chr !== '0') {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      $intlCodePoint = \IntlChar::ord($chr);
      if ($intlCodePoint) {
        return $intlCodePoint;
      }
    }

    // static cache for the manual calculation, indexed by the input string
    static $codePoints = array();
    if (isset($codePoints[$chr]) === true) {
      return $codePoints[$chr];
    }

    // 1-based list of the first (max.) four byte values
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    if ($lead >= 0xF0 && isset($bytes[4])) {
      // four byte sequence
      $codePoint = (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    } elseif ($lead >= 0xE0 && isset($bytes[3])) {
      // three byte sequence
      $codePoint = (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    } elseif ($lead >= 0xC0 && isset($bytes[2])) {
      // two byte sequence
      $codePoint = (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    } else {
      // single byte (or a sequence that is too short): the value of the first byte
      $codePoint = $lead;
    }

    $codePoints[$chr] = $codePoint;

    return $codePoint;
  }
3271
3272
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike "parse_str()", this method never sets variables in the current scope;
   *          the result is only written into the second parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str    <p>The input string, it is passed through "UTF8::clean()" before parsing.</p>
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
   *
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string or if the $result is empty.</p>
   */
  public static function parse_str($str, &$result)
  {
    $cleanedStr = self::clean($str);

    if (\mb_parse_str($cleanedStr, $result) === false) {
      return false;
    }

    // an empty result counts as a failure, too
    return !empty($result);
  }
3297
3298
  /**
   * Checks if the "/u" pattern modifier is available, which enables Unicode support in PCRE.
   *
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
   */
  public static function pcre_utf8_support()
  {
    // the empty pattern matches the empty string, as long as the modifier is accepted
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $matchResult = @preg_match('//u', '');

    return (bool)$matchResult;
  }
3308
3309
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * Each boundary is resolved into a code point: a string of decimal digits is used as it is,
   * a string of hexadecimal digits is converted via "UTF8::hex_to_int()" and everything else
   * via "UTF8::ord()".
   *
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
   *
   * @return array <p>The characters of the range, or an empty array if a boundary is empty or resolves to 0.</p>
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    // [0] => first code point, [1] => last code point
    $codePoints = array();

    foreach (array($var1, $var2) as $boundary) {
      if (ctype_digit((string)$boundary)) {
        $codePoint = (int)$boundary;
      } elseif (ctype_xdigit($boundary)) {
        $codePoint = (int)self::hex_to_int($boundary);
      } else {
        $codePoint = self::ord($boundary);
      }

      if (!$codePoint) {
        return array();
      }

      $codePoints[] = $codePoint;
    }

    return array_map(
        array(
            '\\voku\\helper\\UTF8',
            'chr',
        ),
        range($codePoints[0], $codePoints[1])
    );
  }
3355
3356
  /**
   * Alias of "UTF8::remove_bom()".
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever "UTF8::remove_bom()" returns for the given string.</p>
   */
  public static function removeBOM($str)
  {
    $strWithoutBom = self::remove_bom($str);

    return $strWithoutBom;
  }
3369
3370
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of self::$bom (BOM string => byte length of the BOM) is tried in the order of the
   * table, and each one that is found at the very beginning of the string is cut off.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>String without UTF-BOM</p>
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $bomString => $bomByteLength) {
      // not at position 0 (or not found at all) -> try the next BOM
      if (strpos($str, $bomString) !== 0) {
        continue;
      }

      $str = substr($str, $bomByteLength);
    }

    return $str;
  }
3387
3388
  /**
   * Removes duplicate occurrences of a string in another string.
   *
   * Every run of directly consecutive repetitions of a search string is collapsed into one single
   * occurrence. A $what that is neither a string nor an array leaves the base string untouched.
   *
   * @param string          $str  <p>The base string.</p>
   * @param string|string[] $what <p>String to search for in the base string.</p>
   *
   * @return string <p>The result string with removed duplicates.</p>
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    $needles = is_string($what) ? array($what) : $what;

    if (!is_array($needles)) {
      return $str;
    }

    foreach ($needles as $needle) {
      $pattern = '/(' . preg_quote($needle, '/') . ')+/';
      $str = preg_replace($pattern, $needle, $str);
    }

    return $str;
  }
3410
3411
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * Copied from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * @param string $str         <p>The input string.</p>
   * @param bool   $url_encoded <p>Set to true, to remove the url encoded form (e.g. "%00") of these characters, too.</p>
   * @param string $replacement <p>The string that is inserted instead of the removed characters.</p>
   *
   * @return string
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // every control character except newline (dec 10),
    // carriage return (dec 13) and horizontal tab (dec 09)
    if ($url_encoded) {
      // INFO: hex digits of a percent-encoding can be upper or lower case ("%0B" and "%0b"),
      //       so these patterns have to be case-insensitive
      $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
      $non_displayables[] = '/%1[0-9a-f]/i'; // url encoded 16-31
      $non_displayables[] = '/%7f/i'; // url encoded 127
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    // repeat until nothing was replaced anymore, because a removal can produce a new match
    // (e.g. "%0%000" -> "%00")
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3444
3445
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown <p>The string that is inserted instead of the diamond question mark.</p>
   *
   * @return string
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // the replacement character, once as escaped byte sequence and once as literal character
    $diamondQuestionMarks = array(
        "\xEF\xBF\xBD",
        '�',
    );

    $replacements = array(
        $unknown,
        $unknown,
    );

    return str_replace($diamondQuestionMarks, $replacements, $str);
  }
3467
3468
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * Without a character list, every trailing character of the Unicode categories
   * "separator" (\pZ) and "other" (\pC) is stripped.
   *
   * @param string $str   <p>The string to be trimmed.</p>
   * @param string $chars <p>Optional characters to be stripped.</p>
   *
   * @return string <p>The string with unwanted characters stripped from the right.</p>
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    //
    // INFO: "0" is a valid character list, even if it is "falsy" in PHP
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+$/u', '', $str);
    }

    $charsPattern = self::rxClass($chars);

    return preg_replace("/{$charsPattern}+$/u", '', $str);
  }
3493
3494
  /**
   * Build a regex fragment that matches any of the given characters.
   *
   * The characters of $s are collected in one bracket expression; elements of "UTF8::str_split()"
   * that are longer than two bytes and longer than one character are added as alternatives instead.
   *
   * @param string $s     <p>The characters that should be matched.</p>
   * @param string $class <p>Initial content of the bracket expression.</p>
   *
   * @return string <p>A bracket expression like "[abc]" or a non-capturing group like "(?:[abc]|...)".</p>
   */
  private static function rxClass($s, $class = '')
  {
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // content of the bracket expression + everything that can't be a part of it
    $bracketContent = $class;
    $alternatives = array();

    foreach (self::str_split($s) as $element) {
      if ('-' === $element) {
        // the hyphen is moved to the front, so that it can't be read as a range
        $bracketContent = '-' . $bracketContent;
      } elseif (!isset($element[2])) {
        $bracketContent .= preg_quote($element, '/');
      } elseif (1 === self::strlen($element)) {
        $bracketContent .= $element;
      } else {
        $alternatives[] = $element;
      }
    }

    if ($bracketContent) {
      $bracketContent = '[' . $bracketContent . ']';
    }

    if ($alternatives) {
      $return = '(?:' . implode('|', array_merge(array($bracketContent), $alternatives)) . ')';
    } else {
      $return = $bracketContent;
    }

    $rxClassCache[$cacheKey] = $return;

    return $return;
  }
3541
3542
  /**
   * WARNING: Echo native UTF8-Support libs, e.g. for debugging.
   *
   * Every value of self::$support is printed, each followed by a newline and a "<br>".
   */
  public static function showSupport()
  {
    $lineEnd = "\n<br>";

    foreach (self::$support as $supportValue) {
      echo $supportValue . $lineEnd;
    }
  }
3551
3552
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, or an empty string for an empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // empty input ("0" is a real character, even if it is "falsy" in PHP)
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3576
3577
  /**
   * Convert a string to an array of Unicode characters.
   *
   * @param string  $str       <p>The string to split into array.</p>
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string
   *                           (only applied if PCRE supports UTF-8).</p>
   *
   * @return string[] <p>An array containing chunks of the string.</p>
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    // init
    $ret = array();

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      preg_match_all('/./us', $str, $matches);
      if (isset($matches[0])) {
        $ret = $matches[0];
      }

    } else {

      // fallback: decode the byte sequences by hand, bytes that are not a part of a
      // complete sequence are skipped

      $byteCount = strlen($str);

      for ($i = 0; $i < $byteCount; ++$i) {
        $leadByte = ord($str[$i]);

        // single byte (ASCII)
        if (($leadByte & 0x80) === 0x00) {
          $ret[] = $str[$i];

          continue;
        }

        // number of continuation bytes that are announced by the lead byte
        if (($leadByte & 0xE0) === 0xC0) {
          $trailing = 1;
        } elseif (($leadByte & 0xF0) === 0xE0) {
          $trailing = 2;
        } elseif (($leadByte & 0xF8) === 0xF0) {
          $trailing = 3;
        } else {
          continue;
        }

        // the string ends before the sequence is complete
        if (!isset($str[$i + $trailing])) {
          continue;
        }

        $isComplete = true;
        for ($offset = 1; $offset <= $trailing; ++$offset) {
          if ((ord($str[$i + $offset]) & 0xC0) !== 0x80) {
            $isComplete = false;

            break;
          }
        }

        if ($isComplete) {
          $ret[] = substr($str, $i, $trailing + 1);

          $i += $trailing;
        }
      }
    }

    if ($length > 1) {
      // join every $length characters into one element
      $ret = array_map('implode', array_chunk($ret, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($ret[0]) && $ret[0] === '') {
      return array();
    }

    return $ret;
  }
3659
3660
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * The checks are done in this order: UTF-16 / UTF-32 (for binary strings only), ASCII, UTF-8,
   * "\mb_detect_encoding()" and finally a round trip through "iconv()" for every entry of
   * self::$iconvEncoding.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return false|string <p>
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   *                      </p>
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      $utf16 = self::is_utf16($str);
      if ($utf16 === 1) {
        return 'UTF-16LE';
      }
      if ($utf16 === 2) {
        return 'UTF-16BE';
      }

      $utf32 = self::is_utf32($str);
      if ($utf32 === 1) {
        return 'UTF-32LE';
      }
      if ($utf32 === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $mbEncoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($mbEncoding) {
      return $mbEncoding;
    }

    //
    // 5.) check via "iconv()"
    //
    // An encoding is accepted, if the string is still the same after it was converted
    // from this encoding into this very same encoding.

    $originalHash = md5($str);
    foreach (self::$iconvEncoding as $iconvEncoding) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      $roundTrip = @iconv($iconvEncoding, $iconvEncoding, $str);

      if (md5($roundTrip) === $originalHash) {
        return $iconvEncoding;
      }
    }

    return false;
  }
3752
3753
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       Every replacement with search array is
   *                       performed on the result of previous replacement.
   *                       An empty search string never matches.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement string (or an array of replacements), it is inserted
   *                       literally.
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed <p>A string or an array of replacements.</p>
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // convert every search string into a case-insensitive unicode pattern
    $patterns = array();
    foreach ((array)$search as $needle) {
      $needle = (string)$needle;

      if ('' === $needle) {
        // this pattern can't match anything: an empty needle must not replace anything
        $patterns[] = '/^(?<=.)$/';
      } else {
        $patterns[] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // "preg_replace()" reads "$n" and "\n" in a replacement as back-references, but the replacement
    // has to be inserted literally (like "str_ireplace()" does), so these characters are escaped
    if (is_array($replace)) {
      $replacement = array();
      foreach ($replace as $singleReplace) {
        $replacement[] = addcslashes((string)$singleReplace, '\\$');
      }
    } else {
      $replacement = addcslashes((string)$replace, '\\$');
    }

    $replaced = 0;
    $subject = preg_replace($patterns, $replacement, $subject, -1, $replaced);
    $count = $replaced; // used as reference parameter

    return $subject;
  }
3796
3797
  /**
   * Limit the number of characters in a string, without cutting the last word into pieces.
   *
   * A string that is longer than the limit is cut at the limit, the last (maybe incomplete) word is
   * dropped and the add-on is appended. If nothing is left after dropping that word, the string
   * is cut one character before the limit instead.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $length   <p>The max. number of characters.</p>
   * @param string $strAddOn <p>The string that is appended to a shortened string.</p>
   *
   * @return string
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    // short enough, nothing to do
    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit is right behind a word: cut the space and we are done
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    $truncated = self::substr($str, 0, $length);

    // drop everything behind the last space
    $words = explode(' ', $truncated);
    $withoutLastWord = implode(' ', array_slice($words, 0, -1));

    if ($withoutLastWord !== '') {
      return $withoutLastWord . $strAddOn;
    }

    return self::substr($truncated, 0, $length - 1) . $strAddOn;
  }
3837
3838
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * The input string is returned unchanged, if the pad length is not an integer, if it is not
   * greater than zero, if it is smaller than the length of the input string or if the pad string
   * is empty.
   *
   * @param string $str        <p>The input string.</p>
   * @param int    $pad_length <p>The length of return string.</p>
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
   * @param int    $pad_type   [optional] <p>
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
   *                           </p>
   *
   * @return string <strong>Returns the padded string</strong>
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (
        is_int($pad_length) !== true
        ||
        $pad_length <= 0
        ||
        $pad_length < $str_length
    ) {
      return $str;
    }

    $ps_length = self::strlen($pad_string);

    // nothing to pad with (and the length is used as divisor below)
    if (!$ps_length) {
      return $str;
    }

    // number of characters that are missing
    $diff = $pad_length - $str_length;

    switch ($pad_type) {
      case STR_PAD_LEFT:
        $pre = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $pre = self::substr($pre, 0, $diff);
        $post = '';
        break;

      case STR_PAD_BOTH:
        // the left side gets the smaller half, if the characters can't be split evenly
        $padding = str_repeat($pad_string, (int)ceil($diff / $ps_length / 2));
        $pre = self::substr($padding, 0, (int)($diff / 2));
        $post = self::substr($padding, 0, (int)ceil($diff / 2));
        break;

      case STR_PAD_RIGHT:
      default:
        $post = str_repeat($pad_string, (int)ceil($diff / $ps_length));
        $post = self::substr($post, 0, $diff);
        $pre = '';
    }

    return $pre . $str . $post;
  }
3892
3893
  /**
   * Repeat a string a given number of times.
   *
   * The input is passed through UTF8::filter() before it is repeated.
   *
   * @param string $str        <p>The string to be repeated.</p>
   * @param int    $multiplier <p>
   *                           Number of times the input string should be repeated.
   *                           Has to be greater than or equal to 0.
   *                           </p>
   *
   * @return string <p>The repeated string.</p>
   */
  public static function str_repeat($str, $multiplier)
  {
    return str_repeat(self::filter($str), $multiplier);
  }
3917
3918
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * INFO: This is only a wrapper for the native "str_replace()", which is already UTF-8 safe.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>
   *                       The value being searched for (the needle).
   *                       An array may be used to designate multiple needles.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement value for found search values.
   *                       An array may be used to designate multiple replacements.
   *                       </p>
   * @param mixed $subject <p>
   *                       The string or array being searched and replaced on (the haystack).
   *                       If subject is an array, the search and replace is performed on
   *                       every entry and the return value is an array as well.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed <p>A string or an array with the replaced values.</p>
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    $replaced = str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
3951
3952
  /**
   * Shuffle all the characters of a string.
   *
   * The string is broken into characters via UTF8::split(), the resulting
   * array is shuffled and joined back together.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The shuffled string.</p>
   */
  public static function str_shuffle($str)
  {
    $chars = self::split($str);
    shuffle($chars);

    return implode('', $chars);
  }
3967
3968
  /**
   * Sort all characters of a string according to their code points.
   *
   * @param string $str    <p>A UTF-8 string.</p>
   * @param bool   $unique <p>Sort unique. If <strong>true</strong>, repeated characters are ignored.</p>
   * @param bool   $desc   <p>If <strong>true</strong>, will sort characters in reverse code point order.</p>
   *
   * @return string <p>String of sorted characters.</p>
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codepoints = self::codepoints($str);

    if ($unique) {
      // flipping twice collapses duplicate values
      $codepoints = array_flip(array_flip($codepoints));
    }

    if (!$desc) {
      asort($codepoints);
    } else {
      arsort($codepoints);
    }

    return self::string($codepoints);
  }
3993
3994
  /**
   * Split a string into an array of chunks, counted in grapheme clusters.
   *
   * @param string $str <p>The input string.</p>
   * @param int    $len <p>
   *                    Number of grapheme clusters per chunk. Values lower than 1
   *                    are handed over to the native "str_split()".
   *                    </p>
   *
   * @return array <p>The chunks; an empty array for an empty input string.</p>
   */
  public static function str_split($str, $len = 1)
  {
    $str = (string)$str;
    $len = (int)$len;

    if ($str === '') {
      return array();
    }

    if ($len < 1) {
      // let the native function deal with the invalid length
      return str_split($str, $len);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $graphemes = $matches[0];

    if ($len === 1) {
      return $graphemes;
    }

    // glue every group of $len grapheme clusters back into one string
    $chunks = array();
    foreach (array_chunk($graphemes, $len) as $group) {
      $chunks[] = implode('', $group);
    }

    return $chunks;
  }
4037
4038
  /**
   * Get a binary representation (string of "0" and "1") of a specific string.
   *
   * The bytes of the string are read as one big-endian number; leading zero
   * bits are not part of the result and an all-zero (or empty) input yields "0".
   *
   * Each hex digit is converted on its own, because "base_convert()" loses
   * precision as soon as the value no longer fits into a native number.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The binary digits.</p>
   */
  public static function str_to_binary($str)
  {
    $unpacked = unpack('H*', (string)$str);
    $hex = $unpacked[1];

    $bits = '';
    $hexLength = strlen($hex);
    for ($i = 0; $i < $hexLength; $i++) {
      // one hex digit === exactly four bits
      $bits .= sprintf('%04b', hexdec($hex[$i]));
    }

    // same shape as "base_convert()": no leading zeros, but never an empty result
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
4053
4054
  /**
   * Alias for "UTF8::to_ascii()".
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str
   * @param string $unknown
   * @param bool   $strict
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?', $strict = false)
  {
    $ascii = self::to_ascii($str, $unknown, $strict);

    return $ascii;
  }
4069
4070
  /**
   * Count the words of a UTF-8 string, or return the words themselves.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $format   [optional] <p>
   *                         <strong>0</strong> => return the number of words (default)<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with the word-offset as key
   *                         </p>
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word.</p>
   *
   * @return array|int <p>The number of words, or the list of words (depending on $format).</p>
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $charlist = self::rxClass($charlist, '\pL');

    // with PREG_SPLIT_DELIM_CAPTURE the words end up on the odd indexes,
    // the text between them on the even indexes
    $strParts = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    $len = count($strParts);

    if ($format === 1) {
      $words = array();
      for ($i = 1; $i < $len; $i += 2) {
        $words[] = $strParts[$i];
      }

      return $words;
    }

    if ($format === 2) {
      $words = array();
      $offset = self::strlen($strParts[0]);
      for ($i = 1; $i < $len; $i += 2) {
        $words[$offset] = $strParts[$i];
        // advance by the word itself plus the separator that follows it
        $offset += self::strlen($strParts[$i]) + self::strlen($strParts[$i + 1]);
      }

      return $words;
    }

    return ($len - 1) / 2;
  }
4114
4115
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Both strings are case-folded via UTF8::strtocasefold() and then compared with UTF8::strcmp().
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <p>
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   *             </p>
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4133
4134
  /**
   * Alias for "UTF8::strstr()".
   *
   * @see UTF8::strstr()
   *
   * @param string  $haystack
   * @param string  $needle
   * @param bool    $before_needle
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $part = self::strstr($haystack, $needle, $before_needle, $encoding, $cleanUtf8);

    return $part;
  }
4151
4152
  /**
   * Case-sensitive string comparison.
   *
   * Identical strings short-circuit to 0; otherwise both strings are
   * normalized to NFD and compared with the native "strcmp()".
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int  <p>
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   *              </p>
   */
  public static function strcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    return strcmp(
        \Normalizer::normalize($str1, \Normalizer::NFD),
        \Normalizer::normalize($str2, \Normalizer::NFD)
    );
  }
4171
4172
  /**
   * Find the length of the initial segment not matching the mask.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $charList <p>The characters that end the initial segment.</p>
   * @param int    $offset   [optional] <p>Start position passed to UTF8::substr().</p>
   * @param int    $length   [optional] <p>Length passed to UTF8::substr().</p>
   *
   * @return int|null <p>
   *                  The length of the segment (in characters),<br />
   *                  or <strong>null</strong> if $charList or the (sliced) string is empty.
   *                  </p>
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList = (string)$charList;
    if ($charList === '') {
      return null;
    }

    // only slice the string if a non-default offset / length was requested
    if ($offset || $length !== 2147483647) {
      $str = (string)self::substr($str, $offset, $length);
    }

    $str = (string)$str;
    if ($str === '') {
      return null;
    }

    // lazily capture everything in front of the first char from the mask
    $found = preg_match('/^(.*?)' . self::rxClass($charList) . '/us', $str, $matches);
    if ($found) {
      return self::strlen($matches[1]);
    }

    return self::strlen($str);
  }
4204
4205
  /**
   * Alias for "UTF8::stristr()".
   *
   * @see UTF8::stristr()
   *
   * @param string  $haystack
   * @param string  $needle
   * @param bool    $before_needle
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $part = self::stristr($haystack, $needle, $before_needle, $encoding, $cleanUtf8);

    return $part;
  }
4222
4223
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
   *
   * @return string <p>UTF-8 encoded string.</p>
   */
  public static function string(array $array)
  {
    $result = '';

    // every code point is converted via UTF8::chr() and appended in array order
    foreach ($array as $codepoint) {
      $result .= self::chr($codepoint);
    }

    return $result;
  }
4244
4245
  /**
   * Checks if a string starts with a "BOM" (Byte Order Mark Character).
   *
   * The candidates are the keys of self::$bom.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function string_has_bom($str)
  {
    foreach (array_keys(self::$bom) as $bomString) {
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4262
4263
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>The input string.</p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped.
   *                               </p>
   *                               <p>
   *                               HTML comments and PHP tags are also stripped. This is hardcoded and
   *                               can not be changed with allowable_tags.
   *                               </p>
   *
   * @return string <p>The stripped string.</p>
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    // broken UTF-8 is cleaned before the native function runs
    return strip_tags(self::clean($str), $allowable_tags);
  }
4289
4290
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>
   *                           The string from which to get the position of the first occurrence
   *                           of needle
   *                           </p>
   * @param string  $needle    <p>
   *                           The string to find in haystack
   *                           </p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack to start searching;
   *                           <strong>null</strong> is treated as 0.
   *                           </p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found (or if haystack / needle is empty).
   *                   </p>
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // "\mb_stripos" expects an integer offset: passing the default "null" through
    // is deprecated on newer PHP versions, so it is cast to 0 explicitly
    return \mb_stripos($haystack, $needle, (int)$offset, $encoding);
  }
4340
4341
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (case insensitive).
   *
   * @param string  $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string  $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool    $before_needle [optional] <p>
   *                               If <b>TRUE</b>, the part of the haystack before the first
   *                               occurrence of the needle (excluding the needle) is returned.
   *                               </p>
   * @param string  $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is empty or not found.
   */
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $needle = (string)$needle;
    if ($needle === '') {
      return false;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    $part = \mb_stristr($haystack, $needle, $before_needle, $encoding);

    return $part;
  }
4374
4375
  /**
   * Get the string length, not the byte-length!
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       <p>The string being checked for length.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (UTF-8 only).</p>
   *
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)</p>
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;
    if ($str === '') {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding !== 'UTF-8' && $encoding !== true && $encoding !== false) {
      $encoding = self::normalize_encoding($encoding);
    } else {
      $encoding = 'UTF-8';
    }

    // for these encodings the byte-length is returned directly
    switch ($encoding) {
      case 'ASCII':
      case 'CP850':
        return strlen($str);
    }

    if ($cleanUtf8 === true && $encoding === 'UTF-8') {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4417
4418
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: Both strings are case-folded via UTF8::strtocasefold() and then compared with UTF8::strnatcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4434
4435
  /**
   * String comparisons using a "natural order" algorithm.
   *
   * INFO: Identical strings short-circuit to 0; otherwise both strings go through
   * UTF8::strtonatfold() before the native "strnatcmp()" is applied.
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    if ((string)$str1 === (string)$str2) {
      return 0;
    }

    return strnatcmp(self::strtonatfold($str1), self::strtonatfold($str2));
  }
4453
4454
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * INFO: Both strings are case-folded via UTF8::strtocasefold() and then compared with UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4471
4472
  /**
   * String comparison of the first n characters.
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // UTF8::substr() can also return "false"; the cast makes sure that
    // UTF8::strcmp() always receives strings (false becomes '')
    $str1 = (string)self::substr($str1, 0, $len);
    $str2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($str1, $str2);
  }
4492
4493
  /**
4494
   * Search a string for any of a set of characters.
4495
   *
4496
   * @link  http://php.net/manual/en/function.strpbrk.php
4497
   *
4498
   * @param string $haystack  <p>The string where char_list is looked for.</p>
4499
   * @param string $char_list <p>This parameter is case sensitive.</p>
4500
   *
4501
   * @return string|false String starting from the character found, or false if it is not found.
4502
   */
4503 1
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    // An empty haystack or an empty character list can never match.
    if (!isset($haystack[0], $char_list[0])) {
      return false;
    }

    // Build a character class from the list and look for its first hit.
    $pattern = '/' . self::rxClass($char_list) . '/us';
    if (!preg_match($pattern, $haystack, $hit)) {
      return false;
    }

    // Return everything from the first matched character up to the end.
    return substr($haystack, strpos($haystack, $hit[0]));
  }
4518
4519
  /**
4520
   * Find position of first occurrence of string in a string.
4521
   *
4522
   * @link http://php.net/manual/en/function.mb-strpos.php
4523
   *
4524
   * @param string  $haystack  <p>The string being checked.</p>
4525
   * @param string  $needle    <p>The string to find in haystack.</p>
4526
   * @param int     $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
4527
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4528
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4529
   *
4530
   * @return int|false <p>
4531
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
4532
   *                   If needle is not found it returns false.
4533
   *                   </p>
4534
   */
4535 15
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Find the character position of the first occurrence of $needle in
    // $haystack; returns false when either input is empty or nothing matches.
    $haystack = (string)$haystack;

    // iconv and mbstring do not support integer $needle, so a non-negative
    // integer is treated as a code point and converted to its character.
    // This must run BEFORE the string cast below: once $needle is a string,
    // the strict integer comparison can never be true.
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    if (self::$support['iconv'] === true) {
      // ignore invalid negative offset to keep compatibility
      // with php < 5.5.35, < 5.6.21, < 7.0.6
      return \iconv_strpos($haystack, $needle, $offset > 0 ? $offset : 0, $encoding);
    }

    // fallback: byte-wise search, then convert the byte position of the hit
    // into a character position by measuring the text in front of it

    if ($offset > 0) {
      // self::substr() may return false when $offset is past the end,
      // so force a string before handing it to the byte-wise functions
      $haystack = (string)self::substr($haystack, $offset);
    }

    $pos = strpos($haystack, $needle);
    if ($pos === false) {
      return false;
    }

    $left = substr($haystack, 0, $pos);

    // negative offset not supported in PHP strpos(), ignoring
    return ($offset > 0 ? $offset : 0) + self::strlen($left);
  }
4597
4598
  /**
4599
   * Finds the last occurrence of a character in a string within another.
4600
   *
4601
   * @link http://php.net/manual/en/function.mb-strrchr.php
4602
   *
4603
   * @param string $haystack      <p>The string from which to get the last occurrence of needle.</p>
4604
   * @param string $needle        <p>The string to find in haystack</p>
4605
   * @param bool   $before_needle [optional] <p>
4606
   *                              Determines which portion of haystack
4607
   *                              this function returns.
4608
   *                              If set to true, it returns all of haystack
4609
   *                              from the beginning to the last occurrence of needle.
4610
   *                              If set to false, it returns all of haystack
4611
   *                              from the last occurrence of needle to the end,
4612
   *                              </p>
4613
   * @param string $encoding      [optional] <p>
4614
   *                              Character encoding name to use.
4615
   *                              If it is omitted, internal character encoding is used.
4616
   *                              </p>
4617
   *
4618
   * @return string|false The portion of haystack or false if needle is not found.
4619
   */
4620 1 View Code Duplication
  public static function strrchr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Anything other than the literal 'UTF-8' goes through normalize_encoding().
    $encoding = ($encoding === 'UTF-8') ? 'UTF-8' : self::normalize_encoding($encoding);

    // Optionally clean both inputs before handing them to mbstring.
    if (true === $cleanUtf8) {
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    $portion = \mb_strrchr($haystack, $needle, $before_needle, $encoding);

    return $portion;
  }
4635
4636
  /**
4637
   * Reverses characters order in the string.
4638
   *
4639
   * @param string $str The input string
4640
   *
4641
   * @return string The string with characters in the reverse sequence
4642
   */
4643 4
  public static function strrev($str)
  {
    $str = (string)$str;

    // Nothing to reverse.
    if ($str === '') {
      return '';
    }

    // Split into characters, flip the order, and glue them back together.
    $chars = self::split($str);
    $reversed = array_reverse($chars);

    return implode($reversed);
  }
4653
4654
  /**
4655
   * Finds the last occurrence of a character in a string within another, case insensitive.
4656
   *
4657
   * @link http://php.net/manual/en/function.mb-strrichr.php
4658
   *
4659
   * @param string  $haystack      <p>The string from which to get the last occurrence of needle.</p>
4660
   * @param string  $needle        <p>The string to find in haystack.</p>
4661
   * @param bool    $before_needle [optional] <p>
4662
   *                               Determines which portion of haystack
4663
   *                               this function returns.
4664
   *                               If set to true, it returns all of haystack
4665
   *                               from the beginning to the last occurrence of needle.
4666
   *                               If set to false, it returns all of haystack
4667
   *                               from the last occurrence of needle to the end,
4668
   *                               </p>
4669
   * @param string  $encoding      [optional] <p>
4670
   *                               Character encoding name to use.
4671
   *                               If it is omitted, internal character encoding is used.
4672
   *                               </p>
4673
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
4674
   *
4675
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
4676
   */
4677 1 View Code Duplication
  public static function strrichr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Anything other than the literal 'UTF-8' goes through normalize_encoding().
    $encoding = ($encoding === 'UTF-8') ? 'UTF-8' : self::normalize_encoding($encoding);

    // Optionally clean both inputs before handing them to mbstring.
    if (true === $cleanUtf8) {
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    $portion = \mb_strrichr($haystack, $needle, $before_needle, $encoding);

    return $portion;
  }
4692
4693
  /**
4694
   * Find position of last occurrence of a case-insensitive string.
4695
   *
4696
   * @param string  $haystack  <p>The string to look in.</p>
4697
   * @param string  $needle    <p>The string to look for.</p>
4698
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
4699
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4700
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4701
   *
4702
   * @return int|false <p>
4703
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />If needle is
4704
   *                   not found, it returns false.
4705
   *                   </p>
4706
   */
4707 1
  public static function strripos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Case-insensitivity is achieved by lowercasing both sides first and
    // then delegating to the case-sensitive self::strrpos().
    $lowerHaystack = self::strtolower($haystack);
    $lowerNeedle = self::strtolower($needle);

    return self::strrpos($lowerHaystack, $lowerNeedle, $offset, $encoding, $cleanUtf8);
  }
4711
4712
  /**
4713
   * Find position of last occurrence of a string in a string.
4714
   *
4715
   * @link http://php.net/manual/en/function.mb-strrpos.php
4716
   *
4717
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle</p>
4718
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
4719
   * @param int        $offset    [optional] <p>May be specified to begin searching an arbitrary number of characters
4720
   *                              into the string. Negative values will stop searching at an arbitrary point prior to
4721
   *                              the end of the string.
4722
   *                              </p>
4723
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4724
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4725
   *
4726
   * @return int|false <p>The numeric position of the last occurrence of needle in the haystack string.<br />If needle
4727
   *                   is not found, it returns false.</p>
4728
   */
4729 11
  public static function strrpos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // A non-negative integer needle is treated as a code point.
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }

    $needle = (string)$needle;

    // Empty haystack or empty needle: nothing can be found.
    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    $offset = (int)$offset;

    // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters
    // INFO: the "bool"-check on $encoding is only a fallback for old versions
    $mustClean = ($cleanUtf8 === true || $encoding === true);
    if ($mustClean) {
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding === 'UTF-8' || $encoding === true || $encoding === false) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // INFO: use "mb_"-function (with polyfill) also if we need another encoding
    if ($encoding !== 'UTF-8' || self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback: narrow the haystack according to $offset, search byte-wise,
    // then translate the byte position into a character position

    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $bytePos = strrpos($haystack, $needle);
    if ($bytePos === false) {
      return false;
    }

    $before = substr($haystack, 0, $bytePos);

    // negative offset not supported in PHP strpos(), ignoring
    return ($offset > 0 ? $offset : 0) + self::strlen($before);
  }
4801
4802
  /**
4803
   * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
4804
   * mask.
4805
   *
4806
   * @param string $str    <p>The input string.</p>
4807
   * @param string $mask   <p>The mask of chars</p>
4808
   * @param int    $offset [optional]
4809
   * @param int    $length [optional]
4810
   *
4811
   * @return int
4812
   */
4813 10
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    $length = (int)$length;
    $offset = (int)$offset;

    // Only cut the input when the caller asked for a window that differs
    // from the defaults (offset 0, "unlimited" length).
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    $str = (string)$str;
    if (!isset($str[0], $mask[0])) {
      return 0;
    }

    // Match the longest leading run made up solely of mask characters.
    $pattern = '/^' . self::rxClass($mask) . '+/u';
    if (!preg_match($pattern, $str, $matches)) {
      return 0;
    }

    return self::strlen($matches[0]);
  }
4830
4831
  /**
4832
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
4833
   *
4834
   * @param string  $haystack      <p>The input string. Must be valid UTF-8.</p>
4835
   * @param string  $needle        <p>The string to look for. Must be valid UTF-8.</p>
4836
   * @param bool    $before_needle [optional] <p>
4837
   *                               If <b>TRUE</b>, strstr() returns the part of the
4838
   *                               haystack before the first occurrence of the needle (excluding the needle).
4839
   *                               </p>
4840
   * @param string  $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4841
   * @param boolean $cleanUtf8     [optional] <p>Clean non UTF-8 chars from the string.</p>
4842
   *
4843
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
4844
   */
4845 2
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Return the part of $haystack starting at (or, with $before_needle,
    // ending before) the first occurrence of $needle.
    if ($cleanUtf8 === true) {
      // clean both inputs first, so that invalid byte sequences
      // cannot influence the search
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // self::$support is filled lazily; make sure it is populated before the
    // 'mbstring' flag is read below (same guard as in strpos() / substr()).
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4868
4869
  /**
4870
   * Unicode transformation for case-less matching.
4871
   *
4872
   * @link http://unicode.org/reports/tr21/tr21-5.html
4873
   *
4874
   * @param string $str  <p>The input string.</p>
4875
   * @param bool   $full <p>
4876
   *                     <b>true</b> === replace full case folding chars + strtolower (default)<br />
4877
   *                     <b>false</b> use only $commonCaseFold +  strtolower
4878
   *                     </p>
4879
   *
4880
   * @return string
4881
   */
4882 11
  public static function strtocasefold($str, $full = true)
  {
    // Lookup tables are built once and cached for later calls.
    static $fullTable = null;
    static $commonFrom = null;
    static $commonTo = null;

    if (null === $commonFrom) {
      $commonFrom = array_keys(self::$commonCaseFold);
      $commonTo = array_values(self::$commonCaseFold);
    }

    // Step 1: always apply the common case-folding replacements.
    $str = str_replace($commonFrom, $commonTo, $str);

    // Step 2: optionally apply the full case-folding table as well.
    if ($full) {
      if (null === $fullTable) {
        $fullTable = self::getData('caseFolding_full');
      }

      /** @noinspection OffsetOperationsInspection */
      $str = str_replace($fullTable[0], $fullTable[1], $str);
    }

    // Step 3: clean the result and lowercase it.
    return self::strtolower(self::clean($str));
  }
4909
4910
  /**
4911
   * Make a string lowercase.
4912
   *
4913
   * @link http://php.net/manual/en/function.mb-strtolower.php
4914
   *
4915
   * @param string  $str       <p>The string being lowercased.</p>
4916
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function</p>
4917
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4918
   *
4919
   * @return string str with all alphabetic characters converted to lowercase.
4920
   */
4921 21 View Code Duplication
  public static function strtolower($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // Empty input: nothing to convert.
    if ($str === '') {
      return '';
    }

    // Optionally clean the input before handing it to mbstring.
    if (true === $cleanUtf8) {
      $str = self::clean($str);
    }

    // Anything other than the literal 'UTF-8' goes through normalize_encoding().
    $encoding = ($encoding === 'UTF-8') ? 'UTF-8' : self::normalize_encoding($encoding);

    return \mb_strtolower($str, $encoding);
  }
4942
4943
  /**
4944
   * Generic case sensitive transformation for collation matching.
4945
   *
4946
   * @param string $str <p>The input string</p>
4947
   *
4948
   * @return string
4949
   */
4950 3
  private static function strtonatfold($str)
  {
    // Normalize to NFD first, then remove every run of \p{Mn} (non-spacing mark) characters.
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4954
4955
  /**
4956
   * Make a string uppercase.
4957
   *
4958
   * @link http://php.net/manual/en/function.mb-strtoupper.php
4959
   *
4960
   * @param string  $str       <p>The string being uppercased.</p>
4961
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
4962
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
4963
   *
4964
   * @return string str with all alphabetic characters converted to uppercase.
4965
   */
4966 16 View Code Duplication
  public static function strtoupper($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // Empty input: nothing to convert.
    if ($str === '') {
      return '';
    }

    // Optionally clean the input before handing it to mbstring.
    if (true === $cleanUtf8) {
      $str = self::clean($str);
    }

    // Anything other than the literal 'UTF-8' goes through normalize_encoding().
    $encoding = ($encoding === 'UTF-8') ? 'UTF-8' : self::normalize_encoding($encoding);

    return \mb_strtoupper($str, $encoding);
  }
4986
4987
  /**
4988
   * Translate characters or replace sub-strings.
4989
   *
4990
   * @link  http://php.net/manual/en/function.strtr.php
4991
   *
4992
   * @param string          $str  <p>The string being translated.</p>
4993
   * @param string|string[] $from <p>The string replacing from.</p>
4994
   * @param string|string[] $to   <p>The string being translated to.</p>
4995
   *
4996
   * @return string <p>
4997
   *                This function returns a copy of str, translating all occurrences of each character in from to the
4998
   *                corresponding character in to.
4999
   *                </p>
5000
   */
5001 1
  public static function strtr($str, $from, $to = INF)
  {
    // Translate characters / sub-strings of $str. With three arguments the
    // i-th entry of $from is replaced by the i-th entry of $to; with two
    // arguments $from is passed straight through to PHP's strtr() as the
    // replacement map.
    if (INF !== $to) {
      // Both arguments may be given as string or as string[]: only a string
      // has to be split into its characters, an array is already a list of
      // search / replacement items and must not be sent through str_split().
      if (!is_array($from)) {
        $from = self::str_split($from);
      }
      if (!is_array($to)) {
        $to = self::str_split($to);
      }

      $countFrom = count($from);
      $countTo = count($to);

      // array_combine() needs lists of equal size, so drop the surplus
      // entries of whichever side is longer.
      if ($countFrom > $countTo) {
        $from = array_slice($from, 0, $countTo);
      } elseif ($countFrom < $countTo) {
        $to = array_slice($to, 0, $countFrom);
      }

      $from = array_combine($from, $to);
    }

    return strtr($str, $from);
  }
5020
5021
  /**
5022
   * Return the width of a string.
5023
   *
5024
   * @param string  $str       <p>The input string.</p>
5025
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
5026
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
5027
   *
5028
   * @return int
5029
   */
5030 1
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // Anything other than the literal 'UTF-8' goes through normalize_encoding().
    $encoding = ($encoding === 'UTF-8') ? 'UTF-8' : self::normalize_encoding($encoding);

    // iconv and mbstring are not tolerant to invalid encoding,
    // so the input can be cleaned on request before measuring it
    if (true === $cleanUtf8) {
      $str = self::clean($str);
    }

    $width = \mb_strwidth($str, $encoding);

    return $width;
  }
5045
5046
  /**
5047
   * Get part of a string.
5048
   *
5049
   * @link http://php.net/manual/en/function.mb-substr.php
5050
   *
5051
   * @param string  $str       <p>The string being checked.</p>
5052
   * @param int     $start     <p>The first position used in str.</p>
5053
   * @param int     $length    [optional] <p>The maximum length of the returned string.</p>
5054
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
5055
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
5056
   *
5057
   * @return string|false Returns a sub-string specified by the start and length parameters, or false if start is greater than the length of a non-empty str.
5058
   */
5059 47
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // Empty input always yields an empty string.
    if ($str === '') {
      return '';
    }

    // iconv and mbstring are not tolerant to invalid encoding
    // further, their behaviour is inconsistent with that of PHP's substr
    if (true === $cleanUtf8) {
      $str = self::clean($str);
    }

    // The character count is only needed for the range check on $start
    // and as the default for a missing $length.
    $charCount = 0;
    if ($start || $length === null) {
      $charCount = (int)self::strlen($str);
    }

    // A start position beyond the end of the string is reported as false.
    if ($start && $start > $charCount) {
      return false;
    }

    $length = ($length === null) ? $charCount : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    if ($encoding === 'UTF-8' || $encoding === true || $encoding === false) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return \iconv_substr($str, $start, $length, $encoding);
    }

    // fallback: split into an array of characters, extract the
    // relevant part, and join it to make a string again
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
5120
5121
  /**
5122
   * Binary safe comparison of two strings from an offset, up to length characters.
5123
   *
5124
   * @param string  $main_str           <p>The main string being compared.</p>
5125
   * @param string  $str                <p>The secondary string being compared.</p>
5126
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts counting from
5127
   *                                    the end of the string.</p>
5128
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the largest of
5129
   *                                    the length of the str compared to the length of main_str less the offset.</p>
5130
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
5131
   *                                    insensitive.</p>
5132
   *
5133
   * @return int
5134
   */
5135 1
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
5136
  {
5137 1
    $main_str = self::substr($main_str, $offset, $length);
5138 1
    $str = self::substr($str, 0, self::strlen($main_str));
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5137 can also be of type false; however, voku\helper\UTF8::strlen() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
5139
5140 1
    return $case_insensitivity === true ? self::strcasecmp($main_str, $str) : self::strcmp($main_str, $str);
0 ignored issues
show
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5137 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5138 can also be of type false; however, voku\helper\UTF8::strcasecmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $main_str defined by self::substr($main_str, $offset, $length) on line 5137 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
Security Bug introduced by
It seems like $str defined by self::substr($str, 0, self::strlen($main_str)) on line 5138 can also be of type false; however, voku\helper\UTF8::strcmp() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
5141
  }
5142
5143
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string   $haystack  <p>The string to search in.</p>
   * @param string   $needle    <p>The substring to search for.</p>
   * @param int      $offset    [optional] <p>The offset where to start counting.</p>
   * @param int|null $length    [optional] <p>
   *                            The maximum length after the specified offset to search for the
   *                            substring. If it is omitted (null), the search runs from the offset
   *                            to the end of the haystack.
   *                            </p>
   * @param string   $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean  $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>This functions returns an integer or false if there isn't a string.</p>
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      // keep a missing length as null: casting it to 0 would cut the haystack
      // down to an empty window and every offset-only call would count 0
      if ($length !== null) {
        $length = (int)$length;
      }

      // unchanged sanity check (a missing length still counts as 0 here)
      if ((int)$length + $offset <= 0) {
        return false;
      }

      // a null length is handed through to "UTF8::substr()", which is also how
      // "UTF8::ucfirst()" asks for "everything from the offset to the end"
      $window = self::substr($haystack, $offset, $length, $encoding);

      // "UTF8::substr()" can return false; nothing can be found in an empty window
      if ($window === false || (string)$window === '') {
        return 0;
      }

      $haystack = (string)$window;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($cleanUtf8 === true) {
      // "\mb_strpos" and "\iconv_strpos" returns wrong position,
      // if invalid characters are found in $haystack before $needle
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5194
5195
  /**
   * Replace text within a portion of a string.
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * If $str is an array, every element is processed on its own and an array is returned; scalar
   * $replacement / $start / $length values are then applied to every element, while array values
   * are matched element by element.
   *
   * @param string|string[] $str         <p>The input string or an array of strings.</p>
   * @param string|string[] $replacement <p>The replacement string or an array of strings.</p>
   * @param int|int[]       $start       <p>Character offset at which the replacement starts.</p>
   * @param int|int[]|null  $length      [optional] <p>Number of characters to replace. For a single string the
   *                                     default is "up to the end of the string"; for an array of strings
   *                                     the default is 0 (the replacement is inserted).</p>
   *
   * @return string|string[]
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // $replacement
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          // everything that is not a real integer falls back to offset 0
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // a single string can only take a single replacement: use the first one
    if (is_array($replacement)) {
      if (count($replacement) > 0) {
        $replacement = $replacement[0];
      } else {
        $replacement = '';
      }
    }

    // split both strings into arrays of UTF-8 characters
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    $strChars = isset($smatches[0]) ? $smatches[0] : array();
    $replacementChars = isset($rmatches[0]) ? $rmatches[0] : array();

    if ($length === null) {
      // count the characters we actually split, instead of asking "\mb_strlen()",
      // which depends on the internal mbstring encoding
      $length = count($strChars);
    }

    array_splice($strChars, $start, $length, $replacementChars);

    // separator first: the legacy "implode($array, $separator)" order is no longer accepted by PHP 8
    return implode('', $strChars);
  }
5270
5271
  /**
   * Returns a case swapped version of the string.
   *
   * Every non-whitespace character is looked at on its own: a character that equals its
   * upper case form is replaced by its lower case form, every other character by its upper case form.
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>Each character's case swapped.</p>
   */
  public static function swapCase($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($cleanUtf8 === true) {
      $str = self::clean($str);
    }

    $swapCharCase = function ($match) use ($encoding) {
      $char = $match[0];
      $upperChar = UTF8::strtoupper($char, $encoding);

      // already identical to its upper case form -> switch to lower case
      return $char === $upperChar ? UTF8::strtolower($char, $encoding) : $upperChar;
    };

    return preg_replace_callback('/[\S]/u', $swapCharCase, $str);
  }
5314
5315
  /**
   * Alias of "UTF8::to_ascii()": all arguments are handed over unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         <p>The input string.</p>
   * @param string $subst_chr <p>Passed on as the second argument ("$unknown") of "UTF8::to_ascii()".</p>
   * @param bool   $strict    <p>Passed on as the third argument of "UTF8::to_ascii()".</p>
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?', $strict = false)
  {
    $ascii = self::to_ascii($s, $subst_chr, $strict);

    return $ascii;
  }
5330
5331
  /**
   * Alias of "UTF8::to_iso8859()": the argument is handed over unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function toIso8859($str)
  {
    $converted = self::to_iso8859($str);

    return $converted;
  }
5344
5345
  /**
   * Alias of "UTF8::to_latin1()": the argument is handed over unchanged.
   *
   * @see UTF8::to_latin1()
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function toLatin1($str)
  {
    $converted = self::to_latin1($str);

    return $converted;
  }
5358
5359
  /**
   * Alias of "UTF8::to_utf8()": the argument is handed over unchanged.
   *
   * @see UTF8::to_utf8()
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function toUTF8($str)
  {
    $converted = self::to_utf8($str);

    return $converted;
  }
5372
5373
  /**
   * Convert a string into ASCII.
   *
   * The string is cleaned first; if it is not pure ASCII afterwards, every multi-byte character is
   * decoded to its code point and looked up in a per-"bank" table (code point >> 8) that is loaded on
   * demand from "data/xNN.php". Characters without a table entry are replaced by $unknown.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
   * @param bool   $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad
   *                        performance</p>
   *
   * @return string
   *
   * @throws \Exception <p>If $strict is true, but Intl is not available or PHP is older than 5.4.</p>
   */
  public static function to_ascii($str, $unknown = '?', $strict = false)
  {
    // lookup tables, cached for the lifetime of the process
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str, false, true, true);

    // check if we only have ASCII
    if (self::is_ascii($str) === true) {
      return $str;
    }

    if ($strict === true) {
      if (!isset(self::$support['already_checked_via_portable_utf8'])) {
        self::checkForSupport();
      }

      if (self::$support['intl'] == true && Bootup::is_php('5.4')) {
        $transliterated = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

        // "transliterator_transliterate()" returns false on failure:
        // keep the original string then and let the table lookup below handle it
        if (is_string($transliterated)) {
          $str = $transliterated;
        }

        // check again, if we only have ASCII, now ...
        if (self::is_ascii($str) === true) {
          return $str;
        }

      } else {
        throw new \Exception('Intl is not supported or you use PHP < 5.4!');
      }
    }

    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = $ar[0];
    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 <= 127) {
        continue;
      }

      // The lead byte tells how many bytes the sequence has and which bits of it are payload.
      // $ord is assigned freshly for every character, so a value from a previous
      // iteration can never leak into the lookup of the current one.
      if ($ordC0 >= 192 && $ordC0 <= 223) {
        $byteCount = 2;
        $ord = $ordC0 - 192;
      } elseif ($ordC0 >= 224 && $ordC0 <= 239) {
        $byteCount = 3;
        $ord = $ordC0 - 224;
      } elseif ($ordC0 >= 240 && $ordC0 <= 247) {
        $byteCount = 4;
        $ord = $ordC0 - 240;
      } elseif ($ordC0 >= 248 && $ordC0 <= 251) {
        $byteCount = 5;
        $ord = $ordC0 - 248;
      } elseif ($ordC0 >= 252 && $ordC0 <= 253) {
        $byteCount = 6;
        $ord = $ordC0 - 252;
      } else {
        // 128-191 (continuation byte as lead byte) and 254/255 are no valid lead bytes
        $c = $unknown;
        continue;
      }

      // truncated sequence: there is nothing sensible to decode
      if (!isset($c[$byteCount - 1])) {
        $c = $unknown;
        continue;
      }

      // every continuation byte contributes 6 bits
      for ($i = 1; $i < $byteCount; $i++) {
        $ord = $ord * 64 + (ord($c[$i]) - 128);
      }

      $bank = $ord >> 8;
      if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          // presumably the data file fills $UTF8_TO_ASCII[$bank] in this scope - verify against the data files
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        }

        // no data file, or the file did not provide the bank: remember an empty bank
        if (!isset($UTF8_TO_ASCII[$bank])) {
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    // break the reference, so that $chars can not be modified by accident later on
    unset($c);

    return implode('', $chars);
  }
5501
5502
  /**
   * Convert a string into "ISO-8859"-encoding (Latin-1).
   *
   * Arrays are converted element by element (keys are kept, nested arrays are handled recursively);
   * everything else is cast to a string and passed to "UTF8::utf8_decode()".
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function to_iso8859($str)
  {
    if (is_array($str)) {
      // with a single input array, "array_map()" preserves the keys
      return array_map(array(__CLASS__, 'to_iso8859'), $str);
    }

    $str = (string)$str;

    return $str === '' ? '' : self::utf8_decode($str);
  }
5530
5531
  /**
   * Alias of "UTF8::to_iso8859()": the argument is handed over unchanged.
   *
   * @see UTF8::to_iso8859()
   *
   * @param string|string[] $str
   *
   * @return string|string[]
   */
  public static function to_latin1($str)
  {
    $converted = self::to_iso8859($str);

    return $converted;
  }
5544
5545
  /**
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * - It decode UTF-8 codepoints and unicode escape sequences.
   *
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
   *
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   * is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * @param string|string[] $str <p>Any string or array.</p>
   *
   * @return string|string[] <p>The UTF-8 encoded string.</p>
   */
  public static function to_utf8($str)
  {
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];
      $ordC1 = ord($c1);

      // 0x00 - 0x7F: it doesn't need conversion
      if ($ordC1 < 0x80) {
        $buf .= $c1;
        continue;
      }

      // 0x80 - 0xBF: a continuation byte without a lead byte - needs conversion
      if ($ordC1 < 0xC0) {
        if (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$ordC1];
        } else {
          $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
        }
        continue;
      }

      // 0xC0 - 0xFF: should be converted to UTF8, if it's not UTF8 already;
      // the lead byte tells how many continuation bytes a valid sequence would need
      if ($ordC1 <= 0xDF) { // looks like 2 bytes UTF8
        $trailing = 1;
      } elseif ($ordC1 <= 0xEF) { // looks like 3 bytes UTF8
        $trailing = 2;
      } elseif ($ordC1 <= 0xF7) { // looks like 4 bytes UTF8
        $trailing = 3;
      } else { // doesn't look like UTF8, but should be converted
        $trailing = 0;
      }

      // collect the continuation bytes (0x80 - 0xBF); a missing byte at the
      // end of the string counts as an invalid one
      $sequence = $c1;
      $isUtf8 = ($trailing > 0);
      for ($j = 1; $isUtf8 && $j <= $trailing; $j++) {
        $pos = $i + $j;
        $ordNext = $pos < $max ? ord($str[$pos]) : 0;

        if ($ordNext >= 0x80 && $ordNext <= 0xBF) {
          $sequence .= $str[$pos];
        } else {
          $isUtf8 = false;
        }
      }

      if ($isUtf8) { // yeah, almost sure it's UTF8 already
        $buf .= $sequence;
        $i += $trailing;
      } else { // not valid UTF8 - convert this single byte from Latin-1 into its 2 bytes UTF8 form
        $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
      }
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode UTF-8 codepoints
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5673
5674
  /**
   * Strip whitespace or other characters from beginning or end of a UTF-8 string.
   *
   * INFO: This is slower than "trim()"
   *
   * We can only use the original function if the string and the chars contain 7-bit characters only,
   * but the check for ASCII (7-bit) costs more time than we could save here.
   *
   * Without $chars (or with an empty value) all leading and trailing characters from the unicode
   * categories "separator" (\pZ) and "other" (\pC) are removed.
   *
   * @param string $str   <p>The string to be trimmed</p>
   * @param string $chars [optional] <p>Optional characters to be stripped</p>
   *
   * @return string <p>The trimmed string.</p>
   */
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $hasCustomChars = !($chars === INF || !$chars);

    if ($hasCustomChars) {
      $leftTrimmed = self::ltrim($str, $chars);

      return self::rtrim($leftTrimmed, $chars);
    }

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
  }
5702
5703
  /**
   * Makes string's first char uppercase.
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function ucfirst($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // "UTF8::substr()" can return false instead of a string,
    // so both parts are normalized to strings before they are used
    $firstChar = self::substr($str, 0, 1, $encoding, $cleanUtf8);
    if ($firstChar === false) {
      $firstChar = '';
    }

    $firstCharUpper = self::strtoupper((string)$firstChar, $encoding, $cleanUtf8);

    // a null length selects everything after the first character
    $rest = self::substr($str, 1, null, $encoding, $cleanUtf8);
    if ($rest === false) {
      $rest = '';
    }

    return $firstCharUpper . $rest;
  }
5716
5717
  /**
   * Alias of "UTF8::ucfirst()": all arguments are handed over unchanged.
   *
   * @see UTF8::ucfirst()
   *
   * @param string  $word
   * @param string  $encoding
   * @param boolean $cleanUtf8
   *
   * @return string
   */
  public static function ucword($word, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $upperCased = self::ucfirst($word, $encoding, $cleanUtf8);

    return $upperCased;
  }
5732
5733
  /**
   * Uppercase for all words in the string.
   *
   * The string is split into words (runs of letters / $charlist chars, optionally joined by a dash
   * or an apostrophe) and the text between them; "UTF8::ucfirst()" is applied to every piece that
   * is not listed in $exceptions and the pieces are joined again.
   *
   * @param string   $str        <p>The input string.</p>
   * @param string[] $exceptions [optional] <p>Exclusion for some words.</p>
   * @param string   $charlist   [optional] <p>Additional chars that contains to words and do not start a new word.</p>
   * @param string   $encoding   [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean  $cleanUtf8  [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string
   */
  public static function ucwords($str, $exceptions = array(), $charlist = '', $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    // compare against the empty string: a loose "!$str" would also discard the string "0"
    if ($str === '') {
      return '';
    }

    $charlist = self::rxClass($charlist, '\pL');
    $words = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    // "preg_split()" returns false on failure: there are no pieces to work on then
    if ($words === false) {
      return '';
    }

    $useExceptions = count($exceptions) > 0;
    $newwords = array();

    foreach ($words as $word) {

      // skip the empty pieces only: a piece like "0" (e.g. in "a0b") has to be kept
      if ($word === '') {
        continue;
      }

      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $word = self::ucfirst($word, $encoding, $cleanUtf8);
      }

      $newwords[] = $word;
    }

    return implode('', $newwords);
  }
5783
5784
  /**
   * Multi decode html entity & fix urlencoded-win1252-chars.
   *
   * e.g:
   * 'D&#252;sseldorf'               => 'Düsseldorf'
   * 'D%FCsseldorf'                  => 'Düsseldorf'
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
   * 'DÃ¼sseldorf'                   => 'Düsseldorf'
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function urldecode($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // 1. percent-decode, then rewrite "%uXXXX" escapes as hexadecimal html entities
    $decoded = urldecode($str);
    $decoded = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', $decoded);

    // ENT_HTML5 is only used if "Bootup::is_php('5.4')" says so
    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    } else {
      $flags = ENT_QUOTES;
    }

    // 2. convert to UTF-8, decode the entities, percent-decode once more and repair the result
    $decoded = self::to_utf8($decoded);
    $decoded = self::html_entity_decode($decoded, $flags);
    $decoded = rawurldecode($decoded);
    $decoded = self::fix_simple_utf8($decoded);

    return (string)$decoded;
  }
5824
5825
  /**
   * Return the lookup table "percent-encoded Windows-1252 byte" => UTF-8 character.
   *
   * The table covers every byte from "%20" to "%FF" (keys use upper-case hex digits).
   *
   * "%7F" (DEL) and the five bytes that Windows-1252 leaves undefined
   * ("%81", "%8D", "%8F", "%90", "%9D") map to an empty string.
   *
   * @return array <p>Keys are percent-encoded bytes, values are UTF-8 strings.</p>
   */
  public static function urldecode_fix_win1252_chars()
  {
    static $map = array(
        '%20' => ' ',
        '%21' => '!',
        '%22' => '"',
        '%23' => '#',
        '%24' => '$',
        '%25' => '%',
        '%26' => '&',
        '%27' => "'",
        '%28' => '(',
        '%29' => ')',
        '%2A' => '*',
        '%2B' => '+',
        '%2C' => ',',
        '%2D' => '-',
        '%2E' => '.',
        '%2F' => '/',
        '%30' => '0',
        '%31' => '1',
        '%32' => '2',
        '%33' => '3',
        '%34' => '4',
        '%35' => '5',
        '%36' => '6',
        '%37' => '7',
        '%38' => '8',
        '%39' => '9',
        '%3A' => ':',
        '%3B' => ';',
        '%3C' => '<',
        '%3D' => '=',
        '%3E' => '>',
        '%3F' => '?',
        '%40' => '@',
        '%41' => 'A',
        '%42' => 'B',
        '%43' => 'C',
        '%44' => 'D',
        '%45' => 'E',
        '%46' => 'F',
        '%47' => 'G',
        '%48' => 'H',
        '%49' => 'I',
        '%4A' => 'J',
        '%4B' => 'K',
        '%4C' => 'L',
        '%4D' => 'M',
        '%4E' => 'N',
        '%4F' => 'O',
        '%50' => 'P',
        '%51' => 'Q',
        '%52' => 'R',
        '%53' => 'S',
        '%54' => 'T',
        '%55' => 'U',
        '%56' => 'V',
        '%57' => 'W',
        '%58' => 'X',
        '%59' => 'Y',
        '%5A' => 'Z',
        '%5B' => '[',
        '%5C' => '\\',
        '%5D' => ']',
        '%5E' => '^',
        '%5F' => '_',
        '%60' => '`',
        '%61' => 'a',
        '%62' => 'b',
        '%63' => 'c',
        '%64' => 'd',
        '%65' => 'e',
        '%66' => 'f',
        '%67' => 'g',
        '%68' => 'h',
        '%69' => 'i',
        '%6A' => 'j',
        '%6B' => 'k',
        '%6C' => 'l',
        '%6D' => 'm',
        '%6E' => 'n',
        '%6F' => 'o',
        '%70' => 'p',
        '%71' => 'q',
        '%72' => 'r',
        '%73' => 's',
        '%74' => 't',
        '%75' => 'u',
        '%76' => 'v',
        '%77' => 'w',
        '%78' => 'x',
        '%79' => 'y',
        '%7A' => 'z',
        '%7B' => '{',
        '%7C' => '|',
        '%7D' => '}',
        '%7E' => '~',
        '%7F' => '',
        '%80' => "\xe2\x82\xac", // EURO SIGN (was wrongly mapped to a backtick)
        '%81' => '', // undefined in Windows-1252
        '%82' => '‚',
        '%83' => 'ƒ',
        '%84' => '„',
        '%85' => '…',
        '%86' => '†',
        '%87' => '‡',
        '%88' => 'ˆ',
        '%89' => '‰',
        '%8A' => 'Š',
        '%8B' => '‹',
        '%8C' => 'Œ',
        '%8D' => '', // undefined in Windows-1252
        '%8E' => 'Ž',
        '%8F' => '', // undefined in Windows-1252
        '%90' => '', // undefined in Windows-1252
        '%91' => '‘',
        '%92' => '’',
        '%93' => '“',
        '%94' => '”',
        '%95' => '•',
        '%96' => '–',
        '%97' => '—',
        '%98' => '˜',
        '%99' => '™',
        '%9A' => 'š',
        '%9B' => '›',
        '%9C' => 'œ',
        '%9D' => '', // undefined in Windows-1252
        '%9E' => 'ž',
        '%9F' => 'Ÿ',
        '%A0' => "\xc2\xa0", // NO-BREAK SPACE, written as an escape because it is invisible
        '%A1' => '¡',
        '%A2' => '¢',
        '%A3' => '£',
        '%A4' => '¤',
        '%A5' => '¥',
        '%A6' => '¦',
        '%A7' => '§',
        '%A8' => '¨',
        '%A9' => '©',
        '%AA' => 'ª',
        '%AB' => '«',
        '%AC' => '¬',
        '%AD' => "\xc2\xad", // SOFT HYPHEN, written as an escape because it is invisible
        '%AE' => '®',
        '%AF' => '¯',
        '%B0' => '°',
        '%B1' => '±',
        '%B2' => '²',
        '%B3' => '³',
        '%B4' => '´',
        '%B5' => 'µ',
        '%B6' => '¶',
        '%B7' => '·',
        '%B8' => '¸',
        '%B9' => '¹',
        '%BA' => 'º',
        '%BB' => '»',
        '%BC' => '¼',
        '%BD' => '½',
        '%BE' => '¾',
        '%BF' => '¿',
        '%C0' => 'À',
        '%C1' => 'Á',
        '%C2' => 'Â',
        '%C3' => 'Ã',
        '%C4' => 'Ä',
        '%C5' => 'Å',
        '%C6' => 'Æ',
        '%C7' => 'Ç',
        '%C8' => 'È',
        '%C9' => 'É',
        '%CA' => 'Ê',
        '%CB' => 'Ë',
        '%CC' => 'Ì',
        '%CD' => 'Í',
        '%CE' => 'Î',
        '%CF' => 'Ï',
        '%D0' => 'Ð',
        '%D1' => 'Ñ',
        '%D2' => 'Ò',
        '%D3' => 'Ó',
        '%D4' => 'Ô',
        '%D5' => 'Õ',
        '%D6' => 'Ö',
        '%D7' => '×',
        '%D8' => 'Ø',
        '%D9' => 'Ù',
        '%DA' => 'Ú',
        '%DB' => 'Û',
        '%DC' => 'Ü',
        '%DD' => 'Ý',
        '%DE' => 'Þ',
        '%DF' => 'ß',
        '%E0' => 'à',
        '%E1' => 'á',
        '%E2' => 'â',
        '%E3' => 'ã',
        '%E4' => 'ä',
        '%E5' => 'å',
        '%E6' => 'æ',
        '%E7' => 'ç',
        '%E8' => 'è',
        '%E9' => 'é',
        '%EA' => 'ê',
        '%EB' => 'ë',
        '%EC' => 'ì',
        '%ED' => 'í',
        '%EE' => 'î',
        '%EF' => 'ï',
        '%F0' => 'ð',
        '%F1' => 'ñ',
        '%F2' => 'ò',
        '%F3' => 'ó',
        '%F4' => 'ô',
        '%F5' => 'õ',
        '%F6' => 'ö',
        '%F7' => '÷',
        '%F8' => 'ø',
        '%F9' => 'ù',
        '%FA' => 'ú',
        '%FB' => 'û',
        '%FC' => 'ü',
        '%FD' => 'ý',
        '%FE' => 'þ',
        '%FF' => 'ÿ',
    );

    return $map;
  }
6061
6062
  /**
   * Convert an UTF-8 string to ISO-8859-1.
   *
   * The input is first normalised with to_utf8(); every key of the
   * class table $utf8ToWin1252 is then replaced by its mapped value
   * before the string is handed to Xml::utf8_decode().
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The decoded string, or an empty string for empty input.</p>
   */
  public static function utf8_decode($str)
  {
    // Search/replace lists are derived from the class table once and then reused.
    static $search = null;
    static $replace = null;

    $input = (string)$str;

    if ('' === $input) {
      return '';
    }

    $utf8 = self::to_utf8($input);

    if (null === $search) {
      $search = array_keys(self::$utf8ToWin1252);
      $replace = array_values(self::$utf8ToWin1252);
    }

    $prepared = str_replace($search, $replace, $utf8);

    return Xml::utf8_decode($prepared);
  }
6090
6091
  /**
   * Convert an ISO-8859-1 string to UTF-8.
   *
   * After PHP's native utf8_encode(), any result containing the byte 0xC2
   * is additionally passed through the class table $cp1252ToUtf8
   * (each key replaced by its value).
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The UTF-8 encoded string.</p>
   */
  public static function utf8_encode($str)
  {
    // Search/replace lists are derived from the class table once and then reused.
    static $search = null;
    static $replace = null;

    $encoded = \utf8_encode($str);

    // Without a 0xC2 byte the table replacement is skipped entirely.
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if (null === $search) {
      $search = array_keys(self::$cp1252ToUtf8);
      $replace = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($search, $replace, $encoded);
  }
6117
6118
  /**
   * Alias of fix_simple_utf8(), kept for backward compatibility.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Whatever fix_simple_utf8() returns for the given input.</p>
   *
   * @deprecated use "UTF8::fix_simple_utf8()"
   */
  public static function utf8_fix_win1252_chars($str)
  {
    // Pure delegation, no extra processing happens here.
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
6131
6132
  /**
   * Get the table of all known UTF-8 whitespace characters.
   *
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
   *
   * @author: Derek E.
   *
   * @return array <p>
   *               The whitespace characters as values, keyed by the type of whitespace
   *               as defined in the URL above.
   *               </p>
   */
  public static function whitespace_table()
  {
    // The table is stored as a static class property and returned unchanged.
    $table = self::$whitespaceTable;

    return $table;
  }
6148
6149
  /**
   * Truncate a string after a maximum number of words.
   *
   * A "word" is a run of non-whitespace characters. If the string has no more
   * words than the limit, it is returned unchanged.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $words    <p>The maximum number of words to keep.</p>
   * @param string $strAddOn <p>Appended to the result when the string was stripped.</p>
   *
   * @return string <p>Empty string if the input is empty or the limit is below 1.</p>
   */
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;
    if (!isset($str[0])) {
      return '';
    }

    $words = (int)$words;
    if ($words < 1) {
      return '';
    }

    // Optional leading whitespace, then 1..$words groups of
    // "non-whitespace run + trailing whitespace".
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $words . '}/u';
    preg_match($pattern, $str, $found);

    // No match at all: hand the input back untouched.
    if (!isset($found[0])) {
      return $str;
    }

    $head = $found[0];

    // The match spans the whole string, so nothing has to be cut off.
    if (self::strlen($str) === self::strlen($head)) {
      return $str;
    }

    return self::rtrim($head) . $strAddOn;
  }
6184
6185
  /**
   * Wrap a string to a given number of characters.
   *
   * The method builds a single-byte "mask" of the input (one byte per
   * character), lets PHP's native wordwrap() pick the break positions on that
   * mask, and then replays those positions onto the real characters.
   *
   * @link  http://php.net/manual/en/function.wordwrap.php
   *
   * @param string $str   <p>The input string.</p>
   * @param int    $width [optional] <p>The column width.</p>
   * @param string $break [optional] <p>The string inserted at every line break.</p>
   * @param bool   $cut   [optional] <p>
   *                      If set to true, the string is always wrapped at or before
   *                      the specified width, so a word longer than the width
   *                      is broken apart.
   *                      </p>
   *
   * @return string <p>
   *                The string wrapped at the specified column;
   *                an empty string if $str or $break is empty.
   *                </p>
   */
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if (!isset($str[0], $break[0])) {
      return '';
    }

    // $chars holds the real pieces, $mask one byte per piece:
    // '#' for an existing break, ' ' for a space, '?' for anything else.
    $mask = '';
    $chars = array();

    foreach (explode($break, $str) as $segmentIndex => $segment) {

      if ($segmentIndex > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        $mask .= (' ' === $char) ? ' ' : '?';
      }
    }

    // Let the native function decide where to break, using '#' as the marker.
    $mask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $charIndex = 0;
    $maskPos = -1;
    $breakPos = -1;

    while (false !== ($breakPos = self::strpos($mask, '#', $breakPos + 1))) {

      // Copy every character that sits in front of the current break marker.
      for (++$maskPos; $maskPos < $breakPos; ++$maskPos) {
        $wrapped .= $chars[$charIndex];
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      // A marker that took the place of a space or of an original break
      // consumes that piece; a marker inserted while cutting a word does not.
      if ($break === $chars[$charIndex] || ' ' === $chars[$charIndex]) {
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      $wrapped .= $break;
    }

    // Whatever is left belongs to the last line.
    return $wrapped . implode('', $chars);
  }
6252
6253
  /**
   * Get the list of Unicode White Space characters.
   *
   * @return array <p>The White Space characters as values, keyed by their numeric code point.</p>
   */
  public static function ws()
  {
    // The list is stored as a static class property and returned unchanged.
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6262
6263
}
6264