Completed
Push — master ( c390d7...9e4019 )
by Lars
04:05
created

UTF8::split()   C

Complexity

Conditions 23
Paths 41

Size

Total Lines 73
Code Lines 38

Duplication

Lines 12
Ratio 16.44 %

Code Coverage

Tests 52
CRAP Score 23

Importance

Changes 6
Bugs 1 Features 4
Metric Value
c 6
b 1
f 4
dl 12
loc 73
ccs 52
cts 52
cp 1
rs 5.3464
cc 23
eloc 38
nc 41
nop 3
crap 23

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a commented section of a method's body into its own, well-named method.

1
<?php
2
3
namespace voku\helper;
4
5
use Symfony\Polyfill\Intl\Grapheme\Grapheme;
6
use Symfony\Polyfill\Xml\Xml;
7
8
/**
9
 * UTF8-Helper-Class
10
 *
11
 * @package voku\helper
12
 */
13
final class UTF8
14
{
15
  /**
16
   * @var array
17
   */
18
  private static $win1252ToUtf8 = array(
19
      128 => "\xe2\x82\xac", // EURO SIGN
20
      130 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
21
      131 => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
22
      132 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
23
      133 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
24
      134 => "\xe2\x80\xa0", // DAGGER
25
      135 => "\xe2\x80\xa1", // DOUBLE DAGGER
26
      136 => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
27
      137 => "\xe2\x80\xb0", // PER MILLE SIGN
28
      138 => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
29
      139 => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
30
      140 => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
31
      142 => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
32
      145 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
33
      146 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
34
      147 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
35
      148 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
36
      149 => "\xe2\x80\xa2", // BULLET
37
      150 => "\xe2\x80\x93", // EN DASH
38
      151 => "\xe2\x80\x94", // EM DASH
39
      152 => "\xcb\x9c", // SMALL TILDE
40
      153 => "\xe2\x84\xa2", // TRADE MARK SIGN
41
      154 => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
42
      155 => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
43
      156 => "\xc5\x93", // LATIN SMALL LIGATURE OE
44
      158 => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
45
      159 => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
46
  );
47
48
  /**
49
   * @var array
50
   */
51
  private static $cp1252ToUtf8 = array(
52
      '€' => '€',
53
      '‚' => '‚',
54
      'ƒ' => 'ƒ',
55
      '„' => '„',
56
      '…' => '…',
57
      '†' => '†',
58
      '‡' => '‡',
59
      'ˆ' => 'ˆ',
60
      '‰' => '‰',
61
      'Š' => 'Š',
62
      '‹' => '‹',
63
      'Œ' => 'Œ',
64
      'Ž' => 'Ž',
65
      '‘' => '‘',
66
      '’' => '’',
67
      '“' => '“',
68
      '”' => '”',
69
      '•' => '•',
70
      '–' => '–',
71
      '—' => '—',
72
      '˜' => '˜',
73
      '™' => '™',
74
      'š' => 'š',
75
      '›' => '›',
76
      'œ' => 'œ',
77
      'ž' => 'ž',
78
      'Ÿ' => 'Ÿ',
79
  );
80
81
  /**
   * Bom => Byte-Length
   *
   * INFO: https://en.wikipedia.org/wiki/Byte_order_mark
   *
   * Each key is a byte-order-mark sequence, each value the number of bytes of that key.
   *
   * The "read as WINDOWS-1252" entries are the BOM bytes after every single byte was
   * mis-interpreted as a WINDOWS-1252 character and stored as UTF-8 again, which is why
   * they are twice as long as the real BOM. All keys are written as byte escapes so that
   * they neither depend on the encoding this source file is saved in nor get lost when
   * an editor or tool strips invisible / non-ASCII characters.
   *
   * @var array
   */
  private static $bom = array(
      "\xef\xbb\xbf"             => 3, // UTF-8 BOM
      "\xc3\xaf\xc2\xbb\xc2\xbf" => 6, // UTF-8 BOM read as WINDOWS-1252 (one char may take more than one byte)
      "\x00\x00\xfe\xff"         => 4, // UTF-32 (BE) BOM
      "\xff\xfe\x00\x00"         => 4, // UTF-32 (LE) BOM
      "\xfe\xff"                 => 2, // UTF-16 (BE) BOM
      "\xc3\xbe\xc3\xbf"         => 4, // UTF-16 (BE) BOM read as WINDOWS-1252
      "\xff\xfe"                 => 2, // UTF-16 (LE) BOM
      "\xc3\xbf\xc3\xbe"         => 4, // UTF-16 (LE) BOM read as WINDOWS-1252
  );
98
99
  /**
100
   * Numeric code point => UTF-8 Character
101
   *
102
   * url: http://www.w3schools.com/charsets/ref_utf_punctuation.asp
103
   *
104
   * @var array
105
   */
106
  private static $whitespace = array(
107
    // NUL Byte
108
    0     => "\x0",
109
    // Tab
110
    9     => "\x9",
111
    // New Line
112
    10    => "\xa",
113
    // Vertical Tab
114
    11    => "\xb",
115
    // Carriage Return
116
    13    => "\xd",
117
    // Ordinary Space
118
    32    => "\x20",
119
    // NO-BREAK SPACE
120
    160   => "\xc2\xa0",
121
    // OGHAM SPACE MARK
122
    5760  => "\xe1\x9a\x80",
123
    // MONGOLIAN VOWEL SEPARATOR
124
    6158  => "\xe1\xa0\x8e",
125
    // EN QUAD
126
    8192  => "\xe2\x80\x80",
127
    // EM QUAD
128
    8193  => "\xe2\x80\x81",
129
    // EN SPACE
130
    8194  => "\xe2\x80\x82",
131
    // EM SPACE
132
    8195  => "\xe2\x80\x83",
133
    // THREE-PER-EM SPACE
134
    8196  => "\xe2\x80\x84",
135
    // FOUR-PER-EM SPACE
136
    8197  => "\xe2\x80\x85",
137
    // SIX-PER-EM SPACE
138
    8198  => "\xe2\x80\x86",
139
    // FIGURE SPACE
140
    8199  => "\xe2\x80\x87",
141
    // PUNCTUATION SPACE
142
    8200  => "\xe2\x80\x88",
143
    // THIN SPACE
144
    8201  => "\xe2\x80\x89",
145
    //HAIR SPACE
146
    8202  => "\xe2\x80\x8a",
147
    // LINE SEPARATOR
148
    8232  => "\xe2\x80\xa8",
149
    // PARAGRAPH SEPARATOR
150
    8233  => "\xe2\x80\xa9",
151
    // NARROW NO-BREAK SPACE
152
    8239  => "\xe2\x80\xaf",
153
    // MEDIUM MATHEMATICAL SPACE
154
    8287  => "\xe2\x81\x9f",
155
    // IDEOGRAPHIC SPACE
156
    12288 => "\xe3\x80\x80",
157
  );
158
159
  /**
160
   * @var array
161
   */
162
  private static $whitespaceTable = array(
163
      'SPACE'                     => "\x20",
164
      'NO-BREAK SPACE'            => "\xc2\xa0",
165
      'OGHAM SPACE MARK'          => "\xe1\x9a\x80",
166
      'EN QUAD'                   => "\xe2\x80\x80",
167
      'EM QUAD'                   => "\xe2\x80\x81",
168
      'EN SPACE'                  => "\xe2\x80\x82",
169
      'EM SPACE'                  => "\xe2\x80\x83",
170
      'THREE-PER-EM SPACE'        => "\xe2\x80\x84",
171
      'FOUR-PER-EM SPACE'         => "\xe2\x80\x85",
172
      'SIX-PER-EM SPACE'          => "\xe2\x80\x86",
173
      'FIGURE SPACE'              => "\xe2\x80\x87",
174
      'PUNCTUATION SPACE'         => "\xe2\x80\x88",
175
      'THIN SPACE'                => "\xe2\x80\x89",
176
      'HAIR SPACE'                => "\xe2\x80\x8a",
177
      'LINE SEPARATOR'            => "\xe2\x80\xa8",
178
      'PARAGRAPH SEPARATOR'       => "\xe2\x80\xa9",
179
      'ZERO WIDTH SPACE'          => "\xe2\x80\x8b",
180
      'NARROW NO-BREAK SPACE'     => "\xe2\x80\xaf",
181
      'MEDIUM MATHEMATICAL SPACE' => "\xe2\x81\x9f",
182
      'IDEOGRAPHIC SPACE'         => "\xe3\x80\x80",
183
  );
184
185
  /**
186
   * bidirectional text chars
187
   *
188
   * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
189
   *
190
   * @var array
191
   */
192
  private static $bidiUniCodeControlsTable = array(
193
    // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
194
    8234 => "\xE2\x80\xAA",
195
    // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
196
    8235 => "\xE2\x80\xAB",
197
    // POP DIRECTIONAL FORMATTING // (use -> </bdo>)
198
    8236 => "\xE2\x80\xAC",
199
    // LEFT-TO-RIGHT OVERRIDE // (use -> <bdo dir = "ltr">)
200
    8237 => "\xE2\x80\xAD",
201
    // RIGHT-TO-LEFT OVERRIDE // (use -> <bdo dir = "rtl">)
202
    8238 => "\xE2\x80\xAE",
203
    // LEFT-TO-RIGHT ISOLATE // (use -> dir = "ltr")
204
    8294 => "\xE2\x81\xA6",
205
    // RIGHT-TO-LEFT ISOLATE // (use -> dir = "rtl")
206
    8295 => "\xE2\x81\xA7",
207
    // FIRST STRONG ISOLATE // (use -> dir = "auto")
208
    8296 => "\xE2\x81\xA8",
209
    // POP DIRECTIONAL ISOLATE
210
    8297 => "\xE2\x81\xA9",
211
  );
212
213
  /**
214
   * @var array
215
   */
216
  private static $commonCaseFold = array(
217
      'ſ'            => 's',
218
      "\xCD\x85"     => 'ι',
219
      'ς'            => 'σ',
220
      "\xCF\x90"     => 'β',
221
      "\xCF\x91"     => 'θ',
222
      "\xCF\x95"     => 'φ',
223
      "\xCF\x96"     => 'π',
224
      "\xCF\xB0"     => 'κ',
225
      "\xCF\xB1"     => 'ρ',
226
      "\xCF\xB5"     => 'ε',
227
      "\xE1\xBA\x9B" => "\xE1\xB9\xA1",
228
      "\xE1\xBE\xBE" => 'ι',
229
  );
230
231
  /**
232
   * @var array
233
   */
234
  private static $brokenUtf8ToUtf8 = array(
235
      "\xc2\x80" => "\xe2\x82\xac", // EURO SIGN
236
      "\xc2\x82" => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
237
      "\xc2\x83" => "\xc6\x92", // LATIN SMALL LETTER F WITH HOOK
238
      "\xc2\x84" => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
239
      "\xc2\x85" => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
240
      "\xc2\x86" => "\xe2\x80\xa0", // DAGGER
241
      "\xc2\x87" => "\xe2\x80\xa1", // DOUBLE DAGGER
242
      "\xc2\x88" => "\xcb\x86", // MODIFIER LETTER CIRCUMFLEX ACCENT
243
      "\xc2\x89" => "\xe2\x80\xb0", // PER MILLE SIGN
244
      "\xc2\x8a" => "\xc5\xa0", // LATIN CAPITAL LETTER S WITH CARON
245
      "\xc2\x8b" => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
246
      "\xc2\x8c" => "\xc5\x92", // LATIN CAPITAL LIGATURE OE
247
      "\xc2\x8e" => "\xc5\xbd", // LATIN CAPITAL LETTER Z WITH CARON
248
      "\xc2\x91" => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
249
      "\xc2\x92" => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
250
      "\xc2\x93" => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
251
      "\xc2\x94" => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
252
      "\xc2\x95" => "\xe2\x80\xa2", // BULLET
253
      "\xc2\x96" => "\xe2\x80\x93", // EN DASH
254
      "\xc2\x97" => "\xe2\x80\x94", // EM DASH
255
      "\xc2\x98" => "\xcb\x9c", // SMALL TILDE
256
      "\xc2\x99" => "\xe2\x84\xa2", // TRADE MARK SIGN
257
      "\xc2\x9a" => "\xc5\xa1", // LATIN SMALL LETTER S WITH CARON
258
      "\xc2\x9b" => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
259
      "\xc2\x9c" => "\xc5\x93", // LATIN SMALL LIGATURE OE
260
      "\xc2\x9e" => "\xc5\xbe", // LATIN SMALL LETTER Z WITH CARON
261
      "\xc2\x9f" => "\xc5\xb8", // LATIN CAPITAL LETTER Y WITH DIAERESIS
262
      'ü'       => 'ü',
263
      'ä'       => 'ä',
264
      'ö'       => 'ö',
265
      'Ö'       => 'Ö',
266
      'ß'       => 'ß',
267
      'Ã '       => 'à',
268
      'á'       => 'á',
269
      'â'       => 'â',
270
      'ã'       => 'ã',
271
      'ù'       => 'ù',
272
      'ú'       => 'ú',
273
      'û'       => 'û',
274
      'Ù'       => 'Ù',
275
      'Ú'       => 'Ú',
276
      'Û'       => 'Û',
277
      'Ü'       => 'Ü',
278
      'ò'       => 'ò',
279
      'ó'       => 'ó',
280
      'ô'       => 'ô',
281
      'è'       => 'è',
282
      'é'       => 'é',
283
      'ê'       => 'ê',
284
      'ë'       => 'ë',
285
      'À'       => 'À',
286
      'Á'       => 'Á',
287
      'Â'       => 'Â',
288
      'Ã'       => 'Ã',
289
      'Ä'       => 'Ä',
290
      'Ã…'       => 'Å',
291
      'Ç'       => 'Ç',
292
      'È'       => 'È',
293
      'É'       => 'É',
294
      'Ê'       => 'Ê',
295
      'Ë'       => 'Ë',
296
      'ÃŒ'       => 'Ì',
297
      'Í'       => 'Í',
298
      'ÃŽ'       => 'Î',
299
      'Ï'       => 'Ï',
300
      'Ñ'       => 'Ñ',
301
      'Ã’'       => 'Ò',
302
      'Ó'       => 'Ó',
303
      'Ô'       => 'Ô',
304
      'Õ'       => 'Õ',
305
      'Ø'       => 'Ø',
306
      'Ã¥'       => 'å',
307
      'æ'       => 'æ',
308
      'ç'       => 'ç',
309
      'ì'       => 'ì',
310
      'í'       => 'í',
311
      'î'       => 'î',
312
      'ï'       => 'ï',
313
      'ð'       => 'ð',
314
      'ñ'       => 'ñ',
315
      'õ'       => 'õ',
316
      'ø'       => 'ø',
317
      'ý'       => 'ý',
318
      'ÿ'       => 'ÿ',
319
      '€'      => '€',
320
  );
321
322
  /**
323
   * @var array
324
   */
325
  private static $utf8ToWin1252 = array(
326
      "\xe2\x82\xac" => "\x80", // EURO SIGN
327
      "\xe2\x80\x9a" => "\x82", // SINGLE LOW-9 QUOTATION MARK
328
      "\xc6\x92"     => "\x83", // LATIN SMALL LETTER F WITH HOOK
329
      "\xe2\x80\x9e" => "\x84", // DOUBLE LOW-9 QUOTATION MARK
330
      "\xe2\x80\xa6" => "\x85", // HORIZONTAL ELLIPSIS
331
      "\xe2\x80\xa0" => "\x86", // DAGGER
332
      "\xe2\x80\xa1" => "\x87", // DOUBLE DAGGER
333
      "\xcb\x86"     => "\x88", // MODIFIER LETTER CIRCUMFLEX ACCENT
334
      "\xe2\x80\xb0" => "\x89", // PER MILLE SIGN
335
      "\xc5\xa0"     => "\x8a", // LATIN CAPITAL LETTER S WITH CARON
336
      "\xe2\x80\xb9" => "\x8b", // SINGLE LEFT-POINTING ANGLE QUOTE
337
      "\xc5\x92"     => "\x8c", // LATIN CAPITAL LIGATURE OE
338
      "\xc5\xbd"     => "\x8e", // LATIN CAPITAL LETTER Z WITH CARON
339
      "\xe2\x80\x98" => "\x91", // LEFT SINGLE QUOTATION MARK
340
      "\xe2\x80\x99" => "\x92", // RIGHT SINGLE QUOTATION MARK
341
      "\xe2\x80\x9c" => "\x93", // LEFT DOUBLE QUOTATION MARK
342
      "\xe2\x80\x9d" => "\x94", // RIGHT DOUBLE QUOTATION MARK
343
      "\xe2\x80\xa2" => "\x95", // BULLET
344
      "\xe2\x80\x93" => "\x96", // EN DASH
345
      "\xe2\x80\x94" => "\x97", // EM DASH
346
      "\xcb\x9c"     => "\x98", // SMALL TILDE
347
      "\xe2\x84\xa2" => "\x99", // TRADE MARK SIGN
348
      "\xc5\xa1"     => "\x9a", // LATIN SMALL LETTER S WITH CARON
349
      "\xe2\x80\xba" => "\x9b", // SINGLE RIGHT-POINTING ANGLE QUOTE
350
      "\xc5\x93"     => "\x9c", // LATIN SMALL LIGATURE OE
351
      "\xc5\xbe"     => "\x9e", // LATIN SMALL LETTER Z WITH CARON
352
      "\xc5\xb8"     => "\x9f", // LATIN CAPITAL LETTER Y WITH DIAERESIS
353
  );
354
355
  /**
356
   * @var array
357
   */
358
  private static $utf8MSWord = array(
359
      "\xc2\xab"     => '"', // « (U+00AB) in UTF-8
360
      "\xc2\xbb"     => '"', // » (U+00BB) in UTF-8
361
      "\xe2\x80\x98" => "'", // ‘ (U+2018) in UTF-8
362
      "\xe2\x80\x99" => "'", // ’ (U+2019) in UTF-8
363
      "\xe2\x80\x9a" => "'", // ‚ (U+201A) in UTF-8
364
      "\xe2\x80\x9b" => "'", // ‛ (U+201B) in UTF-8
365
      "\xe2\x80\x9c" => '"', // “ (U+201C) in UTF-8
366
      "\xe2\x80\x9d" => '"', // ” (U+201D) in UTF-8
367
      "\xe2\x80\x9e" => '"', // „ (U+201E) in UTF-8
368
      "\xe2\x80\x9f" => '"', // ‟ (U+201F) in UTF-8
369
      "\xe2\x80\xb9" => "'", // ‹ (U+2039) in UTF-8
370
      "\xe2\x80\xba" => "'", // › (U+203A) in UTF-8
371
      "\xe2\x80\x93" => '-', // – (U+2013) in UTF-8
372
      "\xe2\x80\x94" => '-', // — (U+2014) in UTF-8
373
      "\xe2\x80\xa6" => '...' // … (U+2026) in UTF-8
374
  );
375
376
  private static $iconvEncoding = array(
377
      'ANSI_X3.4-1968',
378
      'ANSI_X3.4-1986',
379
      'ASCII',
380
      'CP367',
381
      'IBM367',
382
      'ISO-IR-6',
383
      'ISO646-US',
384
      'ISO_646.IRV:1991',
385
      'US',
386
      'US-ASCII',
387
      'CSASCII',
388
      'UTF-8',
389
      'ISO-10646-UCS-2',
390
      'UCS-2',
391
      'CSUNICODE',
392
      'UCS-2BE',
393
      'UNICODE-1-1',
394
      'UNICODEBIG',
395
      'CSUNICODE11',
396
      'UCS-2LE',
397
      'UNICODELITTLE',
398
      'ISO-10646-UCS-4',
399
      'UCS-4',
400
      'CSUCS4',
401
      'UCS-4BE',
402
      'UCS-4LE',
403
      'UTF-16',
404
      'UTF-16BE',
405
      'UTF-16LE',
406
      'UTF-32',
407
      'UTF-32BE',
408
      'UTF-32LE',
409
      'UNICODE-1-1-UTF-7',
410
      'UTF-7',
411
      'CSUNICODE11UTF7',
412
      'UCS-2-INTERNAL',
413
      'UCS-2-SWAPPED',
414
      'UCS-4-INTERNAL',
415
      'UCS-4-SWAPPED',
416
      'C99',
417
      'JAVA',
418
      'CP819',
419
      'IBM819',
420
      'ISO-8859-1',
421
      'ISO-IR-100',
422
      'ISO8859-1',
423
      'ISO_8859-1',
424
      'ISO_8859-1:1987',
425
      'L1',
426
      'LATIN1',
427
      'CSISOLATIN1',
428
      'ISO-8859-2',
429
      'ISO-IR-101',
430
      'ISO8859-2',
431
      'ISO_8859-2',
432
      'ISO_8859-2:1987',
433
      'L2',
434
      'LATIN2',
435
      'CSISOLATIN2',
436
      'ISO-8859-3',
437
      'ISO-IR-109',
438
      'ISO8859-3',
439
      'ISO_8859-3',
440
      'ISO_8859-3:1988',
441
      'L3',
442
      'LATIN3',
443
      'CSISOLATIN3',
444
      'ISO-8859-4',
445
      'ISO-IR-110',
446
      'ISO8859-4',
447
      'ISO_8859-4',
448
      'ISO_8859-4:1988',
449
      'L4',
450
      'LATIN4',
451
      'CSISOLATIN4',
452
      'CYRILLIC',
453
      'ISO-8859-5',
454
      'ISO-IR-144',
455
      'ISO8859-5',
456
      'ISO_8859-5',
457
      'ISO_8859-5:1988',
458
      'CSISOLATINCYRILLIC',
459
      'ARABIC',
460
      'ASMO-708',
461
      'ECMA-114',
462
      'ISO-8859-6',
463
      'ISO-IR-127',
464
      'ISO8859-6',
465
      'ISO_8859-6',
466
      'ISO_8859-6:1987',
467
      'CSISOLATINARABIC',
468
      'ECMA-118',
469
      'ELOT_928',
470
      'GREEK',
471
      'GREEK8',
472
      'ISO-8859-7',
473
      'ISO-IR-126',
474
      'ISO8859-7',
475
      'ISO_8859-7',
476
      'ISO_8859-7:1987',
477
      'ISO_8859-7:2003',
478
      'CSISOLATINGREEK',
479
      'HEBREW',
480
      'ISO-8859-8',
481
      'ISO-IR-138',
482
      'ISO8859-8',
483
      'ISO_8859-8',
484
      'ISO_8859-8:1988',
485
      'CSISOLATINHEBREW',
486
      'ISO-8859-9',
487
      'ISO-IR-148',
488
      'ISO8859-9',
489
      'ISO_8859-9',
490
      'ISO_8859-9:1989',
491
      'L5',
492
      'LATIN5',
493
      'CSISOLATIN5',
494
      'ISO-8859-10',
495
      'ISO-IR-157',
496
      'ISO8859-10',
497
      'ISO_8859-10',
498
      'ISO_8859-10:1992',
499
      'L6',
500
      'LATIN6',
501
      'CSISOLATIN6',
502
      'ISO-8859-11',
503
      'ISO8859-11',
504
      'ISO_8859-11',
505
      'ISO-8859-13',
506
      'ISO-IR-179',
507
      'ISO8859-13',
508
      'ISO_8859-13',
509
      'L7',
510
      'LATIN7',
511
      'ISO-8859-14',
512
      'ISO-CELTIC',
513
      'ISO-IR-199',
514
      'ISO8859-14',
515
      'ISO_8859-14',
516
      'ISO_8859-14:1998',
517
      'L8',
518
      'LATIN8',
519
      'ISO-8859-15',
520
      'ISO-IR-203',
521
      'ISO8859-15',
522
      'ISO_8859-15',
523
      'ISO_8859-15:1998',
524
      'LATIN-9',
525
      'ISO-8859-16',
526
      'ISO-IR-226',
527
      'ISO8859-16',
528
      'ISO_8859-16',
529
      'ISO_8859-16:2001',
530
      'L10',
531
      'LATIN10',
532
      'KOI8-R',
533
      'CSKOI8R',
534
      'KOI8-U',
535
      'KOI8-RU',
536
      'CP1250',
537
      'MS-EE',
538
      'WINDOWS-1250',
539
      'CP1251',
540
      'MS-CYRL',
541
      'WINDOWS-1251',
542
      'CP1252',
543
      'MS-ANSI',
544
      'WINDOWS-1252',
545
      'CP1253',
546
      'MS-GREEK',
547
      'WINDOWS-1253',
548
      'CP1254',
549
      'MS-TURK',
550
      'WINDOWS-1254',
551
      'CP1255',
552
      'MS-HEBR',
553
      'WINDOWS-1255',
554
      'CP1256',
555
      'MS-ARAB',
556
      'WINDOWS-1256',
557
      'CP1257',
558
      'WINBALTRIM',
559
      'WINDOWS-1257',
560
      'CP1258',
561
      'WINDOWS-1258',
562
      '850',
563
      'CP850',
564
      'IBM850',
565
      'CSPC850MULTILINGUAL',
566
      '862',
567
      'CP862',
568
      'IBM862',
569
      'CSPC862LATINHEBREW',
570
      '866',
571
      'CP866',
572
      'IBM866',
573
      'CSIBM866',
574
      'MAC',
575
      'MACINTOSH',
576
      'MACROMAN',
577
      'CSMACINTOSH',
578
      'MACCENTRALEUROPE',
579
      'MACICELAND',
580
      'MACCROATIAN',
581
      'MACROMANIA',
582
      'MACCYRILLIC',
583
      'MACUKRAINE',
584
      'MACGREEK',
585
      'MACTURKISH',
586
      'MACHEBREW',
587
      'MACARABIC',
588
      'MACTHAI',
589
      'HP-ROMAN8',
590
      'R8',
591
      'ROMAN8',
592
      'CSHPROMAN8',
593
      'NEXTSTEP',
594
      'ARMSCII-8',
595
      'GEORGIAN-ACADEMY',
596
      'GEORGIAN-PS',
597
      'KOI8-T',
598
      'CP154',
599
      'CYRILLIC-ASIAN',
600
      'PT154',
601
      'PTCP154',
602
      'CSPTCP154',
603
      'KZ-1048',
604
      'RK1048',
605
      'STRK1048-2002',
606
      'CSKZ1048',
607
      'MULELAO-1',
608
      'CP1133',
609
      'IBM-CP1133',
610
      'ISO-IR-166',
611
      'TIS-620',
612
      'TIS620',
613
      'TIS620-0',
614
      'TIS620.2529-1',
615
      'TIS620.2533-0',
616
      'TIS620.2533-1',
617
      'CP874',
618
      'WINDOWS-874',
619
      'VISCII',
620
      'VISCII1.1-1',
621
      'CSVISCII',
622
      'TCVN',
623
      'TCVN-5712',
624
      'TCVN5712-1',
625
      'TCVN5712-1:1993',
626
      'ISO-IR-14',
627
      'ISO646-JP',
628
      'JIS_C6220-1969-RO',
629
      'JP',
630
      'CSISO14JISC6220RO',
631
      'JISX0201-1976',
632
      'JIS_X0201',
633
      'X0201',
634
      'CSHALFWIDTHKATAKANA',
635
      'ISO-IR-87',
636
      'JIS0208',
637
      'JIS_C6226-1983',
638
      'JIS_X0208',
639
      'JIS_X0208-1983',
640
      'JIS_X0208-1990',
641
      'X0208',
642
      'CSISO87JISX0208',
643
      'ISO-IR-159',
644
      'JIS_X0212',
645
      'JIS_X0212-1990',
646
      'JIS_X0212.1990-0',
647
      'X0212',
648
      'CSISO159JISX02121990',
649
      'CN',
650
      'GB_1988-80',
651
      'ISO-IR-57',
652
      'ISO646-CN',
653
      'CSISO57GB1988',
654
      'CHINESE',
655
      'GB_2312-80',
656
      'ISO-IR-58',
657
      'CSISO58GB231280',
658
      'CN-GB-ISOIR165',
659
      'ISO-IR-165',
660
      'ISO-IR-149',
661
      'KOREAN',
662
      'KSC_5601',
663
      'KS_C_5601-1987',
664
      'KS_C_5601-1989',
665
      'CSKSC56011987',
666
      'EUC-JP',
667
      'EUCJP',
668
      'EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE',
669
      'CSEUCPKDFMTJAPANESE',
670
      'MS_KANJI',
671
      'SHIFT-JIS',
672
      'SHIFT_JIS',
673
      'SJIS',
674
      'CSSHIFTJIS',
675
      'CP932',
676
      'ISO-2022-JP',
677
      'CSISO2022JP',
678
      'ISO-2022-JP-1',
679
      'ISO-2022-JP-2',
680
      'CSISO2022JP2',
681
      'CN-GB',
682
      'EUC-CN',
683
      'EUCCN',
684
      'GB2312',
685
      'CSGB2312',
686
      'GBK',
687
      'CP936',
688
      'MS936',
689
      'WINDOWS-936',
690
      'GB18030',
691
      'ISO-2022-CN',
692
      'CSISO2022CN',
693
      'ISO-2022-CN-EXT',
694
      'HZ',
695
      'HZ-GB-2312',
696
      'EUC-TW',
697
      'EUCTW',
698
      'CSEUCTW',
699
      'BIG-5',
700
      'BIG-FIVE',
701
      'BIG5',
702
      'BIGFIVE',
703
      'CN-BIG5',
704
      'CSBIG5',
705
      'CP950',
706
      'BIG5-HKSCS:1999',
707
      'BIG5-HKSCS:2001',
708
      'BIG5-HKSCS',
709
      'BIG5-HKSCS:2004',
710
      'BIG5HKSCS',
711
      'EUC-KR',
712
      'EUCKR',
713
      'CSEUCKR',
714
      'CP949',
715
      'UHC',
716
      'CP1361',
717
      'JOHAB',
718
      'ISO-2022-KR',
719
      'CSISO2022KR',
720
      'CP856',
721
      'CP922',
722
      'CP943',
723
      'CP1046',
724
      'CP1124',
725
      'CP1129',
726
      'CP1161',
727
      'IBM-1161',
728
      'IBM1161',
729
      'CSIBM1161',
730
      'CP1162',
731
      'IBM-1162',
732
      'IBM1162',
733
      'CSIBM1162',
734
      'CP1163',
735
      'IBM-1163',
736
      'IBM1163',
737
      'CSIBM1163',
738
      'DEC-KANJI',
739
      'DEC-HANYU',
740
      '437',
741
      'CP437',
742
      'IBM437',
743
      'CSPC8CODEPAGE437',
744
      'CP737',
745
      'CP775',
746
      'IBM775',
747
      'CSPC775BALTIC',
748
      '852',
749
      'CP852',
750
      'IBM852',
751
      'CSPCP852',
752
      'CP853',
753
      '855',
754
      'CP855',
755
      'IBM855',
756
      'CSIBM855',
757
      '857',
758
      'CP857',
759
      'IBM857',
760
      'CSIBM857',
761
      'CP858',
762
      '860',
763
      'CP860',
764
      'IBM860',
765
      'CSIBM860',
766
      '861',
767
      'CP-IS',
768
      'CP861',
769
      'IBM861',
770
      'CSIBM861',
771
      '863',
772
      'CP863',
773
      'IBM863',
774
      'CSIBM863',
775
      'CP864',
776
      'IBM864',
777
      'CSIBM864',
778
      '865',
779
      'CP865',
780
      'IBM865',
781
      'CSIBM865',
782
      '869',
783
      'CP-GR',
784
      'CP869',
785
      'IBM869',
786
      'CSIBM869',
787
      'CP1125',
788
      'EUC-JISX0213',
789
      'SHIFT_JISX0213',
790
      'ISO-2022-JP-3',
791
      'BIG5-2003',
792
      'ISO-IR-230',
793
      'TDS565',
794
      'ATARI',
795
      'ATARIST',
796
      'RISCOS-LATIN1',
797
  );
798
799
  /**
800
   * @var array
801
   */
802
  private static $support = array();
803
804
  /**
   * Creates the helper and makes sure the environment detection has been triggered.
   *
   * The work is delegated to UTF8::checkForSupport(), which only fills the static
   * support table the first time it is called.
   */
  public function __construct()
  {
    self::checkForSupport();
  }
811
812
  /**
   * Fetch the character at a given position: a multi-byte aware counterpart of $str[$pos].
   *
   * @param string $str <p>The UTF-8 string to read from.</p>
   * @param int    $pos <p>Position of the wanted character; used as start offset for UTF8::substr().</p>
   *
   * @return string <p>The result of UTF8::substr() for a length of one character.</p>
   */
  public static function access($str, $pos)
  {
    $length = 1;
    $character = self::substr($str, $pos, $length);

    return $character;
  }
824
825
  /**
   * Make sure a string starts with the UTF-8 byte order mark.
   *
   * INFO: A string for which UTF8::string_has_bom() does not report false is handed back untouched.
   *
   * @param string $str <p>The string to inspect.</p>
   *
   * @return string <p>The input, prefixed with UTF8::bom() when no BOM was detected.</p>
   */
  public static function add_bom_to_string($str)
  {
    // guard clause: a BOM was already detected, nothing to prepend
    if (self::string_has_bom($str) !== false) {
      return $str;
    }

    return self::bom() . $str;
  }
842
843
  /**
   * Turn a sequence of binary digits into the bytes it describes.
   *
   * The digits are rewritten as a hexadecimal number with base_convert() and that
   * hex string is then packed into raw bytes via pack('H*', ...).
   *
   * @param mixed $bin <p>The binary digits (1|0).</p>
   *
   * @return string <p>The packed byte string.</p>
   */
  public static function binary_to_str($bin)
  {
    $hexDigits = base_convert($bin, 2, 16);

    return pack('H*', $hexDigits);
  }
854
855
  /**
   * Provides the UTF-8 Byte Order Mark.
   *
   * @return string <p>The three bytes 0xEF 0xBB 0xBF.</p>
   */
  public static function bom()
  {
    $utf8Bom = "\xef\xbb\xbf";

    return $utf8Bom;
  }
864
865
  /**
   * Alias that forwards both arguments, unchanged, to UTF8::chr_map().
   *
   * @alias of UTF8::chr_map()
   * @see   UTF8::chr_map()
   *
   * @param string|array $callback <p>The callback handed on to UTF8::chr_map().</p>
   * @param string       $str      <p>The string handed on to UTF8::chr_map().</p>
   *
   * @return array <p>Whatever UTF8::chr_map() returns.</p>
   */
  public static function callback($callback, $str)
  {
    $mapped = self::chr_map($callback, $str);

    return $mapped;
  }
878
879
  /**
   * Detects once which UTF-8 related features the server offers and caches the result.
   *
   * The outcome of every probe is stored in the static support table under the keys
   * "mbstring", "iconv", "intl", "intlChar" and "pcre_utf8".
   *
   * INFO: You don't need to run it manually, it will be triggered if it's needed.
   */
  public static function checkForSupport()
  {
    // already probed: keep the cached answers
    if (isset(self::$support['already_checked_via_portable_utf8'])) {
      return;
    }

    // the marker is written before the probes run, exactly as the probes expect to find it
    self::$support['already_checked_via_portable_utf8'] = true;

    self::$support['mbstring'] = self::mbstring_loaded();
    self::$support['iconv'] = self::iconv_loaded();
    self::$support['intl'] = self::intl_loaded();
    self::$support['intlChar'] = self::intlChar_loaded();
    self::$support['pcre_utf8'] = self::pcre_utf8_support();
  }
897
898
  /**
   * Generates a UTF-8 encoded character from the given code point.
   *
   * INFO: opposite to UTF8::ord()
   *
   * A value that is not already an integer is first passed through UTF8::hex_to_int().
   * This normalisation happens before a back-end is chosen, so a non-integer code point
   * that can be converted is treated the same way with and without "IntlChar".
   *
   * @param int $code_point <p>The code point for which to generate a character.</p>
   *
   * @return string|null <p>Multi-Byte character, returns null on failure to encode.</p>
   */
  public static function chr($code_point)
  {
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // normalise the input first, so that both back-ends below work on the same value
    $i = (int)$code_point;
    if ($i !== $code_point) {
      // not a plain integer: let hex_to_int() try to interpret the value
      $i = self::hex_to_int($code_point);
    }

    // hex_to_int() may signal a failure with a non-integer (e.g. false), so test the type
    // explicitly instead of relying on a loose check that cannot tell "false" from "0"
    $isCodePoint = is_int($i);

    if (self::$support['intlChar'] === true) {
      // input that could not be converted is still handed over unchanged, as it was before
      return \IntlChar::chr($isCodePoint ? $i : $code_point);
    }

    // without "IntlChar": a failed conversion and the code point 0 both yield null
    // (0 is kept as "null" here because this path has always answered it that way)
    if ($isCodePoint === false || $i === 0) {
      return null;
    }

    return self::html_entity_decode("&#{$i};", ENT_QUOTES);
  }
930
931
  /**
   * Runs a callback over every character of a string.
   *
   * The string is broken into characters with UTF8::split() and the callback
   * is applied to each of them via array_map().
   *
   * @param string|array $callback <p>The callback function.</p>
   * @param string       $str      <p>UTF-8 string to run callback on.</p>
   *
   * @return array <p>The callback results, one entry per character.</p>
   */
  public static function chr_map($callback, $str)
  {
    return array_map($callback, self::split($str));
  }
945
946
  /**
   * Generates an array of byte length of each character of a Unicode string.
   *
   * 1 byte => U+0000  - U+007F
   * 2 byte => U+0080  - U+07FF
   * 3 byte => U+0800  - U+FFFF
   * 4 byte => U+10000 - U+10FFFF
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return array <p>An array of byte lengths of each character; an empty array for an empty string.</p>
   */
  public static function chr_size_list($str)
  {
    $str = (string)$str;

    // Test for "no bytes at all" rather than falsiness, so that the string "0"
    // is still treated as a one-character string.
    if (!isset($str[0])) {
      return array();
    }

    return array_map('strlen', self::split($str));
  }
966
967
  /**
   * Get a decimal code representation of a specific character.
   *
   * The lead byte decides how many bytes are consumed (1 to 4); the payload bits
   * of the continuation bytes are then appended 6 bits at a time. Only the first
   * character of the input is looked at.
   *
   * @param string $char <p>The input character.</p>
   *
   * @return int <p>The decoded code, or the value UTF8::ord() yields for the lead byte when it does not
   *             start a 2-4 byte sequence.</p>
   */
  public static function chr_to_decimal($char)
  {
    $char = (string)$char;

    // Guard the offset read: an empty input is passed on as '' instead of
    // raising an "uninitialized string offset" notice.
    $code = self::ord(isset($char[0]) ? $char[0] : '');

    if (!($code & 0x80)) {
      // 0xxxxxxx
      return $code;
    }

    $bytes = 1;

    if (($code & 0xe0) === 0xc0) {
      // 110xxxxx
      $bytes = 2;
      $code &= ~0xc0;
    } elseif (($code & 0xf0) === 0xe0) {
      // 1110xxxx
      $bytes = 3;
      $code &= ~0xe0;
    } elseif (($code & 0xf8) === 0xf0) {
      // 11110xxx
      $bytes = 4;
      $code &= ~0xf0;
    }

    for ($i = 2; $i <= $bytes; $i++) {
      // 10xxxxxx -- a truncated sequence contributes '' for the missing byte
      // (same value as before, but without the offset notice).
      $byte = isset($char[$i - 1]) ? $char[$i - 1] : '';
      $code = ($code << 6) + (self::ord($byte) & ~0x80);
    }

    return $code;
  }
1006
1007
  /**
   * Get hexadecimal code point (U+xxxx) of a UTF-8 encoded character.
   *
   * @param string $char <p>The input character.</p>
   * @param string $pfix [optional] <p>Prefix handed to UTF8::int_to_hex().</p>
   *
   * @return string <p>The code point encoded as U+xxxx</p>
   */
  public static function chr_to_hex($char, $pfix = 'U+')
  {
    $codePoint = self::ord($char);

    return self::int_to_hex($codePoint, $pfix);
  }
1019
1020
  /**
   * Splits a string into smaller chunks and joins them with the specified line ending.
   *
   * The chunks come from UTF8::split() and are glued together with implode(),
   * so the separator is placed between chunks.
   *
   * @param string $body     <p>The original string to be split.</p>
   * @param int    $chunklen [optional] <p>The maximum character length of a chunk.</p>
   * @param string $end      [optional] <p>The character(s) to be inserted between the chunks.</p>
   *
   * @return string <p>The chunked string</p>
   */
  public static function chunk_split($body, $chunklen = 76, $end = "\r\n")
  {
    $chunks = self::split($body, $chunklen);

    return implode($end, $chunks);
  }
1033
1034
  /**
   * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
   *
   * @param string $str                     <p>The string to be sanitized.</p>
   * @param bool   $remove_bom              [optional] <p>Set to true, if you need to remove UTF-BOM.</p>
   * @param bool   $normalize_whitespace    [optional] <p>Set to true, if you need to normalize the whitespace.</p>
   * @param bool   $normalize_msword        [optional] <p>Set to true, if you need to normalize MS Word chars e.g.: "…"
   *                                        => "..."</p>
   * @param bool   $keep_non_breaking_space [optional] <p>Set to true, to keep non-breaking-spaces, in combination with
   *                                        $normalize_whitespace</p>
   *
   * @return string <p>Clean UTF-8 encoded string.</p>
   */
  public static function clean($str, $remove_bom = false, $normalize_whitespace = false, $normalize_msword = false, $keep_non_breaking_space = false)
  {
    // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
    // caused connection reset problem on larger strings

    // Group 1 captures runs of well-formed sequences; groups 2 and 3 match a single
    // stray byte. Replacing with '$1' keeps the former and drops the latter.
    $pattern = '/
      (
        (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
        |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
        |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
        |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
        ){1,100}                      # ...one or more times
      )
    | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
    | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
    /x';
    $cleaned = preg_replace($pattern, '$1', $str);

    // Always applied: strip the replacement character and invisible characters.
    $cleaned = self::replace_diamond_question_mark($cleaned, '');
    $cleaned = self::remove_invisible_characters($cleaned);

    // Optional steps, each enabled only by a strict boolean true.
    if ($normalize_whitespace === true) {
      $cleaned = self::normalize_whitespace($cleaned, $keep_non_breaking_space);
    }

    if ($normalize_msword === true) {
      $cleaned = self::normalize_msword($cleaned);
    }

    if ($remove_bom === true) {
      $cleaned = self::removeBOM($cleaned);
    }

    return $cleaned;
  }
1082
1083
  /**
   * Cleans up a string so that only printable UTF-8 chars remain, and fixes simple UTF-8 encoding errors.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>The cleaned string; an empty string for empty input.</p>
   */
  public static function cleanup($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // Step 1: repair simple ISO <-> UTF-8 mix-ups.
    $fixed = self::fix_simple_utf8($str);

    // Step 2: clean() with these flags
    //   - removes non UTF-8 bytes, the diamond question mark and invisible characters (e.g. "\0")
    //   - removes the BOM
    //   - normalizes whitespace but keeps non-breaking spaces
    //   - leaves MS Word characters untouched
    $cleaned = self::clean($fixed, true, true, false, true);

    return (string)$cleaned;
  }
1110
1111
  /**
   * Accepts a string or an array of strings and returns an array of Unicode code points.
   *
   * INFO: opposite to UTF8::string()
   *
   * @param string|string[] $arg        <p>A UTF-8 encoded string or an array of such strings.</p>
   * @param bool            $u_style    <p>If True, will return code points in U+xxxx format,
   *                                    default, code points will be returned as integers.</p>
   *
   * @return array <p>The array of code points.</p>
   */
  public static function codepoints($arg, $u_style = false)
  {
    // A string is first broken into characters; anything else is mapped as given.
    $chars = is_string($arg) ? self::split($arg) : $arg;

    $codePoints = array_map(array('\\voku\\helper\\UTF8', 'ord'), $chars);

    if (!$u_style) {
      return $codePoints;
    }

    // Convert every integer to its hexadecimal notation.
    return array_map(array('\\voku\\helper\\UTF8', 'int_to_hex'), $codePoints);
  }
1148
1149
  /**
   * Counts how often each character occurs in a string.
   *
   * @param string $str       <p>The input string.</p>
   * @param bool   $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return array <p>An associative array of Character as keys and
   *               their count as values.</p>
   */
  public static function count_chars($str, $cleanUtf8 = false)
  {
    // One array entry per character, then tally identical entries.
    $chars = self::split($str, 1, $cleanUtf8);

    return array_count_values($chars);
  }
1162
1163
  /**
   * Get a UTF-8 character from its decimal code representation.
   *
   * The code is written as a hexadecimal numeric character reference and
   * converted from "HTML-ENTITIES" to UTF-8 by mbstring.
   *
   * @param int $code
   *
   * @return string
   */
  public static function decimal_to_chr($code)
  {
    $entity = '&#x' . dechex($code) . ';';

    return \mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
1178
1179
  /**
   * Encode a string with a new charset-encoding.
   *
   * INFO:  The difference to "UTF8::utf8_encode()" is that this function also tries to fix broken / double encoding,
   *        so you can call this function also on a UTF-8 String and you don't mess the string.
   *
   * @param string $encoding <p>e.g. 'UTF-8', 'ISO-8859-1', etc.</p>
   * @param string $str      <p>The input string</p>
   * @param bool   $force    [optional] <p>Force the new encoding (we try to fix broken / double encoding for UTF-8)<br
   *                         /> otherwise we auto-detect the current string-encoding</p>
   *
   * @return string <p>The converted string, or the input string when nothing could or needed to be converted.</p>
   */
  public static function encode($encoding, $str, $force = true)
  {
    $str = (string)$str;
    $encoding = (string)$encoding;

    if (!isset($str[0], $encoding[0])) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $encodingDetected = self::str_detect_encoding($str);

    // Detection failed (explicit check instead of a loose truthiness test):
    // there is no source encoding to convert from.
    if ($encodingDetected === false || $encodingDetected === null || $encodingDetected === '') {
      return $str;
    }

    // Without $force, a string that is already in the target encoding is left alone.
    if ($force !== true && $encodingDetected === $encoding) {
      return $str;
    }

    if (
        $encoding === 'UTF-8'
        &&
        (
            $force === true
            || $encodingDetected === 'UTF-8'
            || $encodingDetected === 'WINDOWS-1252'
            || $encodingDetected === 'ISO-8859-1'
        )
    ) {
      return self::to_utf8($str);
    }

    if (
        $encoding === 'ISO-8859-1'
        &&
        (
            $force === true
            || $encodingDetected === 'ISO-8859-1'
            || $encodingDetected === 'UTF-8'
        )
    ) {
      return self::to_iso8859($str);
    }

    // Generic conversion for every other combination.
    $strEncoded = \mb_convert_encoding(
        $str,
        $encoding,
        $encodingDetected
    );

    // Accept any non-empty string result. A loose truthiness test would reject
    // the valid result "0" and hand back the unconverted input instead.
    if (is_string($strEncoded) && $strEncoded !== '') {
      return $strEncoded;
    }

    return $str;
  }
1255
1256
  /**
   * Reads entire file into a string.
   *
   * WARNING: do not use UTF-8 Option ($convertToUtf8) for binary-files (e.g.: images) !!!
   *
   * @link http://php.net/manual/en/function.file-get-contents.php
   *
   * @param string        $filename      <p>Name of the file to read.</p>
   * @param int|null      $flags         [optional] <p>
   *                                     Forwarded as the second argument of the native file_get_contents()
   *                                     after a boolean cast, so FILE_USE_INCLUDE_PATH (search for the file
   *                                     in the include path) can be used; null disables it.
   *                                     </p>
   * @param resource|null $context       [optional] <p>
   *                                     A valid context resource created with stream_context_create().
   *                                     When null and $timeout is non-zero, an "http" context with that
   *                                     timeout is created.
   *                                     </p>
   * @param int|null      $offset        [optional] <p>
   *                                     The offset where the reading starts; null means 0.
   *                                     </p>
   * @param int|null      $maxlen        [optional] <p>
   *                                     Maximum length of data read. The default is to read until end
   *                                     of file is reached. Only an integer value is forwarded.
   *                                     </p>
   * @param int           $timeout       <p>The time in seconds for the timeout.</p>
   *
   * @param boolean       $convertToUtf8 <strong>WARNING!!!</strong> <p>Maybe you can't use this option for e.g. images
   *                                     or pdf, because they used non default utf-8 chars</p>
   *
   * @return string|false <p>The function returns the read data or false on failure.</p>
   */
  public static function file_get_contents($filename, $flags = null, $context = null, $offset = null, $maxlen = null, $timeout = 10, $convertToUtf8 = true)
  {
    // init
    $timeout = (int)$timeout;
    // NOTE(review): FILTER_SANITIZE_STRING is deprecated as of PHP 8.1 and rewrites some
    // characters in the name -- kept unchanged here, confirm before replacing it.
    $filename = filter_var($filename, FILTER_SANITIZE_STRING);

    if ($timeout && $context === null) {
      $context = stream_context_create(
          array(
              'http' =>
                  array(
                      'timeout' => $timeout,
                  ),
          )
      );
    }

    // Normalize the optional arguments ourselves: the native function declares them as
    // bool / int, and passing null to such parameters is deprecated since PHP 8.1.
    // The casts yield the same values the engine's implicit conversion produced before.
    $useIncludePath = (bool)$flags;
    $offset = (int)$offset;

    if (is_int($maxlen)) {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset, $maxlen);
    } else {
      $data = file_get_contents($filename, $useIncludePath, $context, $offset);
    }

    // return false on error
    if ($data === false) {
      return false;
    }

    if ($convertToUtf8 === true) {
      // Convert to UTF-8 (without forcing) and strip everything that is not clean UTF-8.
      $data = self::encode('UTF-8', $data, false);
      $data = self::cleanup($data);
    }

    return $data;
  }
1373
1374
  /**
   * Checks if a file starts with BOM (Byte Order Mark) character.
   *
   * @param string $file_path <p>Path to a valid file.</p>
   *
   * @return bool <p><strong>true</strong> if the file has BOM at the start, <strong>false</strong> otherwise
   *              (including when the file could not be read).</p>
   */
  public static function file_has_bom($file_path)
  {
    $content = file_get_contents($file_path);

    // A failed read yields false; do not hand that to the string check.
    if ($content === false) {
      return false;
    }

    return self::string_has_bom($content);
  }
1385
1386
  /**
   * Normalizes strings (recursively inside arrays and objects) to the given Unicode normalization form.
   *
   * Strings get their "\r\n" / "\r" line endings replaced by "\n"; strings containing bytes >= 0x80 are
   * normalized with \Normalizer, falling back to UTF8::encode('UTF-8', ...) when normalization yields
   * nothing. Values of any other type are returned unchanged.
   *
   * @param mixed  $var                <p>The value to filter.</p>
   * @param int    $normalization_form <p>Form passed to \Normalizer (default 4).</p>
   * @param string $leading_combining  <p>Prefix added when the result starts with a combining mark (\p{Mn}).</p>
   *
   * @return mixed
   */
  public static function filter($var, $normalization_form = 4 /* n::NFC */, $leading_combining = '◌')
  {
    $type = gettype($var);

    if ($type === 'array') {
      foreach ($var as $key => $value) {
        /** @noinspection AlterInForeachInspection */
        $var[$key] = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type === 'object') {
      foreach ($var as $key => $value) {
        $var->{$key} = self::filter($value, $normalization_form, $leading_combining);
      }

      return $var;
    }

    if ($type !== 'string') {
      return $var;
    }

    if (false !== strpos($var, "\r")) {
      // Workaround https://bugs.php.net/65732
      $var = str_replace(array("\r\n", "\r"), "\n", $var);
    }

    // Pure 7-bit strings need no normalization.
    if (!preg_match('/[\x80-\xFF]/', $var)) {
      return $var;
    }

    if (\Normalizer::isNormalized($var, $normalization_form)) {
      // Non-empty placeholder: marks "normalization state is known" for the check below.
      $normalized = '-';
    } else {
      $normalized = \Normalizer::normalize($var, $normalization_form);

      if (isset($normalized[0])) {
        $var = $normalized;
      } else {
        $var = self::encode('UTF-8', $var);
      }
    }

    // Prevent leading combining chars for NFC-safe concatenations.
    if ($var[0] >= "\x80" && isset($normalized[0], $leading_combining[0]) && preg_match('/^\p{Mn}/u', $var)) {
      $var = $leading_combining . $var;
    }

    return $var;
  }
1438
1439
  /**
   * "filter_input()"-wrapper whose result is passed through UTF8::filter().
   *
   * $option is only forwarded to the native function when it was explicitly supplied by the caller.
   *
   * @param int    $type
   * @param string $var
   * @param int    $filter
   * @param mixed  $option
   *
   * @return mixed
   */
  public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
  {
    $value = (func_num_args() < 4)
        ? filter_input($type, $var, $filter)
        : filter_input($type, $var, $filter, $option);

    return self::filter($value);
  }
1459
1460
  /**
   * "filter_input_array()"-wrapper whose result is passed through UTF8::filter().
   *
   * $definition and $add_empty are only forwarded when at least two arguments were supplied by the caller.
   *
   * @param int   $type
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_input_array($type, $definition = null, $add_empty = true)
  {
    $values = (func_num_args() < 2)
        ? filter_input_array($type)
        : filter_input_array($type, $definition, $add_empty);

    return self::filter($values);
  }
1479
1480
  /**
   * "filter_var()"-wrapper whose result is passed through UTF8::filter().
   *
   * $option is only forwarded to the native function when it was explicitly supplied by the caller.
   *
   * @param mixed $var
   * @param int   $filter
   * @param mixed $option
   *
   * @return mixed
   */
  public static function filter_var($var, $filter = FILTER_DEFAULT, $option = null)
  {
    $value = (func_num_args() < 3)
        ? filter_var($var, $filter)
        : filter_var($var, $filter, $option);

    return self::filter($value);
  }
1499
1500
  /**
   * "filter_var_array()"-wrapper whose result is passed through UTF8::filter().
   *
   * $definition and $add_empty are only forwarded when at least two arguments were supplied by the caller.
   *
   * @param array $data
   * @param mixed $definition
   * @param bool  $add_empty
   *
   * @return mixed
   */
  public static function filter_var_array($data, $definition = null, $add_empty = true)
  {
    $values = (func_num_args() < 2)
        ? filter_var_array($data)
        : filter_var_array($data, $definition, $add_empty);

    return self::filter($values);
  }
1519
1520
  /**
   * Check that the number of unicode characters does not exceed the specified integer.
   *
   * @param string $str      The original string to be checked.
   * @param int    $box_size The size in number of chars to be checked against string.
   *
   * @return bool true if the length reported by UTF8::strlen() is less than or equal to $box_size, false otherwise.
   */
  public static function fits_inside($str, $box_size)
  {
    $length = self::strlen($str);

    return $length <= $box_size;
  }
1532
1533
  /**
   * Try to fix simple broken UTF-8 strings.
   *
   * INFO: Take a look at "UTF8::fix_utf8()" if you need a more advanced fix for broken UTF-8 strings.
   *
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO-8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The string with every key of the broken-UTF-8 map replaced by its value.</p>
   */
  public static function fix_simple_utf8($str)
  {
    // Search / replace lists are derived from the map once and kept between calls.
    static $search = null;
    static $replace = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    if ($search === null) {
      $search = array_keys(self::$brokenUtf8ToUtf8);
      $replace = array_values(self::$brokenUtf8ToUtf8);
    }

    return str_replace($search, $replace, $str);
  }
1564
1565
  /**
   * Fix a double (or multiple) encoded UTF8 string.
   *
   * Arrays are processed element by element (recursively, keys preserved). A single value is
   * decoded and re-encoded repeatedly until a pass no longer changes it.
   *
   * @param string|string[] $str <p>You can use a string or an array of strings.</p>
   *
   * @return mixed
   */
  public static function fix_utf8($str)
  {
    if (is_array($str)) {
      $fixed = array();

      foreach ($str as $key => $value) {
        $fixed[$key] = self::fix_utf8($value);
      }

      return $fixed;
    }

    // Repeat the decode / re-encode round trip until the value is stable.
    for ($previous = ''; $previous !== $str;) {
      $previous = $str;
      $str = self::to_utf8(self::utf8_decode($str));
    }

    return $str;
  }
1593
1594
  /**
   * Get the text direction of a single character.
   *
   * @param string $char <p>The character to inspect.</p>
   *
   * @return string <p>'RTL' or 'LTR'</p>
   */
  public static function getCharDirection($char)
  {
    // Code points treated as right-to-left, grouped by the branch that checks them.
    // 'single' entries are compared strictly, 'range' entries are inclusive [from, to] pairs.
    static $rtlTable = array(
        'low'  => array(
            'single' => array(
                0x5be, 0x5c0, 0x5c3, 0x5c6,
                0x608, 0x60b, 0x60d, 0x61b,
                0x710, 0x7b1, 0x7fa,
                0x81a, 0x824, 0x828, 0x85e,
            ),
            'range'  => array(
                array(0x5d0, 0x5ea),
                array(0x5f0, 0x5f4),
                array(0x61e, 0x64a),
                array(0x66d, 0x66f),
                array(0x671, 0x6d5),
                array(0x6e5, 0x6e6),
                array(0x6ee, 0x6ef),
                array(0x6fa, 0x70d),
                array(0x712, 0x72f),
                array(0x74d, 0x7a5),
                array(0x7c0, 0x7ea),
                array(0x7f4, 0x7f5),
                array(0x800, 0x815),
                array(0x830, 0x83e),
                array(0x840, 0x858),
            ),
        ),
        'high' => array(
            'single' => array(
                0xfb1d, 0xfb3e,
                0x10808, 0x1083c, 0x1093f, 0x10a00,
            ),
            'range'  => array(
                array(0xfb1f, 0xfb28),
                array(0xfb2a, 0xfb36),
                array(0xfb38, 0xfb3c),
                array(0xfb40, 0xfb41),
                array(0xfb43, 0xfb44),
                array(0xfb46, 0xfbc1),
                array(0xfbd3, 0xfd3d),
                array(0xfd50, 0xfd8f),
                array(0xfd92, 0xfdc7),
                array(0xfdf0, 0xfdfc),
                array(0xfe70, 0xfe74),
                array(0xfe76, 0xfefc),
                array(0x10800, 0x10805),
                array(0x1080a, 0x10835),
                array(0x10837, 0x10838),
                array(0x1083f, 0x10855),
                array(0x10857, 0x1085f),
                array(0x10900, 0x1091b),
                array(0x10920, 0x10939),
                array(0x10a10, 0x10a13),
                array(0x10a15, 0x10a17),
                array(0x10a19, 0x10a33),
                array(0x10a40, 0x10a47),
                array(0x10a50, 0x10a58),
                array(0x10a60, 0x10a7f),
                array(0x10b00, 0x10b35),
                array(0x10b40, 0x10b55),
                array(0x10b58, 0x10b72),
                array(0x10b78, 0x10b7f),
            ),
        ),
    );

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intlChar'] === true) {
      $direction = \IntlChar::charDirection($char);

      // direction values from the "IntlChar"-Class; anything else falls
      // through to the table based check below
      if (in_array($direction, array(0, 11, 12, 20), true)) {
        return 'LTR';
      }

      if (in_array($direction, array(1, 13, 14, 15, 21), true)) {
        return 'RTL';
      }
    }

    $c = self::chr_to_decimal($char);

    // everything outside of 0x5be..0x10b7f is left-to-right
    if (!(0x5be <= $c && 0x10b7f >= $c)) {
      return 'LTR';
    }

    if (0x85e >= $c) {
      $group = 'low';
    } elseif (0x200f === $c) {
      return 'RTL';
    } elseif (0xfb1d <= $c) {
      $group = 'high';
    } else {
      return 'LTR';
    }

    if (in_array($c, $rtlTable[$group]['single'], true)) {
      return 'RTL';
    }

    foreach ($rtlTable[$group]['range'] as $range) {
      if ($range[0] <= $c && $range[1] >= $c) {
        return 'RTL';
      }
    }

    return 'LTR';
  }
1713
1714
  /**
   * Load a data file from the "data" directory next to this class ("/data/*.php").
   *
   * @param string $file <p>The file name without the ".php" extension.</p>
   *
   * @return bool|string|array|int <p>The value returned by the required file, or false if the file does not exist.</p>
   */
  private static function getData($file)
  {
    $path = __DIR__ . '/data/' . $file . '.php';

    if (!file_exists($path)) {
      return false;
    }

    /** @noinspection PhpIncludeInspection */
    return require $path;
  }
1731
1732
  /**
   * Converts hexadecimal U+xxxx code point representation to integer.
   *
   * INFO: opposite to UTF8::int_to_hex()
   *
   * Accepted forms are 4 to 6 hexadecimal digits, optionally prefixed with
   * "U+" or "\u" (case-insensitive), e.g. "U+00f1", "\u00f1" or "00F1".
   *
   * @param string $str <p>The hexadecimal code point representation.</p>
   *
   * @return int|false <p>The code point, or false on failure.</p>
   */
  public static function hex_to_int($str)
  {
    // Only real hex digits are allowed: with "[a-z0-9]" an input such as
    // "wxyz" matched and was silently converted to 0 instead of failing.
    if (preg_match('/^(?:\\\u|U\+|)([a-f0-9]{4,6})$/i', $str, $match)) {
      return intval($match[1], 16);
    }

    return false;
  }
1749
1750
  /**
   * Alias for "UTF8::html_entity_decode()": all arguments are passed through unchanged.
   *
   * @see UTF8::html_entity_decode()
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $flags    [optional] <p>Forwarded as-is.</p>
   * @param string $encoding [optional] <p>Forwarded as-is, default is UTF-8.</p>
   *
   * @return string
   */
  public static function html_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    return self::html_entity_decode($str, $flags, $encoding);
  }
1765
1766
  /**
   * Converts a UTF-8 string to a series of HTML numbered entities.
   *
   * INFO: opposite to UTF8::html_decode()
   *
   * @param string $str            <p>The Unicode string to be encoded as numbered entities.</p>
   * @param bool   $keepAsciiChars [optional] <p>Keep ASCII chars.</p>
   * @param string $encoding       [optional] <p>Default is UTF-8</p>
   *
   * @return string <p>HTML numbered entities.</p>
   */
  public static function html_encode($str, $keepAsciiChars = false, $encoding = 'UTF-8')
  {
    # INFO: http://stackoverflow.com/questions/35854535/better-explanation-of-convmap-in-mb-encode-numericentity
    if (function_exists('mb_encode_numericentity')) {

      // with $keepAsciiChars only code points from 0x80 upwards are converted
      $startCode = ($keepAsciiChars === true) ? 0x80 : 0x00;

      if ($encoding !== 'UTF-8') {
        $encoding = self::normalize_encoding($encoding);
      }

      // convmap = (first code point, last code point, offset, mask).
      // The range has to reach up to U+10FFFF: with 0xffff as upper bound,
      // characters outside of the BMP (e.g. emoji) were left unencoded.
      return mb_encode_numericentity(
          $str,
          array($startCode, 0x10ffff, 0, 0x1fffff),
          $encoding
      );
    }

    // fallback without mbstring: encode char by char
    return implode(
        array_map(
            function ($data) use ($keepAsciiChars) {
              return UTF8::single_chr_html_encode($data, $keepAsciiChars);
            },
            self::split($str)
        )
    );
  }
1807
1808
  /**
1809
   * UTF-8 version of html_entity_decode()
1810
   *
1811
   * The reason we are not using html_entity_decode() by itself is because
1812
   * while it is not technically correct to leave out the semicolon
1813
   * at the end of an entity most browsers will still interpret the entity
1814
   * correctly. html_entity_decode() does not convert entities without
1815
   * semicolons, so we are left with our own little solution here. Bummer.
1816
   *
1817
   * Convert all HTML entities to their applicable characters
1818
   *
1819
   * INFO: opposite to UTF8::html_encode()
1820
   *
1821
   * @link http://php.net/manual/en/function.html-entity-decode.php
1822
   *
1823
   * @param string $str      <p>
1824
   *                         The input string.
1825
   *                         </p>
1826
   * @param int    $flags    [optional] <p>
1827
   *                         A bitmask of one or more of the following flags, which specify how to handle quotes and
1828
   *                         which document type to use. The default is ENT_COMPAT | ENT_HTML401.
1829
   *                         <table>
1830
   *                         Available <i>flags</i> constants
1831
   *                         <tr valign="top">
1832
   *                         <td>Constant Name</td>
1833
   *                         <td>Description</td>
1834
   *                         </tr>
1835
   *                         <tr valign="top">
1836
   *                         <td><b>ENT_COMPAT</b></td>
1837
   *                         <td>Will convert double-quotes and leave single-quotes alone.</td>
1838
   *                         </tr>
1839
   *                         <tr valign="top">
1840
   *                         <td><b>ENT_QUOTES</b></td>
1841
   *                         <td>Will convert both double and single quotes.</td>
1842
   *                         </tr>
1843
   *                         <tr valign="top">
1844
   *                         <td><b>ENT_NOQUOTES</b></td>
1845
   *                         <td>Will leave both double and single quotes unconverted.</td>
1846
   *                         </tr>
1847
   *                         <tr valign="top">
1848
   *                         <td><b>ENT_HTML401</b></td>
1849
   *                         <td>
1850
   *                         Handle code as HTML 4.01.
1851
   *                         </td>
1852
   *                         </tr>
1853
   *                         <tr valign="top">
1854
   *                         <td><b>ENT_XML1</b></td>
1855
   *                         <td>
1856
   *                         Handle code as XML 1.
1857
   *                         </td>
1858
   *                         </tr>
1859
   *                         <tr valign="top">
1860
   *                         <td><b>ENT_XHTML</b></td>
1861
   *                         <td>
1862
   *                         Handle code as XHTML.
1863
   *                         </td>
1864
   *                         </tr>
1865
   *                         <tr valign="top">
1866
   *                         <td><b>ENT_HTML5</b></td>
1867
   *                         <td>
1868
   *                         Handle code as HTML 5.
1869
   *                         </td>
1870
   *                         </tr>
1871
   *                         </table>
1872
   *                         </p>
1873
   * @param string $encoding [optional] <p>Encoding to use.</p>
1874
   *
1875
   * @return string <p>The decoded string.</p>
1876
   */
1877
  public static function html_entity_decode($str, $flags = null, $encoding = 'UTF-8')
  {
    $str = (string)$str;

    // nothing to do for an empty string ...
    if ($str === '') {
      return '';
    }

    // ... or for a string without any entity marker
    if (strpos($str, '&') === false) {
      return $str;
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    if ($flags === null) {
      // ENT_HTML5 is only used from PHP 5.4 on
      $flags = (Bootup::is_php('5.4') === true) ? (ENT_COMPAT | ENT_HTML5) : ENT_COMPAT;
    }

    // Converts a single decimal entity, but hands quote entities back unchanged
    // (presumably so that the $flags-aware html_entity_decode() call decides about them).
    $decodeDecimalEntity = function ($match) {
      $char = \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');

      if ($char === '"' || $char === "'") {
        return $match[0];
      }

      return $char;
    };

    // repeat until a pass does not change the string anymore (nested entities)
    do {
      $before = $str;

      $str = preg_replace_callback("/&#\d{2,5};/", $decodeDecimalEntity, $str);

      // decode numeric & UTF16 two byte entities:
      // first append the missing semicolon, then let PHP decode the entities
      $withSemicolons = preg_replace(
          '/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS',
          '$1;',
          $str
      );

      $str = html_entity_decode($withSemicolons, $flags, $encoding);

    } while ($before !== $str);

    return $str;
  }
1929
1930 9
  /**
1931
   * Convert all applicable characters to HTML entities: UTF-8 version of htmlentities()
1932
   *
1933
   * @link http://php.net/manual/en/function.htmlentities.php
1934
   *
1935
   * @param string $str           <p>
1936
   *                              The input string.
1937
   *                              </p>
1938
   * @param int    $flags         [optional] <p>
1939
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
1940
   *                              invalid code unit sequences and the used document type. The default is
1941 1
   *                              ENT_COMPAT | ENT_HTML401.
1942
   *                              <table>
1943 1
   *                              Available <i>flags</i> constants
1944
   *                              <tr valign="top">
1945 1
   *                              <td>Constant Name</td>
1946
   *                              <td>Description</td>
1947
   *                              </tr>
1948
   *                              <tr valign="top">
1949
   *                              <td><b>ENT_COMPAT</b></td>
1950
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
1951
   *                              </tr>
1952
   *                              <tr valign="top">
1953
   *                              <td><b>ENT_QUOTES</b></td>
1954
   *                              <td>Will convert both double and single quotes.</td>
1955
   *                              </tr>
1956
   *                              <tr valign="top">
1957
   *                              <td><b>ENT_NOQUOTES</b></td>
1958
   *                              <td>Will leave both double and single quotes unconverted.</td>
1959
   *                              </tr>
1960 4
   *                              <tr valign="top">
1961
   *                              <td><b>ENT_IGNORE</b></td>
1962 4
   *                              <td>
1963 3
   *                              Silently discard invalid code unit sequences instead of returning
1964
   *                              an empty string. Using this flag is discouraged as it
1965
   *                              may have security implications.
1966 4
   *                              </td>
1967
   *                              </tr>
1968
   *                              <tr valign="top">
1969
   *                              <td><b>ENT_SUBSTITUTE</b></td>
1970
   *                              <td>
1971
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
1972
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
1973
   *                              </td>
1974
   *                              </tr>
1975
   *                              <tr valign="top">
1976 2
   *                              <td><b>ENT_DISALLOWED</b></td>
1977
   *                              <td>
1978 2
   *                              Replace invalid code points for the given document type with a
1979 2
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
1980 2
   *                              (otherwise) instead of leaving them as is. This may be useful, for
1981
   *                              instance, to ensure the well-formedness of XML documents with
1982 2
   *                              embedded external content.
1983
   *                              </td>
1984 2
   *                              </tr>
1985
   *                              <tr valign="top">
1986
   *                              <td><b>ENT_HTML401</b></td>
1987 2
   *                              <td>
1988
   *                              Handle code as HTML 4.01.
1989 2
   *                              </td>
1990 2
   *                              </tr>
1991 2
   *                              <tr valign="top">
1992
   *                              <td><b>ENT_XML1</b></td>
1993 1
   *                              <td>
1994 1
   *                              Handle code as XML 1.
1995 1
   *                              </td>
1996
   *                              </tr>
1997
   *                              <tr valign="top">
1998
   *                              <td><b>ENT_XHTML</b></td>
1999
   *                              <td>
2000
   *                              Handle code as XHTML.
2001 2
   *                              </td>
2002
   *                              </tr>
2003 2
   *                              <tr valign="top">
2004 2
   *                              <td><b>ENT_HTML5</b></td>
2005
   *                              <td>
2006 2
   *                              Handle code as HTML 5.
2007
   *                              </td>
2008
   *                              </tr>
2009
   *                              </table>
2010
   *                              </p>
2011
   * @param string $encoding      [optional] <p>
2012
   *                              Like <b>htmlspecialchars</b>,
2013
   *                              <b>htmlentities</b> takes an optional third argument
2014
   *                              <i>encoding</i> which defines encoding used in
2015
   *                              conversion.
2016
   *                              Although this argument is technically optional, you are highly
2017
   *                              encouraged to specify the correct value for your code.
2018
   *                              </p>
2019
   * @param bool   $double_encode [optional] <p>
2020
   *                              When <i>double_encode</i> is turned off PHP will not
2021
   *                              encode existing html entities. The default is to convert everything.
2022
   *                              </p>
2023
   *
2024
   *
2025
   * @return string the encoded string.
2026
   * </p>
2027
   * <p>
2028
   * If the input <i>string</i> contains an invalid code unit
2029
   * sequence within the given <i>encoding</i> an empty string
2030
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2031 1
   * <b>ENT_SUBSTITUTE</b> flags are set.
2032
   */
2033 1
  public static function htmlentities($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // first pass: PHP's native htmlentities()
    $str = htmlentities($str, $flags, $encoding, $double_encode);

    // the extra step below is only done for UTF-8
    if ($encoding !== 'UTF-8') {
      return $str;
    }

    // second pass: every remaining character of 3 or more bytes is replaced
    // by its numbered entity (each distinct character is encoded only once)
    $needles = array();
    $entities = array();
    foreach (self::chr_size_list($str) as $index => $size) {
      if ($size < 3) {
        continue;
      }

      $char = self::access($str, $index);
      if (isset($entities[$char])) {
        continue;
      }

      $needles[$char] = $char;
      $entities[$char] = self::html_encode($char);
    }

    return str_replace($needles, $entities, $str);
  }
2061
2062 41
  /**
2063 41
   * Convert only special characters to HTML entities: UTF-8 version of htmlspecialchars()
2064
   *
2065 41
   * INFO: Take a look at "UTF8::htmlentities()"
2066 41
   *
2067
   * @link http://php.net/manual/en/function.htmlspecialchars.php
2068 41
   *
2069 6
   * @param string $str           <p>
2070 6
   *                              The string being converted.
2071
   *                              </p>
2072 41
   * @param int    $flags         [optional] <p>
2073 1
   *                              A bitmask of one or more of the following flags, which specify how to handle quotes,
2074 1
   *                              invalid code unit sequences and the used document type. The default is
2075
   *                              ENT_COMPAT | ENT_HTML401.
2076 41
   *                              <table>
2077 5
   *                              Available <i>flags</i> constants
2078 5
   *                              <tr valign="top">
2079
   *                              <td>Constant Name</td>
2080 41
   *                              <td>Description</td>
2081
   *                              </tr>
2082
   *                              <tr valign="top">
2083
   *                              <td><b>ENT_COMPAT</b></td>
2084
   *                              <td>Will convert double-quotes and leave single-quotes alone.</td>
2085
   *                              </tr>
2086
   *                              <tr valign="top">
2087
   *                              <td><b>ENT_QUOTES</b></td>
2088
   *                              <td>Will convert both double and single quotes.</td>
2089
   *                              </tr>
2090 4
   *                              <tr valign="top">
2091
   *                              <td><b>ENT_NOQUOTES</b></td>
2092 4
   *                              <td>Will leave both double and single quotes unconverted.</td>
2093
   *                              </tr>
2094 4
   *                              <tr valign="top">
2095 1
   *                              <td><b>ENT_IGNORE</b></td>
2096
   *                              <td>
2097
   *                              Silently discard invalid code unit sequences instead of returning
2098
   *                              an empty string. Using this flag is discouraged as it
2099 4
   *                              may have security implications.
2100
   *                              </td>
2101
   *                              </tr>
2102
   *                              <tr valign="top">
2103
   *                              <td><b>ENT_SUBSTITUTE</b></td>
2104
   *                              <td>
2105
   *                              Replace invalid code unit sequences with a Unicode Replacement Character
2106 4
   *                              U+FFFD (UTF-8) or &#38;#38;#FFFD; (otherwise) instead of returning an empty string.
2107
   *                              </td>
2108 4
   *                              </tr>
2109
   *                              <tr valign="top">
2110
   *                              <td><b>ENT_DISALLOWED</b></td>
2111
   *                              <td>
2112
   *                              Replace invalid code points for the given document type with a
2113
   *                              Unicode Replacement Character U+FFFD (UTF-8) or &#38;#38;#FFFD;
2114
   *                              (otherwise) instead of leaving them as is. This may be useful, for
2115
   *                              instance, to ensure the well-formedness of XML documents with
2116
   *                              embedded external content.
2117
   *                              </td>
2118
   *                              </tr>
2119
   *                              <tr valign="top">
2120
   *                              <td><b>ENT_HTML401</b></td>
2121
   *                              <td>
2122 5
   *                              Handle code as HTML 4.01.
2123
   *                              </td>
2124 5
   *                              </tr>
2125 5
   *                              <tr valign="top">
2126 5
   *                              <td><b>ENT_XML1</b></td>
2127
   *                              <td>
2128 5
   *                              Handle code as XML 1.
2129
   *                              </td>
2130 5
   *                              </tr>
2131 5
   *                              <tr valign="top">
2132 5
   *                              <td><b>ENT_XHTML</b></td>
2133
   *                              <td>
2134 5
   *                              Handle code as XHTML.
2135
   *                              </td>
2136 5
   *                              </tr>
2137 1
   *                              <tr valign="top">
2138
   *                              <td><b>ENT_HTML5</b></td>
2139 1
   *                              <td>
2140 1
   *                              Handle code as HTML 5.
2141 1
   *                              </td>
2142
   *                              </tr>
2143 1
   *                              </table>
2144 1
   *                              </p>
2145
   * @param string $encoding      [optional] <p>
2146 5
   *                              Defines encoding used in conversion.
2147
   *                              </p>
2148
   *                              <p>
2149
   *                              For the purposes of this function, the encodings
2150
   *                              ISO-8859-1, ISO-8859-15,
2151
   *                              UTF-8, cp866,
2152
   *                              cp1251, cp1252, and
2153
   *                              KOI8-R are effectively equivalent, provided the
2154
   *                              <i>string</i> itself is valid for the encoding, as
2155
   *                              the characters affected by <b>htmlspecialchars</b> occupy
2156
   *                              the same positions in all of these encodings.
2157
   *                              </p>
2158 6
   * @param bool   $double_encode [optional] <p>
2159
   *                              When <i>double_encode</i> is turned off PHP will not
2160 6
   *                              encode existing html entities, the default is to convert everything.
2161
   *                              </p>
2162
   *
2163
   * @return string The converted string.
2164
   * </p>
2165
   * <p>
2166
   * If the input <i>string</i> contains an invalid code unit
2167
   * sequence within the given <i>encoding</i> an empty string
2168
   * will be returned, unless either the <b>ENT_IGNORE</b> or
2169
   * <b>ENT_SUBSTITUTE</b> flags are set.
2170 1
   */
2171
  public static function htmlspecialchars($str, $flags = ENT_COMPAT, $encoding = 'UTF-8', $double_encode = true)
  {
    // the default "UTF-8" is used as-is, every other name is normalized first
    $charset = ($encoding === 'UTF-8') ? $encoding : self::normalize_encoding($encoding);

    return htmlspecialchars($str, $flags, $charset, $double_encode);
  }
2179
2180
  /**
   * Tells whether the "iconv" extension is loaded.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function iconv_loaded()
  {
    // extension_loaded() already returns a boolean
    return extension_loaded('iconv');
  }
2189
2190
  /**
   * Converts an integer to its hexadecimal code point representation (prefix + hex digits).
   *
   * INFO: opposite to UTF8::hex_to_int()
   *
   * @param int    $int  <p>The integer to be converted to hexadecimal code point.</p>
   * @param string $pfix [optional] <p>Prefix put in front of the hex digits, default is "U+".</p>
   *
   * @return string <p>The code point, or empty string on failure.</p>
   */
  public static function int_to_hex($int, $pfix = 'U+')
  {
    // only plain non-negative decimal digits are accepted
    if (!ctype_digit((string)$int)) {
      return '';
    }

    // left-pad with zeros to at least four hex digits
    return $pfix . str_pad(dechex((int)$int), 4, '0', STR_PAD_LEFT);
  }
2212
2213 1
  /**
   * Tells whether the "IntlChar" class can be used: needs PHP 7.0+ and the class itself.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function intlChar_loaded()
  {
    if (Bootup::is_php('7.0') !== true) {
      return false;
    }

    return class_exists('IntlChar');
  }
2222 11
2223 1
  /**
   * Reports whether the "intl" PHP extension is loaded.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function intl_loaded()
  {
    return extension_loaded('intl') === true;
  }
2232 2
2233
  /**
   * Alias of "UTF8::is_ascii()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_ascii()
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool
   */
  public static function isAscii($str)
  {
    return self::is_ascii($str);
  }
2246 2
2247
  /**
   * Alias of "UTF8::is_base64()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_base64()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function isBase64($str)
  {
    return self::is_base64($str);
  }
2260
2261
  /**
   * Alias of "UTF8::is_binary()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_binary()
   *
   * @param string $str <p>The input to check.</p>
   *
   * @return bool
   */
  public static function isBinary($str)
  {
    return self::is_binary($str);
  }
2274
2275
  /**
   * Alias of "UTF8::is_bom()": forwards $utf8_chr unchanged and returns the result.
   *
   * @see UTF8::is_bom()
   *
   * @param string $utf8_chr <p>The string to compare against the known byte order marks.</p>
   *
   * @return bool
   */
  public static function isBom($utf8_chr)
  {
    return self::is_bom($utf8_chr);
  }
2288
2289
  /**
   * Alias of "UTF8::is_html()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_html()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function isHtml($str)
  {
    return self::is_html($str);
  }
2302
2303
  /**
   * Alias of "UTF8::is_json()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_json()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function isJson($str)
  {
    return self::is_json($str);
  }
2316
2317
  /**
   * Alias of "UTF8::is_utf16()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_utf16()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p><strong>false</strong> if it's not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE.</p>
   */
  public static function isUtf16($str)
  {
    return self::is_utf16($str);
  }
2330
2331
  /**
   * Alias of "UTF8::is_utf32()": forwards $str unchanged and returns the result.
   *
   * @see UTF8::is_utf32()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return int|false <p><strong>false</strong> if it's not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE.</p>
   */
  public static function isUtf32($str)
  {
    return self::is_utf32($str);
  }
2344 2
2345 2
  /**
   * Alias of "UTF8::is_utf8()": forwards both arguments unchanged and returns the result.
   *
   * @see UTF8::is_utf8()
   *
   * @param string $str    <p>The string to be checked.</p>
   * @param bool   $strict <p>Passed through as the $strict argument of UTF8::is_utf8().</p>
   *
   * @return bool
   */
  public static function isUtf8($str, $strict = false)
  {
    return self::is_utf8($str, $strict);
  }
2359
2360
  /**
   * Checks if a string is 7 bit ASCII.
   *
   * @param string $str <p>The string to check.</p>
   *
   * @return bool <p>
   *              <strong>true</strong> if no byte in the range 0x80-0xFF is present<br />
   *              <strong>false</strong> otherwise
   *              </p>
   */
  public static function is_ascii($str)
  {
    // A single byte with the high bit set disqualifies the string.
    return preg_match('/[\x80-\xFF]/', $str) !== 1;
  }
2374 1
2375
  /**
   * Returns true if the string is base64 encoded, false otherwise.
   *
   * The check is a strict decode followed by a re-encode: only input that
   * survives the round trip byte-for-byte is accepted.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p>Whether or not $str is base64 encoded.</p>
   */
  public static function is_base64($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $decoded = base64_decode($str, true);
    if ($decoded === false) {
      // Strict decoding rejected the input (characters outside the alphabet).
      return false;
    }

    return base64_encode($decoded) === $str;
  }
2396
2397
  /**
   * Check if the input is binary (heuristic).
   *
   * The input counts as binary when it consists solely of the characters
   * "0" and "1", or when it contains at least one NUL byte.
   *
   * @param mixed $input <p>Value to inspect; it is cast to string first.</p>
   *
   * @return bool
   */
  public static function is_binary($input)
  {
    // Normalize once so the string functions below never receive null etc.
    $input = (string)$input;

    // A textual bit string such as "010011".
    if (preg_match('~^[01]+$~', $input)) {
      return true;
    }

    // NOTE: the former third check counted the literal 4-byte substring
    // '^ -~' and required it to make up more than 30% of the input. A 4-byte
    // needle can never exceed a ratio of 0.25, so that branch was unreachable
    // and has been removed without changing any result.
    return substr_count($input, "\x00") > 0;
  }
2421 6
2422
  /**
   * Check if the file is binary.
   *
   * Reads up to the first 512 bytes of the file and passes them to
   * UTF8::is_binary(). If the file cannot be opened or read, an empty block
   * is checked instead.
   *
   * @param string $file <p>Path of the file to inspect.</p>
   *
   * @return bool
   */
  public static function is_binary_file($file)
  {
    $block = '';
    $fp = false;

    try {
      // fopen() signals failure by returning false (plus a warning), not by
      // throwing, so the handle must be checked before it is used.
      $fp = fopen($file, 'r');
      if ($fp !== false) {
        $data = fread($fp, 512);
        fclose($fp);
        $fp = false;

        if ($data !== false) {
          $block = $data;
        }
      }
    } catch (\Exception $e) {
      // Reached only when an error handler turns warnings into exceptions.
      if (is_resource($fp)) {
        fclose($fp);
      }
      $block = '';
    }

    return self::is_binary($block);
  }
2441
2442
  /**
   * Checks if the given string is equal to any "Byte Order Mark".
   *
   * WARNING: Use "UTF8::string_has_bom()" if you will check BOM in a string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if $str is identical to one of the keys of self::$bom, <strong>false</strong> otherwise.</p>
   */
  public static function is_bom($str)
  {
    // The byte order marks are stored as the array keys of self::$bom.
    return in_array($str, array_keys(self::$bom), true);
  }
2461
2462
  /**
   * Check if the string contains any html-tags.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if at least one opening, closing or self-closing tag is matched.</p>
   */
  public static function is_html($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    // One match is enough; the captured groups themselves are not needed.
    return preg_match("/<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>/", $str) === 1;
  }
2488
2489
  /**
   * Try to check if "$str" is a JSON string.
   *
   * The string is decoded via UTF8::json_decode(); it is accepted when the
   * decoder reports no error and the top-level value is a JSON object or a
   * JSON array. Bare scalars (e.g. "123") are not accepted.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function is_json($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return false;
    }

    $decoded = self::json_decode($str);

    // Top-level arrays such as "[1,2]" decode to a PHP array, not an object,
    // so both container types have to be accepted here.
    return (is_object($decoded) || is_array($decoded))
           &&
           json_last_error() === JSON_ERROR_NONE;
  }
2514 1
2515 1
  /**
2516 1
   * Check if the string is UTF-16.
2517 1
   *
2518
   * @param string $str <p>The input string.</p>
2519
   *
2520 1
   * @return int|false <p>
2521
   *                   <strong>false</strong> if it's not UTF-16,<br />
2522
   *                   <strong>1</strong> for UTF-16LE,<br />
2523
   *                   <strong>2</strong> for UTF-16BE.
2524
   *                   </p>
2525
   */
2526 View Code Duplication
  public static function is_utf16($str)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2527
  {
2528
    $str = self::remove_bom($str);
2529
2530
    if (self::is_binary($str)) {
2531 1
2532
      $maybeUTF16LE = 0;
2533 1
      $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16LE');
2534
      if ($test) {
2535
        $test2 = \mb_convert_encoding($test, 'UTF-16LE', 'UTF-8');
2536
        $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE');
2537
        if ($test3 === $test) {
2538
          $strChars = self::count_chars($str, true);
2539
          foreach (self::count_chars($test3, true) as $test3char => $test3charEmpty) {
2540
            if (in_array($test3char, $strChars, true) === true) {
2541
              $maybeUTF16LE++;
2542
            }
2543
          }
2544
        }
2545 7
      }
2546
2547 7
      $maybeUTF16BE = 0;
2548 7
      $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-16BE');
2549
      if ($test) {
2550 7
        $test2 = \mb_convert_encoding($test, 'UTF-16BE', 'UTF-8');
2551
        $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE');
2552 7
        if ($test3 === $test) {
2553 2
          $strChars = self::count_chars($str, true);
2554
          foreach (self::count_chars($test3, true) as $test3char => $test3charEmpty) {
2555
            if (in_array($test3char, $strChars, true) === true) {
2556 7
              $maybeUTF16BE++;
2557 1
            }
2558 1
          }
2559 1
        }
2560
      }
2561 7
2562
      if ($maybeUTF16BE !== $maybeUTF16LE) {
2563
        if ($maybeUTF16LE > $maybeUTF16BE) {
2564
          return 1;
2565
        } else {
2566
          return 2;
2567
        }
2568
      }
2569
2570
    }
2571 1
2572
    return false;
2573 1
  }
2574
2575 1
  /**
2576
   * Check if the string is UTF-32.
2577
   *
2578 1
   * @param string $str
2579 1
   *
2580
   * @return int|false <p>
2581 1
   *                   <strong>false</strong> if it's not UTF-32,<br />
2582
   *                   <strong>1</strong> for UTF-32LE,<br />
2583
   *                   <strong>2</strong> for UTF-32BE.
2584 1
   *                   </p>
2585 1
   */
2586 1 View Code Duplication
  public static function is_utf32($str)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2587 1
  {
2588 1
    $str = self::remove_bom($str);
2589
2590 1
    if (self::is_binary($str)) {
2591
2592
      $maybeUTF32LE = 0;
2593
      $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-32LE');
2594
      if ($test) {
2595
        $test2 = \mb_convert_encoding($test, 'UTF-32LE', 'UTF-8');
2596
        $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-32LE');
2597
        if ($test3 === $test) {
2598
          $strChars = self::count_chars($str, true);
2599
          foreach (self::count_chars($test3, true) as $test3char => $test3charEmpty) {
2600 1
            if (in_array($test3char, $strChars, true) === true) {
2601
              $maybeUTF32LE++;
2602
            }
2603 1
          }
2604
        }
2605 1
      }
2606
2607
      $maybeUTF32BE = 0;
2608
      $test = \mb_convert_encoding($str, 'UTF-8', 'UTF-32BE');
2609
      if ($test) {
2610
        $test2 = \mb_convert_encoding($test, 'UTF-32BE', 'UTF-8');
2611
        $test3 = \mb_convert_encoding($test2, 'UTF-8', 'UTF-32BE');
2612
        if ($test3 === $test) {
2613
          $strChars = self::count_chars($str, true);
2614
          foreach (self::count_chars($test3, true) as $test3char => $test3charEmpty) {
2615
            if (in_array($test3char, $strChars, true) === true) {
2616
              $maybeUTF32BE++;
2617
            }
2618
          }
2619
        }
2620
      }
2621 1
2622
      if ($maybeUTF32BE !== $maybeUTF32LE) {
2623 1
        if ($maybeUTF32LE > $maybeUTF32BE) {
2624 1
          return 1;
2625
        } else {
2626
          return 2;
2627 1
        }
2628
      }
2629 1
2630 1
    }
2631 1
2632 1
    return false;
2633 1
  }
2634 1
2635 1
  /**
2636 1
   * Checks whether the passed string contains only byte sequences that appear valid UTF-8 characters.
2637 1
   *
2638 1
   * @see    http://hsivonen.iki.fi/php-utf8/
2639 1
   *
2640
   * @param string $str    <p>The string to be checked.</p>
2641
   * @param bool   $strict <p>Check also if the string is not UTF-16 or UTF-32.</p>
2642
   *
2643
   * @return bool
2644
   */
2645
  public static function is_utf8($str, $strict = false)
2646
  {
2647
    $str = (string)$str;
2648
2649
    if (!isset($str[0])) {
2650
      return true;
2651
    }
2652
2653
    if ($strict === true) {
2654
      if (self::is_utf16($str) !== false) {
2655
        return false;
2656
      }
2657
2658
      if (self::is_utf32($str) !== false) {
2659 1
        return false;
2660 1
      }
2661
    }
2662
2663
    if (self::pcre_utf8_support() !== true) {
2664
2665
      // If even just the first character can be matched, when the /u
2666
      // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
2667
      // invalid, nothing at all will match, even if the string contains
2668
      // some valid sequences
2669
      return (preg_match('/^.{1}/us', $str, $ar) === 1);
2670
2671
    } else {
2672
2673
      $mState = 0; // cached expected number of octets after the current octet
2674
      // until the beginning of the next UTF8 character sequence
2675
      $mUcs4 = 0; // cached Unicode character
2676
      $mBytes = 1; // cached expected number of octets in the current sequence
2677
      $len = strlen($str);
2678
2679
      /** @noinspection ForeachInvariantsInspection */
2680
      for ($i = 0; $i < $len; $i++) {
2681
        $in = ord($str[$i]);
2682
        if ($mState === 0) {
2683
          // When mState is zero we expect either a US-ASCII character or a
2684
          // multi-octet sequence.
2685
          if (0 === (0x80 & $in)) {
2686
            // US-ASCII, pass straight through.
2687
            $mBytes = 1;
2688 View Code Duplication
          } elseif (0xC0 === (0xE0 & $in)) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2689
            // First octet of 2 octet sequence.
2690
            $mUcs4 = $in;
2691
            $mUcs4 = ($mUcs4 & 0x1F) << 6;
2692
            $mState = 1;
2693
            $mBytes = 2;
2694
          } elseif (0xE0 === (0xF0 & $in)) {
2695
            // First octet of 3 octet sequence.
2696
            $mUcs4 = $in;
2697
            $mUcs4 = ($mUcs4 & 0x0F) << 12;
2698
            $mState = 2;
2699
            $mBytes = 3;
2700 View Code Duplication
          } elseif (0xF0 === (0xF8 & $in)) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2701
            // First octet of 4 octet sequence.
2702
            $mUcs4 = $in;
2703
            $mUcs4 = ($mUcs4 & 0x07) << 18;
2704
            $mState = 3;
2705
            $mBytes = 4;
2706
          } elseif (0xF8 === (0xFC & $in)) {
2707
            /* First octet of 5 octet sequence.
2708
            *
2709
            * This is illegal because the encoded codepoint must be either
2710
            * (a) not the shortest form or
2711
            * (b) outside the Unicode range of 0-0x10FFFF.
2712
            * Rather than trying to resynchronize, we will carry on until the end
2713
            * of the sequence and let the later error handling code catch it.
2714
            */
2715
            $mUcs4 = $in;
2716
            $mUcs4 = ($mUcs4 & 0x03) << 24;
2717
            $mState = 4;
2718
            $mBytes = 5;
2719 1 View Code Duplication
          } elseif (0xFC === (0xFE & $in)) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2720
            // First octet of 6 octet sequence, see comments for 5 octet sequence.
2721 1
            $mUcs4 = $in;
2722 1
            $mUcs4 = ($mUcs4 & 1) << 30;
2723
            $mState = 5;
2724 1
            $mBytes = 6;
2725
          } else {
2726
            /* Current octet is neither in the US-ASCII range nor a legal first
2727
             * octet of a multi-octet sequence.
2728
             */
2729
            return false;
2730
          }
2731
        } else {
2732
          // When mState is non-zero, we expect a continuation of the multi-octet
2733
          // sequence
2734
          if (0x80 === (0xC0 & $in)) {
2735
            // Legal continuation.
2736
            $shift = ($mState - 1) * 6;
2737
            $tmp = $in;
2738
            $tmp = ($tmp & 0x0000003F) << $shift;
2739 2
            $mUcs4 |= $tmp;
2740
            /**
2741 2
             * End of the multi-octet sequence. mUcs4 now contains the final
2742 1
             * Unicode code point to be output
2743
             */
2744
            if (0 === --$mState) {
2745 1
              /*
2746
              * Check for illegal sequences and code points.
2747
              */
2748
              // From Unicode 3.1, non-shortest form is illegal
2749
              if (
2750
                  (2 === $mBytes && $mUcs4 < 0x0080) ||
2751
                  (3 === $mBytes && $mUcs4 < 0x0800) ||
2752
                  (4 === $mBytes && $mUcs4 < 0x10000) ||
2753
                  (4 < $mBytes) ||
2754
                  // From Unicode 3.2, surrogate characters are illegal.
2755
                  (($mUcs4 & 0xFFFFF800) === 0xD800) ||
2756
                  // Code points outside the Unicode range are illegal.
2757
                  ($mUcs4 > 0x10FFFF)
2758
              ) {
2759 1
                return false;
2760
              }
2761 1
              // initialize UTF8 cache
2762
              $mState = 0;
2763
              $mUcs4 = 0;
2764
              $mBytes = 1;
2765
            }
2766
          } else {
2767
            /**
2768
             *((0xC0 & (*in) != 0x80) && (mState != 0))
2769
             * Incomplete multi-octet sequence.
2770
             */
2771
            return false;
2772
          }
2773
        }
2774
      }
2775 2
2776
      return true;
2777
    }
2778 2
  }
2779
2780 2
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Decodes a JSON string.
   *
   * The input is passed through UTF8::filter() before PHP's native
   * json_decode() is called. The "$options" bitmask is only forwarded when
   * Bootup::is_php('5.4') reports true.
   *
   * @link http://php.net/manual/en/function.json-decode.php
   *
   * @param string $json    <p>The <i>json</i> string being decoded.</p>
   * @param bool   $assoc   [optional] <p>
   *                        When <b>TRUE</b>, returned objects will be converted into
   *                        associative arrays.
   *                        </p>
   * @param int    $depth   [optional] <p>User specified recursion depth.</p>
   * @param int    $options [optional] <p>
   *                        Bitmask of JSON decode options, e.g. <b>JSON_BIGINT_AS_STRING</b>.
   *                        </p>
   *
   * @return mixed <p>The value encoded in <i>json</i> in the appropriate PHP type;
   *               <b>NULL</b> is returned if the <i>json</i> cannot be decoded or
   *               if the encoded data is deeper than the recursion limit.</p>
   */
  public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
  {
    $filtered = self::filter($json);

    // Without the 5.4 check passing, the native call is made without $options.
    if (Bootup::is_php('5.4') !== true) {
      return json_decode($filtered, $assoc, $depth);
    }

    return json_decode($filtered, $assoc, $depth, $options);
  }
2828
2829
  /**
   * (PHP 5 &gt;= 5.2.0, PECL json &gt;= 1.2.0)<br/>
   * Returns the JSON representation of a value.
   *
   * The value is passed through UTF8::filter() before PHP's native
   * json_encode() is called. The "$depth" argument is only forwarded when
   * Bootup::is_php('5.5') reports a truthy result.
   *
   * @link http://php.net/manual/en/function.json-encode.php
   *
   * @param mixed $value   <p>
   *                       The <i>value</i> being encoded. Can be any type except
   *                       a resource.
   *                       </p>
   * @param int   $options [optional] <p>
   *                       Bitmask of the native JSON_* encode constants, e.g.
   *                       <b>JSON_HEX_TAG</b>, <b>JSON_PRETTY_PRINT</b>,
   *                       <b>JSON_UNESCAPED_SLASHES</b>, <b>JSON_UNESCAPED_UNICODE</b>.
   *                       </p>
   * @param int   $depth   [optional] <p>
   *                       Set the maximum depth. Must be greater than zero.
   *                       </p>
   *
   * @return string|false a JSON encoded string on success or <b>FALSE</b> on failure.
   */
  public static function json_encode($value, $options = 0, $depth = 512)
  {
    $filtered = self::filter($value);

    // Without the 5.5 check passing, the native call is made without $depth.
    if (!Bootup::is_php('5.5')) {
      return json_encode($filtered, $options);
    }

    return json_encode($filtered, $options, $depth);
  }
2877 17
2878
  /**
   * Makes string's first char lowercase.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The resulting string</p>
   */
  public static function lcfirst($str)
  {
    // UTF8::substr() can yield false instead of a string; cast both pieces so
    // strtolower() and the concatenation always operate on real strings.
    $first = (string)self::substr($str, 0, 1);
    $rest = (string)self::substr($str, 1);

    return self::strtolower($first) . $rest;
  }
2889 17
2890
  /**
2891 17
   * Strip whitespace or other characters from beginning of a UTF-8 string.
2892 4
   *
2893
   * @param string $str   <p>The string to be trimmed</p>
2894
   * @param string $chars <p>Optional characters to be stripped</p>
2895 4
   *
2896
   * @return string <p>The string with unwanted characters stripped from the left.</p>
2897 4
   */
2898 View Code Duplication
  public static function ltrim($str = '', $chars = INF)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2899
  {
2900 17
    $str = (string)$str;
2901
2902
    if (!isset($str[0])) {
2903 14
      return '';
2904
    }
2905 14
2906 14
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
2907
    if ($chars === INF || !$chars) {
2908 6
      return preg_replace('/^[\pZ\pC]+/u', '', $str);
2909
    }
2910 17
2911
    $chars = INF === $chars ? '\s' : self::rxClass($chars);
2912
2913 17
    return preg_replace("/^{$chars}+/u", '', $str);
2914 17
  }
2915 17
2916
  /**
2917 17
   * Returns the UTF-8 character with the maximum code point in the given data.
2918
   *
2919 17
   * @param mixed $arg <p>A UTF-8 encoded string or an array of such strings.</p>
2920
   *
2921 17
   * @return string <p>The character with the highest code point in the given data.</p>
2922
   */
2923 View Code Duplication
  public static function max($arg)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2924
  {
2925
    if (is_array($arg)) {
2926
      $arg = implode($arg);
2927
    }
2928
2929
    return self::chr(max(self::codepoints($arg)));
2930
  }
2931
2932
  /**
   * Calculates and returns the maximum number of bytes taken by any
   * UTF-8 encoded character in the given string.
   *
   * @param string $str <p>The original Unicode string.</p>
   *
   * @return int <p>Largest entry of UTF8::chr_size_list($str), or 0 when that list is empty.</p>
   */
  public static function max_chr_width($str)
  {
    $sizes = self::chr_size_list($str);

    return count($sizes) > 0 ? (int)max($sizes) : 0;
  }
2949
2950
  /**
   * Checks whether mbstring is available on the server.
   *
   * Side effect: when the extension is loaded, the mbstring internal
   * encoding is set to UTF-8.
   *
   * @return bool <p><strong>true</strong> if available, <strong>false</strong> otherwise.</p>
   */
  public static function mbstring_loaded()
  {
    if (extension_loaded('mbstring') !== true) {
      return false;
    }

    \mb_internal_encoding('UTF-8');

    return true;
  }
2965
2966
  /**
2967
   * Returns the UTF-8 character with the minimum code point in the given data.
2968
   *
2969
   * @param mixed $arg <strong>A UTF-8 encoded string or an array of such strings.</strong>
2970
   *
2971
   * @return string <p>The character with the lowest code point in the given data.</p>
2972
   */
2973 View Code Duplication
  public static function min($arg)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
2974
  {
2975
    if (is_array($arg)) {
2976
      $arg = implode($arg);
2977
    }
2978
2979
    return self::chr(min(self::codepoints($arg)));
2980
  }
2981
2982
  /**
   * Alias of "UTF8::normalize_encoding()": forwards $encoding unchanged and returns the result.
   *
   * @see UTF8::normalize_encoding()
   *
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
   *
   * @return string|false <p>Whatever UTF8::normalize_encoding() returns (false for an empty $encoding).</p>
   */
  public static function normalizeEncoding($encoding)
  {
    return self::normalize_encoding($encoding);
  }
2995
2996
  /**
   * Normalize the encoding-"name" input.
   *
   * Resolution order: empty input, the literal "UTF-8", names listed in
   * self::$iconvEncoding, previously cached results, and finally the alias
   * table below. Names that match no alias are returned upper-cased.
   *
   * @param string $encoding <p>e.g.: ISO, UTF8, WINDOWS-1251 etc.</p>
   *
   * @return string|false <p>e.g.: ISO-8859-1, UTF-8, WINDOWS-1251 etc.;
   *                      <strong>false</strong> if $encoding is empty.</p>
   */
  public static function normalize_encoding($encoding)
  {
    static $staticNormalizeEncodingCache = array();

    if (!$encoding) {
      // Callers receive false (not an empty string) for a missing encoding.
      return false;
    }

    if ('UTF-8' === $encoding) {
      return $encoding;
    }

    if (in_array($encoding, self::$iconvEncoding, true)) {
      return $encoding;
    }

    if (isset($staticNormalizeEncodingCache[$encoding])) {
      return $staticNormalizeEncodingCache[$encoding];
    }

    $encodingOrig = $encoding;
    $encoding = strtoupper($encoding);

    // Lookup key: the upper-cased name without punctuation, e.g. "UTF-8" => "UTF8".
    $encodingUpperHelper = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);

    $equivalences = array(
        'ISO88591'    => 'ISO-8859-1',
        'ISO8859'     => 'ISO-8859-1',
        'ISO'         => 'ISO-8859-1',
        'LATIN1'      => 'ISO-8859-1',
        'LATIN'       => 'ISO-8859-1',
        'WIN1252'     => 'ISO-8859-1',
        'WINDOWS1252' => 'ISO-8859-1',
        'UTF16'       => 'UTF-16',
        'UTF32'       => 'UTF-32',
        'UTF8'        => 'UTF-8',
        'UTF'         => 'UTF-8',
        'UTF7'        => 'UTF-7',
        '8BIT'        => 'CP850',
        'BINARY'      => 'CP850',
    );

    if (isset($equivalences[$encodingUpperHelper])) {
      $encoding = $equivalences[$encodingUpperHelper];
    }

    // Cache under the caller's original spelling so repeated lookups are cheap.
    $staticNormalizeEncodingCache[$encodingOrig] = $encoding;

    return $encoding;
  }
3052
3053
  /**
   * Normalize some MS Word special characters.
   *
   * Every key of the "self::$utf8MSWord" table that occurs in the string is replaced by the value mapped to it.
   *
   * @param string $str <p>The string to be normalized.</p>
   *
   * @return string <p>The string with the mapped characters replaced.</p>
   */
  public static function normalize_msword($str)
  {
    static $search = null;
    static $replacements = null;

    // split the lookup table into its two columns only once
    if (null === $search) {
      $search = array_keys(self::$utf8MSWord);
      $replacements = array_values(self::$utf8MSWord);
    }

    $normalized = str_replace($search, $replacements, $str);

    return $normalized;
  }
3072
3073
  /**
   * Normalize the whitespace.
   *
   * Every character of "self::$whitespaceTable" is replaced by a plain space; the characters of
   * "self::$bidiUniCodeControlsTable" are removed, unless they should be kept.
   *
   * @param string $str                     <p>The string to be normalized.</p>
   * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
   * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
   *                                        bidirectional text chars.</p>
   *
   * @return string
   */
  public static function normalize_whitespace($str, $keepNonBreakingSpace = false, $keepBidiUnicodeControls = false)
  {
    static $whitespaces = array();
    static $bidiUniCodeControls = null;

    // Only a strict "true" keeps the non-breaking space, so the cache slot must be derived from the very same
    // check. Otherwise a loosely truthy value (e.g. 1) would fill the "keep" slot with the complete table and
    // every later call with "true" would replace the non-breaking space anyway.
    $keepNbsp = ($keepNonBreakingSpace === true);
    $cacheKey = (int)$keepNbsp;

    if (!isset($whitespaces[$cacheKey])) {

      $table = self::$whitespaceTable;

      if ($keepNbsp === true) {
        unset($table['NO-BREAK SPACE']);
      }

      $whitespaces[$cacheKey] = array_values($table);
    }

    if ($keepBidiUnicodeControls === false) {
      if ($bidiUniCodeControls === null) {
        $bidiUniCodeControls = array_values(self::$bidiUniCodeControlsTable);
      }

      $str = str_replace($bidiUniCodeControls, '', $str);
    }

    return str_replace($whitespaces[$cacheKey], ' ', $str);
  }
3112
3113
  /**
   * Format a number with grouped thousands.
   *
   * @param float  $number
   * @param int    $decimals
   * @param string $dec_point
   * @param string $thousands_sep
   *
   * @return string
   *
   * @deprecated Because this has nothing to do with UTF8. :/
   */
  public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
  {
    $thousands_sep = (string)$thousands_sep;
    $dec_point = (string)$dec_point;

    // NOTE(review): this branch is only taken if BOTH separators are longer than one byte and
    // "Bootup::is_php('5.4')" is true - confirm that this is the intended condition for the workaround.
    if (
        isset($thousands_sep[1], $dec_point[1])
        &&
        Bootup::is_php('5.4') === true
    ) {
      // "strtr()" replaces both placeholders in one pass, so a "," or "." inside of the custom separators
      // is never replaced a second time (a sequential "str_replace()" would do that).
      return strtr(
          number_format($number, $decimals, '.', ','),
          array(
              '.' => $dec_point,
              ',' => $thousands_sep,
          )
      );
    }

    return number_format($number, $decimals, $dec_point, $thousands_sep);
  }
3150
3151
  /**
   * Calculates Unicode code point of the given UTF-8 encoded character.
   *
   * INFO: opposite to UTF8::chr()
   *
   * @param string $chr <p>The character of which to calculate code point.</p>
   *
   * @return int <p>
   *             Unicode code point of the given character,<br />
   *             0 for an empty input.<br />
   *             If the bytes that the first byte announces are missing, the manual fallback returns the
   *             value of the first byte.
   *             </p>
   */
  public static function ord($chr)
  {
    // "0" is falsy, but it is a valid character
    if ($chr !== '0' && !$chr) {
      return 0;
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // prefer the intl-implementation, but fall through if it has no (non-zero) answer
    if (self::$support['intlChar'] === true) {
      $codePoint = \IntlChar::ord($chr);
      if ($codePoint) {
        return $codePoint;
      }
    }

    // manual decoding: at most four bytes, indexed from 1 by "unpack()"
    $bytes = unpack('C*', substr($chr, 0, 4));
    $lead = $bytes ? $bytes[1] : 0;

    // four-byte sequence
    if ($lead >= 0xF0 && isset($bytes[4])) {
      return (($lead - 0xF0) << 18) + (($bytes[2] - 0x80) << 12) + (($bytes[3] - 0x80) << 6) + $bytes[4] - 0x80;
    }

    // three-byte sequence
    if ($lead >= 0xE0 && isset($bytes[3])) {
      return (($lead - 0xE0) << 12) + (($bytes[2] - 0x80) << 6) + $bytes[3] - 0x80;
    }

    // two-byte sequence
    if ($lead >= 0xC0 && isset($bytes[2])) {
      return (($lead - 0xC0) << 6) + $bytes[2] - 0x80;
    }

    // single byte (or a lead byte without the bytes that should follow)
    return $lead;
  }
3197 2
3198
  /**
   * Parses the string into an array (into the second parameter).
   *
   * WARNING: Unlike "parse_str()", this method never sets variables in the current scope;
   *          the result is only written into the second parameter.
   *
   * @link http://php.net/manual/en/function.parse-str.php
   *
   * @param string $str    <p>The input string.</p>
   * @param array  $result <p>The result will be returned into this reference parameter.</p>
   *
   * @return bool <p>Will return <strong>false</strong> if php can't parse the string or the $result is empty.</p>
   */
  public static function parse_str($str, &$result)
  {
    $cleaned = self::clean($str);
    $parsed = \mb_parse_str($cleaned, $result);

    // success means: the parser did not fail and it produced at least one entry
    return $parsed !== false && !empty($result);
  }
3223
3224
  /**
   * Checks if the "/u" pattern modifier is available that enables Unicode support in PCRE.
   *
   * @return bool <p><strong>true</strong> if support is available, <strong>false</strong> otherwise.</p>
   */
  public static function pcre_utf8_support()
  {
    // an empty pattern with the "u"-modifier: warnings are silenced, only the result of the call counts
    /** @noinspection PhpUsageOfSilenceOperatorInspection */
    $probe = @preg_match('//u', '');

    return (bool)$probe;
  }
3234 2
3235
  /**
   * Create an array containing a range of UTF-8 characters.
   *
   * Each boundary is interpreted in this order: decimal digits, hexadecimal digits, single UTF-8 character.
   * INFO: a boundary that consists only of the letters a-f / A-F is therefore taken as hexadecimal number.
   *
   * @param mixed $var1 <p>Numeric or hexadecimal code points, or a UTF-8 character to start from.</p>
   * @param mixed $var2 <p>Numeric or hexadecimal code points, or a UTF-8 character to end at.</p>
   *
   * @return array <p>The characters of the range, or an empty array if a boundary is empty or resolves to 0.</p>
   */
  public static function range($var1, $var2)
  {
    if (!$var1 || !$var2) {
      return array();
    }

    $start = self::rangeBoundaryToCodePoint($var1);
    if (!$start) {
      return array();
    }

    $end = self::rangeBoundaryToCodePoint($var2);
    if (!$end) {
      return array();
    }

    return array_map(
        array(
            '\\voku\\helper\\UTF8',
            'chr',
        ),
        range($start, $end)
    );
  }

  /**
   * Resolve one boundary of "UTF8::range()" into a code point.
   *
   * @param mixed $var <p>Decimal or hexadecimal code point, or a UTF-8 character.</p>
   *
   * @return int <p>The code point (a falsy value means that the boundary could not be resolved).</p>
   */
  private static function rangeBoundaryToCodePoint($var)
  {
    // both ctype-checks get the string representation, non-string input for "ctype_*" is deprecated
    $asString = (string)$var;

    if (ctype_digit($asString)) {
      return (int)$var;
    }

    if (ctype_xdigit($asString)) {
      return (int)self::hex_to_int($var);
    }

    return self::ord($var);
  }
3281
3282
  /**
   * Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.
   *
   * Every entry of "self::$bom" (BOM => byte length) is checked, in table order, against the start of the string.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>String without UTF-BOM</p>
   */
  public static function remove_bom($str)
  {
    foreach (self::$bom as $bomString => $bomByteLength) {
      if (0 === strpos($str, $bomString)) {
        // "substr()" returns false on old PHP versions if nothing is left after the BOM,
        // the cast guarantees the documented string result (an empty string)
        $str = (string)substr($str, $bomByteLength);
      }
    }

    return $str;
  }
3299
3300
  /**
   * Alias of "UTF8::remove_bom()".
   *
   * @see UTF8::remove_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>Exactly what "UTF8::remove_bom()" returns for the given string.</p>
   */
  public static function removeBOM($str)
  {
    $withoutBom = self::remove_bom($str);

    return $withoutBom;
  }
3313
3314
  /**
   * Removes duplicate occurrences of a string in another string.
   *
   * Every run of directly repeated occurrences of a needle is collapsed into one single occurrence.
   *
   * @param string          $str  <p>The base string.</p>
   * @param string|string[] $what <p>String to search for in the base string.</p>
   *
   * @return string <p>The result string with removed duplicates.</p>
   */
  public static function remove_duplicates($str, $what = ' ')
  {
    if (is_string($what)) {
      $what = array($what);
    }

    if (is_array($what)) {
      foreach ($what as $item) {
        // The captured group always holds exactly one occurrence of the needle, so "$1" is used as replacement.
        // The raw needle must not be used here: a needle like "$0" would be read as back-reference.
        $str = preg_replace('/(' . preg_quote($item, '/') . ')+/', '$1', $str);
      }
    }

    return $str;
  }
3336
3337
  /**
   * Remove invisible characters from a string.
   *
   * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
   *
   * copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
   *
   * INFO: The replacement is repeated until nothing matches anymore, so the $replacement itself
   *       must not contain one of the removed characters.
   *
   * @param string $str         <p>The input string.</p>
   * @param bool   $url_encoded [optional] <p>Also remove the url-encoded form (e.g. "%00") of these characters.</p>
   * @param string $replacement [optional] <p>The replacement for every match.</p>
   *
   * @return string
   */
  public static function remove_invisible_characters($str, $url_encoded = true, $replacement = '')
  {
    // init
    $non_displayables = array();

    // Matched are all control characters except the horizontal tab, the newline and the carriage return.
    if ($url_encoded) {
      // Percent-encoding is case-insensitive ("%0b" and "%0B" are the same byte), hence the "i"-modifier.
      $non_displayables[] = '/%0[0-8bcef]/i'; // url-encoded form of the bytes below 16 (without tab, LF, CR)
      $non_displayables[] = '/%1[0-9a-f]/i'; // url-encoded form of the bytes from 16 up to 31
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // the raw control bytes and DEL

    // repeat, because a removal can build a new match (e.g. "%0%000" turns into "%00")
    do {
      $str = preg_replace($non_displayables, $replacement, $str, -1, $count);
    } while ($count !== 0);

    return $str;
  }
3370 1
3371
  /**
   * Replace the diamond question mark (�) with the replacement.
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown [optional] <p>The replacement for every diamond question mark.</p>
   *
   * @return string
   */
  public static function replace_diamond_question_mark($str, $unknown = '?')
  {
    // the escaped byte sequence and the literal character
    $diamonds = array(
        "\xEF\xBF\xBD",
        '�',
    );

    $replacements = array(
        $unknown,
        $unknown,
    );

    return str_replace($diamonds, $replacements, $str);
  }
3393
3394
  /**
   * Strip whitespace or other characters from end of a UTF-8 string.
   *
   * Without $chars, every trailing character of the Unicode categories "separator" (Z) and "other" (C)
   * is stripped.
   *
   * @param string $str   <p>The string to be trimmed.</p>
   * @param string $chars <p>Optional characters to be stripped.</p>
   *
   * @return string <p>The string with unwanted characters stripped from the right.</p>
   */
  public static function rtrim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    // INFO: "\z" is used instead of "$", because "$" also matches in front of a final newline
    //       and would then strip characters that are not at the very end of the string.

    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    // ("0" is falsy, but it is a valid character to strip)
    if ($chars === INF || (!$chars && $chars !== '0')) {
      return preg_replace('/[\pZ\pC]+\z/u', '', $str);
    }

    $chars = self::rxClass($chars);

    return preg_replace('/' . $chars . '+\z/u', '', $str);
  }
3419
3420
  /**
   * Build a regex fragment that matches any one of the characters of the given string.
   *
   * The result is cached per combination of both parameters.
   *
   * @param string $s     <p>The characters that should be matched.</p>
   * @param string $class [optional] <p>Initial content of the character class.</p>
   *
   * @return string <p>A character class, or a non-capturing group of alternatives.</p>
   */
  private static function rxClass($s, $class = '')
  {
    static $rxClassCache = array();

    $cacheKey = $s . $class;

    if (isset($rxClassCache[$cacheKey])) {
      return $rxClassCache[$cacheKey];
    }

    // index 0 collects the content of the character class, further entries are stand-alone alternatives
    $parts = array($class);

    foreach (self::str_split($s) as $char) {
      if ('-' === $char) {
        // a hyphen is only literal at the very beginning of a character class
        $parts[0] = '-' . $parts[0];
      } elseif (!isset($char[2])) {
        // at most two bytes
        $parts[0] .= preg_quote($char, '/');
      } elseif (1 === self::strlen($char)) {
        $parts[0] .= $char;
      } else {
        $parts[] = $char;
      }
    }

    if ($parts[0]) {
      $parts[0] = '[' . $parts[0] . ']';
    }

    $return = (1 === count($parts)) ? $parts[0] : '(?:' . implode('|', $parts) . ')';

    $rxClassCache[$cacheKey] = $return;

    return $return;
  }
3467
3468 1
  /**
   * WARNING: Echo the values of the detected native UTF8-Support table, e.g. for debugging.
   *
   * Each value of "self::$support" is printed, followed by a newline and a "<br>".
   */
  public static function showSupport()
  {
    $output = '';

    foreach (self::$support as $supportValue) {
      $output .= $supportValue . "\n<br>";
    }

    echo $output;
  }
3477
3478
  /**
   * Converts a UTF-8 character to HTML Numbered Entity like "&#123;".
   *
   * @param string $char           <p>The Unicode character to be encoded as numbered entity.</p>
   * @param bool   $keepAsciiChars <p>Set to <strong>true</strong> to keep ASCII chars.</p>
   *
   * @return string <p>The HTML numbered entity, or an empty string for an empty input.</p>
   */
  public static function single_chr_html_encode($char, $keepAsciiChars = false)
  {
    // "0" is falsy, but it is a valid character that must not be dropped (same check as in "UTF8::ord()")
    if (!$char && $char !== '0') {
      return '';
    }

    if (
        $keepAsciiChars === true
        &&
        self::isAscii($char) === true
    ) {
      return $char;
    }

    return '&#' . self::ord($char) . ';';
  }
3502
3503
  /**
   * Convert a string to an array of Unicode characters.
   *
   * @param string  $str       <p>The string to split into array.</p>
   * @param int     $length    [optional] <p>Max character length of each array element.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (only used if PCRE supports
   *                           UTF-8).</p>
   *
   * @return string[] <p>An array containing chunks of the string.</p>
   */
  public static function split($str, $length = 1, $cleanUtf8 = false)
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return array();
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['pcre_utf8'] === true) {

      if ($cleanUtf8 === true) {
        $str = self::clean($str);
      }

      // one match per character, "s" lets the dot match newlines as well
      $ret = array();
      preg_match_all('/./us', $str, $retArray);
      if (isset($retArray[0])) {
        $ret = $retArray[0];
      }
      unset($retArray);

    } else {
      $ret = self::splitBytewise($str);
    }

    // glue the single characters together into chunks of the requested length
    if ($length > 1) {
      $ret = array_map('implode', array_chunk($ret, $length));
    }

    /** @noinspection OffsetOperationsInspection */
    if (isset($ret[0]) && $ret[0] === '') {
      return array();
    }

    return $ret;
  }

  /**
   * Fallback for "UTF8::split()": split a string into its UTF-8 sequences by inspecting the bytes.
   *
   * Bytes that do not start a sequence, and lead bytes without matching continuation bytes, are skipped.
   *
   * @param string $str <p>The (non-empty) input string.</p>
   *
   * @return string[] <p>One entry per recognized byte sequence.</p>
   */
  private static function splitBytewise($str)
  {
    $chars = array();
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
      $lead = $str[$i];

      // 0xxxxxxx: a single byte
      if (($lead & "\x80") === "\x00") {
        $chars[] = $lead;

        continue;
      }

      // the lead byte announces the number of continuation bytes
      if (($lead & "\xE0") === "\xC0") {
        $extra = 1;
      } elseif (($lead & "\xF0") === "\xE0") {
        $extra = 2;
      } elseif (($lead & "\xF8") === "\xF0") {
        $extra = 3;
      } else {
        continue;
      }

      // not enough bytes left for this sequence
      if (!isset($str[$i + $extra])) {
        continue;
      }

      // every continuation byte has to look like 10xxxxxx
      $sequence = $lead;
      for ($j = 1; $j <= $extra; $j++) {
        if (($str[$i + $j] & "\xC0") !== "\x80") {
          $sequence = null;

          break;
        }

        $sequence .= $str[$i + $j];
      }

      // only a complete sequence is consumed, otherwise the scan goes on with the very next byte
      if ($sequence !== null) {
        $chars[] = $sequence;
        $i += $extra;
      }
    }

    return $chars;
  }
3585 2
3586 2
  /**
   * Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return false|string <p>
   *                      The detected string-encoding e.g. UTF-8 or UTF-16BE,<br />
   *                      otherwise it will return false.
   *                      </p>
   */
  public static function str_detect_encoding($str)
  {
    //
    // 1.) check binary strings (010001001...) like UTF-16 / UTF-32
    //

    if (self::is_binary($str)) {
      // each check runs only once, the result decides about the byte order
      $utf16 = self::is_utf16($str);
      if ($utf16 === 1) {
        return 'UTF-16LE';
      }
      if ($utf16 === 2) {
        return 'UTF-16BE';
      }

      $utf32 = self::is_utf32($str);
      if ($utf32 === 1) {
        return 'UTF-32LE';
      }
      if ($utf32 === 2) {
        return 'UTF-32BE';
      }
    }

    //
    // 2.) simple check for ASCII chars
    //

    if (self::is_ascii($str) === true) {
      return 'ASCII';
    }

    //
    // 3.) simple check for UTF-8 chars
    //

    if (self::is_utf8($str) === true) {
      return 'UTF-8';
    }

    //
    // 4.) check via "\mb_detect_encoding()"
    //
    // INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

    $detectOrder = array(
        'ISO-8859-1',
        'ISO-8859-2',
        'ISO-8859-3',
        'ISO-8859-4',
        'ISO-8859-5',
        'ISO-8859-6',
        'ISO-8859-7',
        'ISO-8859-8',
        'ISO-8859-9',
        'ISO-8859-10',
        'ISO-8859-13',
        'ISO-8859-14',
        'ISO-8859-15',
        'ISO-8859-16',
        'WINDOWS-1251',
        'WINDOWS-1252',
        'WINDOWS-1254',
        'ISO-2022-JP',
        'JIS',
        'EUC-JP',
    );

    $encoding = \mb_detect_encoding($str, $detectOrder, true);
    if ($encoding) {
      return $encoding;
    }

    //
    // 5.) check via "iconv()"
    //
    // An encoding is accepted if converting the string from this encoding into itself leaves it unchanged.

    $md5 = md5($str);
    foreach (self::$iconvEncoding as $encodingTmp) {
      # INFO: //IGNORE and //TRANSLIT still throw notice
      /** @noinspection PhpUsageOfSilenceOperatorInspection */
      if (md5(@iconv($encodingTmp, $encodingTmp, $str)) === $md5) {
        return $encodingTmp;
      }
    }

    return false;
  }
3678 21
3679 21
  /**
   * Case-insensitive and UTF-8 safe version of <function>str_replace</function>.
   *
   * @link  http://php.net/manual/en/function.str-ireplace.php
   *
   * @param mixed $search  <p>
   *                       Every replacement with search array is
   *                       performed on the result of previous replacement.
   *                       </p>
   * @param mixed $replace <p>
   *                       The replacement (string or array), it is always inserted literally.
   *                       </p>
   * @param mixed $subject <p>
   *                       If subject is an array, then the search and
   *                       replace is performed with every entry of
   *                       subject, and the return value is an array as
   *                       well.
   *                       </p>
   * @param int   $count   [optional] <p>
   *                       The number of matched and replaced needles will
   *                       be returned in count which is passed by
   *                       reference.
   *                       </p>
   *
   * @return mixed <p>A string or an array of replacements.</p>
   */
  public static function str_ireplace($search, $replace, $subject, &$count = null)
  {
    // build one case-insensitive unicode pattern per needle
    $patterns = array();
    foreach ((array)$search as $key => $needle) {
      $needle = (string)$needle;

      if ('' === $needle) {
        // an empty needle must never match: this pattern is unsatisfiable
        $patterns[$key] = '/^(?<=.)$/';
      } else {
        $patterns[$key] = '/' . preg_quote($needle, '/') . '/ui';
      }
    }

    // "preg_replace()" would interpret "$1" or "\1" in the replacement as back-reference,
    // but like in "str_ireplace()" the replacement has to be taken literally
    if (is_array($replace)) {
      $replacement = array();
      foreach ($replace as $key => $value) {
        $replacement[$key] = addcslashes((string)$value, '\\$');
      }
    } else {
      $replacement = addcslashes((string)$replace, '\\$');
    }

    return preg_replace($patterns, $replacement, $subject, -1, $count);
  }
3722
3723
  /**
   * Limit the number of characters in a string, without cutting the last word into pieces.
   *
   * If the string is longer than $length, it is cut at $length characters and the last (maybe incomplete)
   * word is dropped; if no space is found, the string is cut at ($length - 1) characters.
   * The $strAddOn is appended to every shortened string.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $length   [optional] <p>Max length of the string, without the $strAddOn.</p>
   * @param string $strAddOn [optional] <p>The suffix for a shortened string.</p>
   *
   * @return string
   */
  public static function str_limit_after_word($str, $length = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $length = (int)$length;

    if (self::strlen($str) <= $length) {
      return $str;
    }

    // the limit is directly behind a word: cut the space away and we are done
    if (self::substr($str, $length - 1, 1) === ' ') {
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    // "substr()" can fail with false, the cast keeps the following string-functions on safe ground
    $str = (string)self::substr($str, 0, $length);

    // drop the last (maybe incomplete) word
    $words = explode(' ', $str);
    array_pop($words);
    $new_str = implode(' ', $words);

    if ($new_str === '') {
      // there was no space at all, so a hard cut is the only option
      return self::substr($str, 0, $length - 1) . $strAddOn;
    }

    return $new_str . $strAddOn;
  }
3763
3764
  /**
   * Pad a UTF-8 string to given length with another string.
   *
   * The input is returned unchanged if $pad_length is not a positive integer, if it is smaller than the
   * length of the input, or if the $pad_string is empty.
   *
   * @param string $str        <p>The input string.</p>
   * @param int    $pad_length <p>The length of return string.</p>
   * @param string $pad_string [optional] <p>String to use for padding the input string.</p>
   * @param int    $pad_type   [optional] <p>
   *                           Can be <strong>STR_PAD_RIGHT</strong> (default),
   *                           <strong>STR_PAD_LEFT</strong> or <strong>STR_PAD_BOTH</strong>
   *                           </p>
   *
   * @return string <strong>Returns the padded string</strong>
   */
  public static function str_pad($str, $pad_length, $pad_string = ' ', $pad_type = STR_PAD_RIGHT)
  {
    $str_length = self::strlen($str);

    if (!is_int($pad_length) || $pad_length <= 0 || $pad_length < $str_length) {
      return $str;
    }

    // nothing to pad with, and the length is used as divisor below
    $ps_length = self::strlen($pad_string);
    if (!$ps_length) {
      return $str;
    }

    $diff = $pad_length - $str_length;

    // number of characters that are needed on each side
    switch ($pad_type) {
      case STR_PAD_LEFT:
        $left = $diff;
        $right = 0;
        break;

      case STR_PAD_BOTH:
        // an odd rest goes to the right side
        $left = (int)floor($diff / 2);
        $right = $diff - $left;
        break;

      case STR_PAD_RIGHT:
      default:
        $left = 0;
        $right = $diff;
    }

    // repeat the pad-string often enough and cut it down to the exact number of characters
    $pre = '';
    if ($left > 0) {
      $pre = self::substr(str_repeat($pad_string, (int)ceil($left / $ps_length)), 0, $left);
    }

    $post = '';
    if ($right > 0) {
      $post = self::substr(str_repeat($pad_string, (int)ceil($right / $ps_length)), 0, $right);
    }

    return $pre . $str . $post;
  }
3812
3813
  /**
   * Repeat a string.
   *
   * The input is passed through "UTF8::filter()" before it is repeated.
   *
   * @param string $str        <p>
   *                           The string to be repeated.
   *                           </p>
   * @param int    $multiplier <p>
   *                           Number of times the input string should be
   *                           repeated.
   *                           </p>
   *                           <p>
   *                           multiplier has to be greater than or equal to 0.
   *                           If the multiplier is set to 0, the function
   *                           will return an empty string.
   *                           </p>
   *
   * @return string <p>The repeated string.</p>
   */
  public static function str_repeat($str, $multiplier)
  {
    return str_repeat(self::filter($str), $multiplier);
  }
3837
3838
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * INFO: thin wrapper around the native "str_replace()", which is already UTF-8 safe.
   *
   * @link http://php.net/manual/en/function.str-replace.php
   *
   * @param mixed $search  <p>The needle; an array may be used to designate multiple needles.</p>
   * @param mixed $replace <p>The replacement value; an array may be used to designate multiple replacements.</p>
   * @param mixed $subject <p>
   *                       The haystack (string or array). If it is an array, the replacement is
   *                       performed on every entry and an array is returned.
   *                       </p>
   * @param int   $count   [optional] If passed, this will hold the number of matched and replaced needles.
   *
   * @return mixed <p>A string or an array with the replaced values.</p>
   */
  public static function str_replace($search, $replace, $subject, &$count = null)
  {
    $replaced = str_replace($search, $replace, $subject, $count);

    return $replaced;
  }
3871
3872
  /**
   * Randomly reorder the characters of a string.
   *
   * @param string $str <p>The input string</p>
   *
   * @return string <p>The shuffled string.</p>
   */
  public static function str_shuffle($str)
  {
    // break the string into its characters via UTF8::split(), then shuffle that list in place
    $chars = self::split($str);
    shuffle($chars);

    return implode('', $chars);
  }
3887
3888
  /**
   * Sort the characters of a string by their code points.
   *
   * @param string $str    <p>A UTF-8 string.</p>
   * @param bool   $unique <p>If <strong>true</strong>, repeated characters are dropped.</p>
   * @param bool   $desc   <p>If <strong>true</strong>, characters are sorted in reverse code point order.</p>
   *
   * @return string <p>String of sorted characters.</p>
   */
  public static function str_sort($str, $unique = false, $desc = false)
  {
    $codePoints = self::codepoints($str);

    if ($unique) {
      // flipping twice removes duplicate values
      $codePoints = array_flip(array_flip($codePoints));
    }

    $desc ? arsort($codePoints) : asort($codePoints);

    return self::string($codePoints);
  }
3913
3914
  /**
   * Split a string into an array of chunks, each holding up to $len grapheme clusters.
   *
   * @param string $str <p>The input string.</p>
   * @param int    $len <p>Number of grapheme clusters per chunk (cast to int).</p>
   *
   * @return array <p>The chunks; for $len &lt; 1 the result of the native str_split() is returned.</p>
   */
  public static function str_split($str, $len = 1)
  {
    $len = (int)$len;

    if ($len < 1) {
      // invalid length: hand over to the native function so it reports the error
      return str_split($str, $len);
    }

    preg_match_all('/' . Grapheme::GRAPHEME_CLUSTER_RX . '/u', $str, $matches);
    $graphemes = $matches[0];

    if ($len === 1) {
      return $graphemes;
    }

    // glue every $len consecutive grapheme clusters together
    $chunks = array();
    foreach (array_chunk($graphemes, $len) as $group) {
      $chunks[] = implode('', $group);
    }

    return $chunks;
  }
3952
3953
  /**
   * Get a binary representation of a specific string.
   *
   * The bytes of the string are read as one big-endian number and written in base 2
   * without leading zeros ("0" for an empty string or a string of only NUL bytes).
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string <p>A string consisting of "0" and "1" characters.</p>
   */
  public static function str_to_binary($str)
  {
    $str = (string)$str;

    $value = unpack('H*', $str);

    // Convert nibble by nibble: base_convert() on the whole hex string goes through
    // a float/int internally and silently loses precision for longer inputs.
    $bits = '';
    foreach (str_split($value[1]) as $nibble) {
      $bits .= str_pad(decbin(hexdec($nibble)), 4, '0', STR_PAD_LEFT);
    }

    // keep the output format of base_convert(): no leading zeros, but at least "0"
    $bits = ltrim($bits, '0');

    return $bits === '' ? '0' : $bits;
  }
3968
3969
  /**
   * alias for "UTF8::to_ascii()"
   *
   * @see UTF8::to_ascii()
   *
   * @param string $str     <p>The input string.</p>
   * @param string $unknown <p>Forwarded unchanged as second argument of UTF8::to_ascii().</p>
   *
   * @return string
   */
  public static function str_transliterate($str, $unknown = '?')
  {
    $ascii = self::to_ascii($str, $unknown);

    return $ascii;
  }
3983
3984
  /**
   * Counts number of words in the UTF-8 string.
   *
   * The string is split on runs of word characters (letters plus $charlist), where
   * a dash-punctuation character, "’" or "'" between two such runs keeps them in one word.
   *
   * @param string $str      <p>The input string.</p>
   * @param int    $format   [optional] <p>
   *                         <strong>0</strong> => return a number of words (default)<br />
   *                         <strong>1</strong> => return an array of words<br />
   *                         <strong>2</strong> => return an array of words with word-offset as key
   *                         </p>
   * @param string $charlist [optional] <p>Additional chars that belong to words and do not start a new word.</p>
   *
   * @return array|int <p>The number of words in the string, or the words themselves (see $format).</p>
   */
  public static function str_word_count($str, $format = 0, $charlist = '')
  {
    $charlist = self::rxClass($charlist, '\pL');
    $strParts = \preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    // with PREG_SPLIT_DELIM_CAPTURE the words sit at the odd indexes, the text between them at the even ones
    $len = count($strParts);

    if ($format !== 1 && $format !== 2) {
      return ($len - 1) / 2;
    }

    $words = array();

    if ($format === 1) {
      for ($i = 1; $i < $len; $i += 2) {
        $words[] = $strParts[$i];
      }

      return $words;
    }

    // $format === 2: key every word by its character offset
    $offset = self::strlen($strParts[0]);
    for ($i = 1; $i < $len; $i += 2) {
      $words[$offset] = $strParts[$i];
      $offset += self::strlen($strParts[$i]) + self::strlen($strParts[$i + 1]);
    }

    return $words;
  }
4029
4030 2
  /**
   * Case-insensitive string comparison.
   *
   * INFO: Case-insensitive version of UTF8::strcmp(); both strings are passed through
   *       UTF8::strtocasefold() before they are compared.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int <p>
   *             <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2,<br />
   *             <strong>0</strong> if they are equal.
   *             </p>
   */
  public static function strcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strcmp($folded1, $folded2);
  }
4048 1
4049 1
  /**
   * Case-sensitive string comparison.
   *
   * Strings that differ are NFD-normalized before the native byte-wise comparison.
   *
   * @param string $str1
   * @param string $str2
   *
   * @return int  <p>
   *              <strong>&lt; 0</strong> if str1 is less than str2<br />
   *              <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *              <strong>0</strong> if they are equal.
   *              </p>
   */
  public static function strcmp($str1, $str2)
  {
    // identical after string conversion: no normalization needed
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    return strcmp(
        \Normalizer::normalize($str1, \Normalizer::NFD),
        \Normalizer::normalize($str2, \Normalizer::NFD)
    );
  }
4068
4069 7
  /**
   * Find length of initial segment not matching mask.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $charList <p>The characters that end the initial segment.</p>
   * @param int    $offset   <p>Start position, forwarded to UTF8::substr().</p>
   * @param int    $length   <p>Length of the part to examine, forwarded to UTF8::substr().</p>
   *
   * @return int|null <p>The length of the initial segment, or null if $charList is empty.</p>
   */
  public static function strcspn($str, $charList, $offset = 0, $length = 2147483647)
  {
    $charList = (string)$charList;
    if ($charList === '') {
      return null;
    }

    // only cut the string when the caller actually restricted the range
    $str = ($offset || 2147483647 !== $length)
        ? (string)self::substr($str, $offset, $length)
        : (string)$str;

    // lazily capture everything in front of the first character of the mask
    if (preg_match('/^(.*?)' . self::rxClass($charList) . '/us', $str, $matches)) {
      return self::strlen($matches[1]);
    }

    return self::strlen($str);
  }
4098
4099
  /**
   * Create a UTF-8 string from code points.
   *
   * INFO: opposite to UTF8::codepoints()
   *
   * @param array $array <p>Integer or Hexadecimal codepoints.</p>
   *
   * @return string <p>UTF-8 encoded string.</p>
   */
  public static function string(array $array)
  {
    $result = '';

    // convert every code point with UTF8::chr() and append it
    foreach ($array as $codePoint) {
      $result .= self::chr($codePoint);
    }

    return $result;
  }
4120
4121
  /**
   * alias for "UTF8::string_has_bom()"
   *
   * @see UTF8::string_has_bom()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool
   */
  public static function hasBom($str)
  {
    $hasBom = self::string_has_bom($str);

    return $hasBom;
  }
4134
4135
  /**
   * Checks if string starts with "BOM" (Byte Order Mark Character) character.
   *
   * @param string $str <p>The input string.</p>
   *
   * @return bool <p><strong>true</strong> if the string has BOM at the start, <strong>false</strong> otherwise.</p>
   */
  public static function string_has_bom($str)
  {
    // the known BOM byte sequences are the keys of self::$bom
    foreach (array_keys(self::$bom) as $bomString) {
      if (strpos($str, $bomString) === 0) {
        return true;
      }
    }

    return false;
  }
4152
4153
  /**
   * Strip HTML and PHP tags from a string + clean invalid UTF-8.
   *
   * @link http://php.net/manual/en/function.strip-tags.php
   *
   * @param string $str            <p>The input string.</p>
   * @param string $allowable_tags [optional] <p>
   *                               Tags which should not be stripped. HTML comments and PHP tags are
   *                               always stripped; this can not be changed with allowable_tags.
   *                               </p>
   *
   * @return string <p>The stripped string.</p>
   */
  public static function strip_tags($str, $allowable_tags = null)
  {
    // broken UTF-8 is cleaned before the native function sees the string
    return strip_tags(self::clean($str), $allowable_tags);
  }
4179
4180
  /**
   * Finds position of first occurrence of a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-stripos.php
   *
   * @param string  $haystack  <p>The string from which to get the position of the first occurrence of needle.</p>
   * @param string  $needle    <p>The string to find in haystack.</p>
   * @param int     $offset    [optional] <p>
   *                           The position in haystack to start searching; null is treated as 0.
   *                           </p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   Return the numeric position of the first occurrence of needle in the haystack string,<br />
   *                   or false if needle is not found (or if haystack or needle is empty).
   *                   </p>
   */
  public static function stripos($haystack, $needle, $offset = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($cleanUtf8 === true) {
      $haystack = self::clean($haystack);
      $needle = self::clean($needle);
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    // mb_stripos() expects an integer offset; passing the default null through
    // is deprecated since PHP 8.1, so it is normalized to 0 here
    return \mb_stripos($haystack, $needle, (int)$offset, $encoding);
  }
4230 1
4231
  /**
   * Returns all of haystack starting from and including the first occurrence of needle to the end
   * (case-insensitive search via mb_stristr()).
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, the part of the haystack before the first
   *                              occurrence of the needle (excluding the needle) is returned.
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function</p>
   *
   * @return false|string A sub-string,<br />or <strong>false</strong> if needle is empty or not found.
   */
  public static function stristr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    $needle = (string)$needle;
    if ($needle === '') {
      return false;
    }

    // anything but the default charset is normalized first
    $charset = ($encoding !== 'UTF-8') ? self::normalize_encoding($encoding) : $encoding;

    return \mb_stristr($haystack, $needle, $before_needle, $charset);
  }
4256
4257 1
  /**
   * Get the string length, not the byte-length!
   *
   * @link     http://php.net/manual/en/function.mb-strlen.php
   *
   * @param string  $str       <p>The string being checked for length.</p>
   * @param string  $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string (UTF-8 only).</p>
   *
   * @return int <p>The number of characters in the string $str having character encoding $encoding. (One multi-byte
   *             character counted as +1)</p>
   */
  public static function strlen($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;

    if ($str === '') {
      return 0;
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $isUtf8 = $encoding === 'UTF-8' || $encoding === true || $encoding === false;
    $encoding = $isUtf8 ? 'UTF-8' : self::normalize_encoding($encoding);

    // for these charsets the native byte count is returned directly
    // (non-strict in_array() mirrors the loose comparison of a switch)
    if (in_array($encoding, array('ASCII', 'CP850'))) {
      return strlen($str);
    }

    if ($encoding === 'UTF-8' && $cleanUtf8 === true) {
      $str = self::clean($str);
    }

    return \mb_strlen($str, $encoding);
  }
4299
4300
  /**
   * Case insensitive string comparisons using a "natural order" algorithm.
   *
   * INFO: natural order version of UTF8::strcasecmp(); both strings are passed through
   *       UTF8::strtocasefold() and then compared with UTF8::strnatcmp().
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcasecmp($str1, $str2)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strnatcmp($folded1, $folded2);
  }
4316 1
4317
  /**
   * String comparisons using a "natural order" algorithm
   *
   * INFO: natural order version of UTF8::strcmp()
   *
   * @link  http://php.net/manual/en/function.strnatcmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   *
   * @return int <strong>&lt; 0</strong> if str1 is less than str2;<br />
   *             <strong>&gt; 0</strong> if str1 is greater than str2;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strnatcmp($str1, $str2)
  {
    // identical after string conversion: nothing to fold or compare
    if ($str1 . '' === $str2 . '') {
      return 0;
    }

    return strnatcmp(self::strtonatfold($str1), self::strtonatfold($str2));
  }
4335
4336
  /**
   * Case-insensitive string comparison of the first n characters.
   *
   * Both strings are passed through UTF8::strtocasefold() and then compared with UTF8::strncmp().
   *
   * @link  http://php.net/manual/en/function.strncasecmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>The length of strings to be used in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncasecmp($str1, $str2, $len)
  {
    $folded1 = self::strtocasefold($str1);
    $folded2 = self::strtocasefold($str2);

    return self::strncmp($folded1, $folded2, $len);
  }
4353
4354
  /**
   * String comparison of the first n characters.
   *
   * Both strings are cut to their first $len characters with UTF8::substr()
   * and the results are compared with UTF8::strcmp().
   *
   * @link  http://php.net/manual/en/function.strncmp.php
   *
   * @param string $str1 <p>The first string.</p>
   * @param string $str2 <p>The second string.</p>
   * @param int    $len  <p>Number of characters to use in the comparison.</p>
   *
   * @return int <strong>&lt; 0</strong> if <i>str1</i> is less than <i>str2</i>;<br />
   *             <strong>&gt; 0</strong> if <i>str1</i> is greater than <i>str2</i>;<br />
   *             <strong>0</strong> if they are equal
   */
  public static function strncmp($str1, $str2, $len)
  {
    // UTF8::substr() may return false; cast so that UTF8::strcmp() always receives strings
    $str1 = (string)self::substr($str1, 0, $len);
    $str2 = (string)self::substr($str2, 0, $len);

    return self::strcmp($str1, $str2);
  }
4374
4375 42
  /**
   * Search a string for any of a set of characters.
   *
   * @link  http://php.net/manual/en/function.strpbrk.php
   *
   * @param string $haystack  <p>The string where char_list is looked for.</p>
   * @param string $char_list <p>This parameter is case sensitive.</p>
   *
   * @return string|false String starting from the character found, or false if it is not found
   *                      (also false if haystack or char_list is empty).
   */
  public static function strpbrk($haystack, $char_list)
  {
    $haystack = (string)$haystack;
    $char_list = (string)$char_list;

    if ($haystack === '' || $char_list === '') {
      return false;
    }

    if (!preg_match('/' . self::rxClass($char_list) . '/us', $haystack, $found)) {
      return false;
    }

    // byte-based functions are fine here: the matched character is located and cut at its byte offset
    return substr($haystack, strpos($haystack, $found[0]));
  }
4400
4401 18
  /**
   * Find position of first occurrence of string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strpos.php
   *
   * @param string     $haystack  <p>The string being checked.</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>The search offset. If it is not specified, 0 is used.</p>
   * @param string     $encoding  [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the first occurrence of needle in the haystack string.<br />
   *                   If needle is not found (or haystack or needle is empty) it returns false.
   *                   </p>
   */
  public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // iconv and mbstring do not support integer $needle: convert a code point into
    // its character. This has to happen before the string cast below, otherwise
    // the integer check can never be true.
    if (is_int($needle) && $needle >= 0) {
      $needle = self::chr($needle);
    }
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    // init
    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strpos returns wrong position if invalid characters are found in $haystack before $needle
      // iconv_strpos is not tolerant to invalid characters

      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding === 'UTF-8'
        ||
        $encoding === true || $encoding === false // INFO: the "bool"-check is only a fallback for old versions
    ) {
      $encoding = 'UTF-8';
    } else {
      $encoding = self::normalize_encoding($encoding);
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strpos($haystack, $needle, $offset, $encoding);
    }

    // NOTE(review): the "iconv" flag guards a call to grapheme_strpos() - confirm this is the intended flag
    if (self::$support['iconv'] === true) {
      // ignore invalid negative offset to keep compatibility
      // with php < 5.5.35, < 5.6.21, < 7.0.6
      return \grapheme_strpos($haystack, $needle, $offset > 0 ? $offset : 0);
    }

    // fallback: search byte-wise and translate the byte position into a character position
    if ($offset > 0) {
      // UTF8::substr() may return false, the native strpos() needs a string
      $haystack = (string)self::substr($haystack, $offset);
    }

    if (($pos = strpos($haystack, $needle)) !== false) {
      $left = substr($haystack, 0, $pos);

      // negative offset not supported in PHP strpos(), ignoring
      return ($offset > 0 ? $offset : 0) + self::strlen($left);
    }

    return false;
  }
4484 1
4485
  /**
   * Finds the last occurrence of a character in a string within another.
   *
   * @link http://php.net/manual/en/function.mb-strrchr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; anything other than "UTF-8"
   *                         is passed through UTF8::normalize_encoding() first.
   *                         </p>
   *
   * @return string|false The portion of haystack or false if needle is not found.
   */
  public static function strrchr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $charset = ($encoding !== 'UTF-8') ? self::normalize_encoding($encoding) : $encoding;

    return \mb_strrchr($haystack, $needle, $part, $charset);
  }
4515 35
4516 35
  /**
   * alias for "UTF8::strstr()"
   *
   * @see UTF8::strstr()
   *
   * @param string $haystack      <p>Forwarded unchanged to UTF8::strstr().</p>
   * @param string $needle        <p>Forwarded unchanged to UTF8::strstr().</p>
   * @param bool   $before_needle <p>Forwarded unchanged to UTF8::strstr().</p>
   *
   * @return string|false
   */
  public static function strchr($haystack, $needle, $before_needle = false)
  {
    $found = self::strstr($haystack, $needle, $before_needle);

    return $found;
  }
4531
4532
  /**
   * alias for "UTF8::stristr()"
   *
   * @see UTF8::stristr()
   *
   * @param string $haystack      <p>Forwarded unchanged to UTF8::stristr().</p>
   * @param string $needle        <p>Forwarded unchanged to UTF8::stristr().</p>
   * @param bool   $before_needle <p>Forwarded unchanged to UTF8::stristr().</p>
   *
   * @return string|false
   */
  public static function strichr($haystack, $needle, $before_needle = false)
  {
    $found = self::stristr($haystack, $needle, $before_needle);

    return $found;
  }
4547
4548
  /**
   * Reverses characters order in the string.
   *
   * @param string $str The input string
   *
   * @return string The string with characters in the reverse sequence ('' for an empty input)
   */
  public static function strrev($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // split into characters via UTF8::split(), flip the list and glue it back together
    $chars = self::split($str);

    return implode(array_reverse($chars));
  }
4565
4566
  /**
   * Finds the last occurrence of a character in a string within another, case insensitive.
   *
   * @link http://php.net/manual/en/function.mb-strrichr.php
   *
   * @param string $haystack <p>The string from which to get the last occurrence of needle.</p>
   * @param string $needle   <p>The string to find in haystack.</p>
   * @param bool   $part     [optional] <p>
   *                         Determines which portion of haystack this function returns.
   *                         If set to true, it returns all of haystack
   *                         from the beginning to the last occurrence of needle.
   *                         If set to false, it returns all of haystack
   *                         from the last occurrence of needle to the end.
   *                         </p>
   * @param string $encoding [optional] <p>
   *                         Character encoding name to use; anything other than "UTF-8"
   *                         is passed through UTF8::normalize_encoding() first.
   *                         </p>
   *
   * @return string|false <p>The portion of haystack or<br />false if needle is not found.</p>
   */
  public static function strrichr($haystack, $needle, $part = false, $encoding = 'UTF-8')
  {
    $charset = ($encoding !== 'UTF-8') ? self::normalize_encoding($encoding) : $encoding;

    return \mb_strrichr($haystack, $needle, $part, $charset);
  }
4596
4597
  /**
   * Case-insensitive search for the position of the last occurrence of a string.
   *
   * Both operands are lowercased via self::strtolower() and the search is then
   * delegated to self::strrpos().
   *
   * @param string  $haystack  <p>The string to look in.</p>
   * @param string  $needle    <p>The string to look for.</p>
   * @param int     $offset    [optional] <p>Number of characters to ignore in the beginning or end.</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>
   *                   The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   If needle is not found, it returns false.
   *                   </p>
   */
  public static function strripos($haystack, $needle, $offset = 0, $cleanUtf8 = false)
  {
    $haystackLower = self::strtolower($haystack);
    $needleLower = self::strtolower($needle);

    return self::strrpos($haystackLower, $needleLower, $offset, $cleanUtf8);
  }
4614
4615
  /**
   * Find the position of the last occurrence of a string in a string.
   *
   * @link http://php.net/manual/en/function.mb-strrpos.php
   *
   * @param string     $haystack  <p>The string being checked, for the last occurrence of needle</p>
   * @param string|int $needle    <p>The string to find in haystack.<br />Or a code point as int.</p>
   * @param int        $offset    [optional] <p>May be specified to begin searching an arbitrary number of
   *                              characters into the string. Negative values will stop searching at an
   *                              arbitrary point prior to the end of the string.
   *                              </p>
   * @param boolean    $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int|false <p>The numeric position of the last occurrence of needle in the haystack string.<br />
   *                   If needle is not found (or haystack / needle is empty), it returns false.</p>
   */
  public static function strrpos($haystack, $needle, $offset = null, $cleanUtf8 = false)
  {
    $haystack = (string)$haystack;

    // a non-negative integer needle is treated as a code point and converted via self::chr()
    if (((int)$needle) === $needle && ($needle >= 0)) {
      $needle = self::chr($needle);
    }
    $needle = (string)$needle;

    // nothing to search in, or nothing to search for
    if ($haystack === '' || $needle === '') {
      return false;
    }

    $offset = (int)$offset;

    if ($cleanUtf8 === true) {
      // \mb_strrpos && iconv_strrpos is not tolerant to invalid characters
      $needle = self::clean($needle);
      $haystack = self::clean($haystack);
    }

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['mbstring'] === true) {
      return \mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    if (self::$support['iconv'] === true) {
      return \grapheme_strrpos($haystack, $needle, $offset);
    }

    // fallback: cut the haystack according to the offset, then search byte-wise
    if ($offset > 0) {
      $haystack = self::substr($haystack, $offset);
    } elseif ($offset < 0) {
      $haystack = self::substr($haystack, 0, $offset);
    }

    $bytePos = strrpos($haystack, $needle);
    if ($bytePos === false) {
      return false;
    }

    // translate the byte position into a character position by measuring the prefix;
    // a positive offset was cut off above and has to be added back
    $prefix = substr($haystack, 0, $bytePos);
    $skipped = $offset > 0 ? $offset : 0;

    return $skipped + self::strlen($prefix);
  }
4686 13
4687
  /**
   * Length of the initial segment of a string that consists only of characters from the given mask.
   *
   * @param string $str    <p>The input string.</p>
   * @param string $mask   <p>The mask of chars</p>
   * @param int    $offset [optional] <p>Start of the examined part (passed to self::substr()).</p>
   * @param int    $length [optional] <p>Length of the examined part (passed to self::substr()).</p>
   *
   * @return int
   */
  public static function strspn($str, $mask, $offset = 0, $length = 2147483647)
  {
    // only cut the string when the caller actually restricted the range
    if ($offset || 2147483647 !== $length) {
      $str = self::substr($str, $offset, $length);
    }

    // anchor a character class built from the mask at the start of the string
    $pattern = '/^' . self::rxClass($mask) . '+/u';

    if (!preg_match($pattern, $str, $matches)) {
      return 0;
    }

    return self::strlen($matches[0]);
  }
4706 1
4707
  /**
   * Returns part of haystack string from the first occurrence of needle to the end of haystack.
   *
   * @param string $haystack      <p>The input string. Must be valid UTF-8.</p>
   * @param string $needle        <p>The string to look for. Must be valid UTF-8.</p>
   * @param bool   $before_needle [optional] <p>
   *                              If <b>TRUE</b>, strstr() returns the part of the
   *                              haystack before the first occurrence of the needle (excluding the needle).
   *                              </p>
   * @param string $encoding      [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return string|false A sub-string,<br />or <strong>false</strong> if needle is not found.
   */
  public static function strstr($haystack, $needle, $before_needle = false, $encoding = 'UTF-8')
  {
    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    // self::$support is read below, so make sure the feature detection has run first
    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (
        $encoding !== 'UTF-8' // INFO: use "mb_"-function (with polyfill) also if we need another encoding
        ||
        self::$support['mbstring'] === true
    ) {
      return \mb_strstr($haystack, $needle, $before_needle, $encoding);
    }

    return \grapheme_strstr($haystack, $needle, $before_needle);
  }
4736
4737
  /**
   * Unicode transformation for case-less matching.
   *
   * The common case-fold table is always applied; the full case-folding table is applied
   * in addition when $full is truthy. The result is then cleaned and lowercased.
   *
   * @link http://unicode.org/reports/tr21/tr21-5.html
   *
   * @param string $str  <p>The input string.</p>
   * @param bool   $full <p>
   *                     <b>true</b> === replace full case folding chars + strtolower (default)<br />
   *                     <b>false</b> use only $commonCaseFold +  strtolower
   *                     </p>
   *
   * @return string
   */
  public static function strtocasefold($str, $full = true)
  {
    // lookup tables are built / loaded once and kept for later calls
    static $fullTable = null;
    static $commonFrom = null;
    static $commonTo = null;

    if ($commonFrom === null) {
      $commonFrom = array_keys(self::$commonCaseFold);
      $commonTo = array_values(self::$commonCaseFold);
    }

    $folded = str_replace($commonFrom, $commonTo, $str);

    if ($full) {
      if ($fullTable === null) {
        $fullTable = self::getData('caseFolding_full');
      }

      // index 0 holds the search strings, index 1 the replacements
      /** @noinspection OffsetOperationsInspection */
      $folded = str_replace($fullTable[0], $fullTable[1], $folded);
    }

    return self::strtolower(self::clean($folded));
  }
4777
4778
  /**
   * Make a string lowercase.
   *
   * @link http://php.net/manual/en/function.mb-strtolower.php
   *
   * @param string $str      <p>The string being lowercased.</p>
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function</p>
   *
   * @return string str with all alphabetic characters converted to lowercase
   *                (an empty string for empty input).
   */
  public static function strtolower($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;
    if ($str === '') {
      return '';
    }

    // only non-default encoding names need to be normalized
    $charset = $encoding === 'UTF-8' ? 'UTF-8' : self::normalize_encoding($encoding);

    return \mb_strtolower($str, $charset);
  }
4803
4804
  /**
   * Generic case sensitive transformation for collation matching.
   *
   * Normalizes the input to NFD and removes every run of characters in the
   * Unicode "Mn" category (\p{Mn}).
   *
   * @param string $str <p>The input string</p>
   *
   * @return string
   */
  private static function strtonatfold($str)
  {
    $decomposed = \Normalizer::normalize($str, \Normalizer::NFD);

    return preg_replace('/\p{Mn}+/u', '', $decomposed);
  }
4815
4816
  /**
   * Make a string uppercase.
   *
   * @link http://php.net/manual/en/function.mb-strtoupper.php
   *
   * @param string $str      <p>The string being uppercased.</p>
   * @param string $encoding [optional] <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return string str with all alphabetic characters converted to uppercase
   *                (an empty string for empty input).
   */
  public static function strtoupper($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;
    if ($str === '') {
      return '';
    }

    // only non-default encoding names need to be normalized
    $charset = $encoding === 'UTF-8' ? 'UTF-8' : self::normalize_encoding($encoding);

    return \mb_strtoupper($str, $charset);
  }
4840
4841
  /**
   * Translate characters or replace sub-strings.
   *
   * With three arguments, $from and $to are paired up element by element (the longer
   * list is truncated to the length of the shorter one) and the resulting map is handed
   * to PHP's strtr(). With two arguments, $from is handed to strtr() as-is and therefore
   * has to be an array of "search => replacement" pairs.
   *
   * @link  http://php.net/manual/en/function.strtr.php
   *
   * @param string          $str  <p>The string being translated.</p>
   * @param string|string[] $from <p>The string replacing from.</p>
   * @param string|string[] $to   <p>The string being translated to to.</p>
   *
   * @return string <p>
   *                This function returns a copy of str, translating all occurrences of each character in
   *                from to the corresponding character in to.
   *                </p>
   */
  public static function strtr($str, $from, $to = INF)
  {
    if (INF !== $to) {
      // both arguments may already be lists of strings; only real strings are split
      if (!is_array($from)) {
        $from = self::str_split($from);
      }
      if (!is_array($to)) {
        $to = self::str_split($to);
      }

      $countFrom = count($from);
      $countTo = count($to);

      // array_combine() needs both lists to have the same number of elements
      if ($countFrom > $countTo) {
        $from = array_slice($from, 0, $countTo);
      } elseif ($countFrom < $countTo) {
        $to = array_slice($to, 0, $countFrom);
      }

      $from = array_combine($from, $to);
    }

    return strtr($str, $from);
  }
4874
4875 1
  /**
   * Return the width of a string (as reported by \mb_strwidth()).
   *
   * @param string  $str       <p>The input string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return int
   */
  public static function strwidth($str, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    // only non-default encoding names need to be normalized
    $charset = $encoding === 'UTF-8' ? 'UTF-8' : self::normalize_encoding($encoding);

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    return \mb_strwidth($str, $charset);
  }
4899 20
4900 20
  /**
   * Get part of a string.
   *
   * @link http://php.net/manual/en/function.mb-substr.php
   *
   * @param string  $str       <p>The string being checked.</p>
   * @param int     $start     <p>The first position used in str.</p>
   * @param int     $length    [optional] <p>The maximum length of the returned string.</p>
   * @param string  $encoding  [optional] <p>Default is UTF-8</p>
   * @param boolean $cleanUtf8 [optional] <p>Clean non UTF-8 chars from the string.</p>
   *
   * @return string|false Returns a sub-string specified by the start and length parameters;<br />
   *                      an empty string for empty input, or <strong>false</strong> if a non-zero
   *                      start is greater than the length of the string.
   */
  public static function substr($str, $start = 0, $length = null, $encoding = 'UTF-8', $cleanUtf8 = false)
  {
    $str = (string)$str;
    if ($str === '') {
      return '';
    }

    if ($cleanUtf8 === true) {
      // iconv and mbstring are not tolerant to invalid encoding
      // further, their behaviour is inconsistent with that of PHP's substr
      $str = self::clean($str);
    }

    // the character count is only needed for the range check and the default length
    $charCount = ($start || $length === null) ? (int)self::strlen($str) : 0;

    if ($start && $start > $charCount) {
      return false;
    }

    $length = $length === null ? $charCount : (int)$length;

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    // INFO: the "bool"-check is only a fallback for old versions
    $wantsUtf8 = $encoding === 'UTF-8' || $encoding === true || $encoding === false;
    $encoding = $wantsUtf8 ? 'UTF-8' : self::normalize_encoding($encoding);

    // INFO: use "mb_"-function (with polyfill) also if we need another encoding
    if ($encoding !== 'UTF-8' || self::$support['mbstring'] === true) {
      return \mb_substr($str, $start, $length, $encoding);
    }

    if (self::$support['iconv'] === true) {
      return (string)\grapheme_substr($str, $start, $length);
    }

    // fallback: split into a list of pieces, take the requested slice and join it again
    $chars = self::split($str);

    return implode(array_slice($chars, $start, $length));
  }
4978 1
4979 1
  /**
   * Binary safe comparison of two strings from an offset, up to length characters.
   *
   * The relevant part of $main_str is cut out first; $str is then shortened to the same
   * number of characters before both parts are compared.
   *
   * @param string  $main_str           <p>The main string being compared.</p>
   * @param string  $str                <p>The secondary string being compared.</p>
   * @param int     $offset             <p>The start position for the comparison. If negative, it starts
   *                                    counting from the end of the string.</p>
   * @param int     $length             [optional] <p>The length of the comparison. The default value is the
   *                                    largest of the length of the str compared to the length of main_str
   *                                    less the offset.</p>
   * @param boolean $case_insensitivity [optional] <p>If case_insensitivity is TRUE, comparison is case
   *                                    insensitive.</p>
   *
   * @return int
   */
  public static function substr_compare($main_str, $str, $offset, $length = 2147483647, $case_insensitivity = false)
  {
    // self::substr() returns false when the offset lies behind the end of the string;
    // cast so that only strings reach the length / comparison helpers
    $main_str = (string)self::substr($main_str, $offset, $length);
    $str = (string)self::substr($str, 0, self::strlen($main_str));

    return $case_insensitivity === true ? self::strcasecmp($main_str, $str) : self::strcmp($main_str, $str);
  }
5000
5001
  /**
   * Count the number of substring occurrences.
   *
   * @link  http://php.net/manual/en/function.substr-count.php
   *
   * @param string $haystack  <p>The string to search in.</p>
   * @param string $needle    <p>The substring to search for.</p>
   * @param int    $offset    [optional] <p>The offset where to start counting.</p>
   * @param int    $length    [optional] <p>
   *                          The maximum length after the specified offset to search for the
   *                          substring. If omitted (null), the search runs to the end of the haystack.
   *                          </p>
   * @param string $encoding  <p>Set the charset for e.g. "\mb_" function.</p>
   *
   * @return int|false <p>
   *                   The number of occurrences, or <strong>false</strong> if haystack or needle is
   *                   empty or if offset plus length is not positive.
   *                   </p>
   */
  public static function substr_count($haystack, $needle, $offset = 0, $length = null, $encoding = 'UTF-8')
  {
    $haystack = (string)$haystack;
    $needle = (string)$needle;

    if (!isset($haystack[0], $needle[0])) {
      return false;
    }

    if ($offset || $length) {
      $offset = (int)$offset;

      // keep an omitted length as null: casting it to 0 would make self::substr()
      // return an empty string and the count would always be 0
      if ($length !== null) {
        $length = (int)$length;
      }

      if ((int)$length + $offset <= 0) {
        return false;
      }

      // self::substr() returns false when the offset lies behind the end of the haystack
      $haystack = (string)self::substr($haystack, $offset, $length, $encoding);
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    return \mb_substr_count($haystack, $needle, $encoding);
  }
5044
5045 7
  /**
   * Replace text within a portion of a string.
   *
   * When $str is an array, every element is processed separately: scalar arguments are
   * repeated for each element, array arguments are truncated to the number of input strings.
   *
   * source: https://gist.github.com/stemar/8287074
   *
   * @param string|string[] $str         <p>The input string or an array of stings.</p>
   * @param string|string[] $replacement <p>The replacement string or an array of stings.</p>
   * @param int|int[]       $start       <p>Character position at which the replacement begins.</p>
   * @param int|int[]|null  $length      [optional] <p>
   *                                     Number of characters to replace. For a single string the default
   *                                     is the number of characters of the input; for an array of strings
   *                                     an omitted length is treated as 0 for every element.
   *                                     </p>
   *
   * @return string|string[]
   */
  public static function substr_replace($str, $replacement, $start, $length = null)
  {
    if (is_array($str)) {
      $num = count($str);

      // $replacement: one entry per input string
      if (is_array($replacement)) {
        $replacement = array_slice($replacement, 0, $num);
      } else {
        $replacement = array_pad(array($replacement), $num, $replacement);
      }

      // $start: entries that are not integers fall back to 0
      if (is_array($start)) {
        $start = array_slice($start, 0, $num);
        foreach ($start as &$valueTmp) {
          $valueTmp = (int)$valueTmp === $valueTmp ? $valueTmp : 0;
        }
        unset($valueTmp);
      } else {
        $start = array_pad(array($start), $num, $start);
      }

      // $length: entries that are set but not integers fall back to $num, unset entries to 0
      // NOTE(review): an omitted length becomes 0 here, while the single-string path below
      // uses the full string length - confirm that this difference is intended
      if (!isset($length)) {
        $length = array_fill(0, $num, 0);
      } elseif (is_array($length)) {
        $length = array_slice($length, 0, $num);
        foreach ($length as &$valueTmpV2) {
          if (isset($valueTmpV2)) {
            $valueTmpV2 = (int)$valueTmpV2 === $valueTmpV2 ? $valueTmpV2 : $num;
          } else {
            $valueTmpV2 = 0;
          }
        }
        unset($valueTmpV2);
      } else {
        $length = array_pad(array($length), $num, $length);
      }

      // Recursive call
      return array_map(array(__CLASS__, 'substr_replace'), $str, $replacement, $start, $length);
    }

    // single input string: only the first replacement is used
    if (is_array($replacement)) {
      if (count($replacement) > 0) {
        $replacement = $replacement[0];
      } else {
        $replacement = '';
      }
    }

    // split both strings into lists of UTF-8 characters
    preg_match_all('/./us', (string)$str, $smatches);
    preg_match_all('/./us', (string)$replacement, $rmatches);

    if ($length === null) {
      // measure the same character list that is spliced below instead of relying on
      // the internal mbstring encoding
      $length = count($smatches[0]);
    }

    array_splice($smatches[0], $start, $length, $rmatches[0]);

    // separator first: the legacy (array, separator) argument order is gone in PHP 8
    return implode('', $smatches[0]);
  }
5120
5121
  /**
   * Returns a case swapped version of the string.
   *
   * Every non-whitespace character is uppercased; if that does not change the character,
   * its lowercase form is used instead.
   *
   * @param string $str      <p>The input string.</p>
   * @param string $encoding [optional] <p>Default is UTF-8</p>
   *
   * @return string <p>Each character's case swapped.</p>
   */
  public static function swapCase($str, $encoding = 'UTF-8')
  {
    $str = (string)$str;
    if ($str === '') {
      return '';
    }

    if ($encoding !== 'UTF-8') {
      $encoding = self::normalize_encoding($encoding);
    }

    $swapChar = function ($match) use ($encoding) {
      $char = $match[0];
      $upper = UTF8::strtoupper($char, $encoding);

      // unchanged by uppercasing => hand back the lowercase form
      return $char === $upper ? UTF8::strtolower($char, $encoding) : $upper;
    };

    return preg_replace_callback('/[\S]/u', $swapChar, self::clean($str));
  }
5159
5160
  /**
   * Alias of "UTF8::to_ascii()": both arguments are forwarded unchanged.
   *
   * @see UTF8::to_ascii()
   *
   * @param string $s         <p>The input string.</p>
   * @param string $subst_chr [optional] <p>Forwarded as the second argument of to_ascii().</p>
   *
   * @return string
   */
  public static function toAscii($s, $subst_chr = '?')
  {
    $ascii = self::to_ascii($s, $subst_chr);

    return $ascii;
  }
5174 7
5175 1
  /**
   * Alias of "UTF8::to_latin1()": the argument is forwarded unchanged.
   *
   * @see UTF8::to_latin1()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function toLatin1($str)
  {
    $latin1 = self::to_latin1($str);

    return $latin1;
  }
5188
5189
  /**
   * Alias of "UTF8::to_utf8()": the argument is forwarded unchanged.
   *
   * @see UTF8::to_utf8()
   *
   * @param string $str <p>The input string.</p>
   *
   * @return string
   */
  public static function toUTF8($str)
  {
    $utf8 = self::to_utf8($str);

    return $utf8;
  }
5202
5203
  /**
5204
   * Convert a string into ASCII.
5205 5
   *
5206
   * @param string $str     <p>The input string.</p>
5207 5
   * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
5208
   *
5209
   * @return string
5210
   */
5211
  public static function to_ascii($str, $unknown = '?')
  {
    // Lazily filled lookup: bank (code point >> 8) => array(low byte => ASCII replacement).
    // The variable name must not change: the bank files required below presumably
    // assign into $UTF8_TO_ASCII[$bank] (it is read right after the require) - verify.
    static $UTF8_TO_ASCII;

    // init
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $str = self::clean($str);

    if (!isset(self::$support['already_checked_via_portable_utf8'])) {
      self::checkForSupport();
    }

    if (self::$support['intl'] === true && Bootup::is_php('5.4')) {
      $transliterated = transliterator_transliterate('Any-Latin; Latin-ASCII;', $str);

      // transliterator_transliterate() returns false on failure; in that case
      // keep the cleaned input and fall through to the table-based conversion
      // instead of handing a non-string back to the caller.
      if (is_string($transliterated)) {
        $str = $transliterated;
      }

      // check again, if we only have ASCII, now ...
      if (!preg_match("/[\x80-\xFF]/", $str)) {
        return $str;
      }
    }

    // split into single (multi-byte) characters
    preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
    $chars = $ar[0];

    foreach ($chars as &$c) {

      $ordC0 = ord($c[0]);

      // ASCII - next please
      if ($ordC0 < 128) {
        continue;
      }

      // Derive the sequence length and the payload bits of the lead byte.
      // Everything is recomputed per character, so a value decoded for the
      // previous character can never leak into this one.
      if ($ordC0 >= 254) {
        // 0xFE / 0xFF never start a sequence
        $c = $unknown;
        continue;
      } elseif ($ordC0 >= 252) {
        $length = 6;
        $ord = $ordC0 - 252;
      } elseif ($ordC0 >= 248) {
        $length = 5;
        $ord = $ordC0 - 248;
      } elseif ($ordC0 >= 240) {
        $length = 4;
        $ord = $ordC0 - 240;
      } elseif ($ordC0 >= 224) {
        $length = 3;
        $ord = $ordC0 - 224;
      } elseif ($ordC0 >= 192) {
        $length = 2;
        $ord = $ordC0 - 192;
      } else {
        // 0x80-0xBF: continuation byte without a lead byte
        $c = $unknown;
        continue;
      }

      // truncated sequence: do not index past the end of the string
      if (strlen($c) < $length) {
        $c = $unknown;
        continue;
      }

      // every continuation byte contributes its lower 6 bits
      for ($k = 1; $k < $length; ++$k) {
        $ord = $ord * 64 + (ord($c[$k]) - 128);
      }

      $bank = $ord >> 8;
      if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
        $bankfile = __DIR__ . '/data/' . sprintf('x%02x', $bank) . '.php';
        if (file_exists($bankfile)) {
          /** @noinspection PhpIncludeInspection */
          require $bankfile;
        } else {
          // remember the miss, so the file system is only checked once per bank
          $UTF8_TO_ASCII[$bank] = array();
        }
      }

      $newchar = $ord & 255;
      if (
          isset($UTF8_TO_ASCII[$bank])
          &&
          array_key_exists($newchar, $UTF8_TO_ASCII[$bank])
      ) {
        $c = $UTF8_TO_ASCII[$bank][$newchar];
      } else {
        $c = $unknown;
      }
    }
    // break the reference left behind by the by-reference foreach
    unset($c);

    return implode('', $chars);
  }
5317
5318
  /**
5319
   * alias for "UTF8::to_iso8859()"
5320
   *
5321
   * @see UTF8::to_iso8859()
5322
   *
5323
   * @param string $str
5324
   *
5325
   * @return string|string[]
5326
   */
5327
  public static function toIso8859($str)
  {
    // camelCase alias: the argument is handed to to_iso8859() unchanged.
    $iso8859 = self::to_iso8859($str);

    return $iso8859;
  }
5331
5332
  /**
5333
   * alias for "UTF8::to_iso8859()"
5334
   *
5335
   * @see UTF8::to_iso8859()
5336 2
   *
5337
   * @param string|string[] $str
5338 2
   *
5339 2
   * @return string|string[]
5340
   */
5341 2
  public static function to_latin1($str)
  {
    // "Latin-1" is handled by the ISO-8859 converter; delegate to to_iso8859().
    $latin1 = self::to_iso8859($str);

    return $latin1;
  }
5345
5346
  /**
5347
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
5348
   *
5349
   * - It decodes UTF-8 codepoints and unicode escape sequences.
5350
   *
5351
   * - It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859-1.
5352
   *
5353
   * - It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
5354
   *
5355
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
5356
   *    are followed by any of these:  ("group B")
5357
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
5358 1
   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
5359
   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
5360 1
   * is also a valid unicode character, and will be left unchanged.
5361 1
   *
5362
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
5363 1
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
5364 1
   *
5365
   * @param string|string[] $str <p>Any string or array.</p>
5366
   *
5367 1
   * @return string|string[] <p>The UTF-8 encoded string.</p>
5368 1
   */
5369
  public static function to_utf8($str)
  {
    // arrays are converted element by element (recursively), keys are kept
    if (is_array($str)) {
      foreach ($str as $k => $v) {
        /** @noinspection AlterInForeachInspection */
        /** @noinspection OffsetOperationsInspection */
        $str[$k] = self::to_utf8($v);
      }

      return $str;
    }

    $str = (string)$str;

    if (!isset($str[0])) {
      return $str;
    }

    $max = strlen($str);
    $buf = '';

    /** @noinspection ForeachInvariantsInspection */
    for ($i = 0; $i < $max; $i++) {
      $c1 = $str[$i];
      $ordC1 = ord($c1);

      // plain ASCII - it doesn't need conversion
      if ($ordC1 < 0x80) {
        $buf .= $c1;
        continue;
      }

      if ($ordC1 >= 0xC0) {
        // Looks like an UTF-8 lead byte: how many continuation bytes does it announce?
        if ($ordC1 <= 0xDF) {
          $continuationBytes = 1; // 2 bytes UTF8
        } elseif ($ordC1 <= 0xEF) {
          $continuationBytes = 2; // 3 bytes UTF8
        } elseif ($ordC1 <= 0xF7) {
          $continuationBytes = 3; // 4 bytes UTF8
        } else {
          $continuationBytes = 0; // 0xF8-0xFF: doesn't look like UTF8
        }

        // every announced continuation byte must exist and be in 0x80-0xBF;
        // bytes beyond the end of the string count as 0x00 (= invalid)
        $isUtf8 = $continuationBytes > 0;
        for ($k = 1; $isUtf8 && $k <= $continuationBytes; $k++) {
          $ordNext = $i + $k < $max ? ord($str[$i + $k]) : 0;
          $isUtf8 = $ordNext >= 0x80 && $ordNext <= 0xBF;
        }

        if ($isUtf8) { // yeah, almost sure it's UTF8 already
          $buf .= substr($str, $i, $continuationBytes + 1);
          $i += $continuationBytes;
          continue;
        }
      } elseif (isset(self::$win1252ToUtf8[$ordC1])) { // found in Windows-1252 special cases
        $buf .= self::$win1252ToUtf8[$ordC1];
        continue;
      }

      // Not valid UTF8 - treat the single byte as ISO-8859-1 and encode it as
      // a 2-byte UTF-8 sequence. Integer shifts are used on purpose: dividing
      // by 64 yields a fractional float, and passing that to chr() is a
      // deprecated implicit conversion on PHP >= 8.1.
      $buf .= chr(0xC0 | ($ordC1 >> 6)) . chr(0x80 | ($ordC1 & 0x3F));
    }

    // decode unicode escape sequences
    $buf = preg_replace_callback(
        '/\\\\u([0-9a-f]{4})/i',
        function ($match) {
          return \mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
        },
        $buf
    );

    // decode UTF-8 codepoints
    $buf = preg_replace_callback(
        '/&#\d{2,4};/',
        function ($match) {
          return \mb_convert_encoding($match[0], 'UTF-8', 'HTML-ENTITIES');
        },
        $buf
    );

    return $buf;
  }
5474
5475
  /**
5476
   * Convert a string into "ISO-8859"-encoding (Latin-1).
5477
   *
5478
   * @param string|string[] $str
5479
   *
5480
   * @return string|string[]
5481 1
   */
5482
  public static function to_iso8859($str)
  {
    // scalar input: cast, short-circuit on the empty string, decode otherwise
    if (!is_array($str)) {
      $str = (string)$str;

      return isset($str[0]) ? self::utf8_decode($str) : '';
    }

    // array input: convert every element recursively, keeping keys and order
    $converted = $str;
    foreach (array_keys($str) as $key) {
      $converted[$key] = self::to_iso8859($str[$key]);
    }

    return $converted;
  }
5503
5504
  /**
5505
   * Strip whitespace or other characters from beginning or end of a UTF-8 string.
5506
   *
5507
   * INFO: This is slower than "trim()"
5508
   *
5509
   * We can only use the original-function, if we use <= 7-Bit in the string / chars
5510
   * but the check for ASCII (7-Bit) costs more time than we can save here.
5511
   *
5512
   * @param string $str   <p>The string to be trimmed</p>
5513
   * @param string $chars [optional] <p>Optional characters to be stripped</p>
5514
   *
5515
   * @return string <p>The trimmed string.</p>
5516
   */
5517
  public static function trim($str = '', $chars = INF)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // An explicit, non-empty character list is delegated to ltrim() / rtrim().
    $hasCustomChars = ($chars !== INF && $chars);
    if ($hasCustomChars) {
      $leftTrimmed = self::ltrim($str, $chars);

      return self::rtrim($leftTrimmed, $chars);
    }

    // Default: strip the Unicode "separator" (\pZ) and "other" (\pC) categories from both ends.
    // Info: http://nadeausoftware.com/articles/2007/9/php_tip_how_strip_punctuation_characters_web_page#Unicodecharactercategories
    return preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $str);
  }
5532 1
5533
  /**
5534 1
   * Makes string's first char uppercase.
5535
   *
5536
   * @param string $str <p>The input string.</p>
5537
   *
5538
   * @return string <p>The resulting string</p>
5539
   */
5540
  public static function ucfirst($str)
  {
    // Static analysis reports that self::substr() can return false; cast both
    // parts so that strtoupper() and the concatenation always work on strings.
    $firstChar = (string)self::substr($str, 0, 1);
    $remainder = (string)self::substr($str, 1);

    return self::strtoupper($firstChar) . $remainder;
  }
5544
5545
  /**
5546 1
   * alias for "UTF8::ucfirst()"
5547
   *
5548 1
   * @see UTF8::ucfirst()
5549
   *
5550
   * @param string $word
5551
   *
5552
   * @return string
5553
   */
5554
  public static function ucword($word)
  {
    // alias: the argument is handed to ucfirst() unchanged.
    $capitalized = self::ucfirst($word);

    return $capitalized;
  }
5558
5559
  /**
5560
   * Uppercase for all words in the string.
5561
   *
5562
   * @param string   $str        <p>The input string.</p>
5563
   * @param string[] $exceptions [optional] <p>Exclusion for some words.</p>
5564
   *
5565
   * @return string
5566
   */
5567
  public static function ucwords($str, $exceptions = array())
  {
    // Same empty-check as the sibling methods: a loose "!$str" would also
    // reject the perfectly valid input '0'.
    $str = (string)$str;

    if (!isset($str[0])) {
      return '';
    }

    $useExceptions = count($exceptions) > 0;
    $newwords = array();

    // words are separated by a single space; the separator is restored by implode()
    foreach (explode(' ', $str) as $word) {
      // words listed in $exceptions (strict comparison) are kept as they are
      if ($useExceptions === false || !in_array($word, $exceptions, true)) {
        $word = self::ucfirst($word);
      }

      $newwords[] = $word;
    }

    return implode(' ', $newwords);
  }
5600
5601 10
  /**
5602 10
   * Multi decode html entity & fix urlencoded-win1252-chars.
5603
   *
5604
   * e.g:
5605
   * 'D&#252;sseldorf'               => 'Düsseldorf'
5606
   * 'D%FCsseldorf'                  => 'Düsseldorf'
5607
   * 'D&#xFC;sseldorf'               => 'Düsseldorf'
5608
   * 'D%26%23xFC%3Bsseldorf'         => 'Düsseldorf'
5609
   * 'Düsseldorf'                   => 'Düsseldorf'
5610
   * 'D%C3%BCsseldorf'               => 'Düsseldorf'
5611
   * 'D%C3%83%C2%BCsseldorf'         => 'Düsseldorf'
5612
   * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
5613
   *
5614
   * @param string $str <p>The input string.</p>
5615
   *
5616
   * @return string
5617
   */
5618
  public static function urldecode($str)
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // url-decode once, then turn "%uXXXX" escapes into numeric html entities
    $urlDecoded = urldecode($str);
    $str = preg_replace('/%u([0-9a-f]{3,4})/i', '&#x\\1;', $urlDecoded);

    // ENT_HTML5 is only referenced when the PHP version check passes
    if (Bootup::is_php('5.4')) {
      $flags = ENT_QUOTES | ENT_HTML5;
    } else {
      $flags = ENT_QUOTES;
    }

    // pipeline: to_utf8 -> html_entity_decode -> rawurldecode -> fix_simple_utf8
    $asUtf8 = self::to_utf8($str);
    $entitiesDecoded = self::html_entity_decode($asUtf8, $flags);
    $rawDecoded = rawurldecode($entitiesDecoded);
    $fixed = self::fix_simple_utf8($rawDecoded);

    return (string)$fixed;
  }
5641 2
5642 2
  /**
5643
   * Returns an array mapping "urlencoded" win1252 bytes to their UTF-8 characters.
5644 8
   *
5645
   * @return mixed
5646
   */
5647
  public static function urldecode_fix_win1252_chars()
  {
    // Built once per request; keys are '%20' ... '%FF' in ascending order,
    // values are the matching characters encoded as UTF-8.
    static $array = null;

    if ($array === null) {
      // Windows-1252 assigns printable characters to most of 0x80-0x9F.
      // 0x80 is the EURO SIGN (it used to be mapped to a backtick, which
      // duplicated '%60' and contradicted the class' own win1252 table).
      $win1252Specials = array(
          0x80 => "\xe2\x82\xac", // EURO SIGN
          0x82 => "\xe2\x80\x9a", // SINGLE LOW-9 QUOTATION MARK
          0x83 => "\xc6\x92",     // LATIN SMALL LETTER F WITH HOOK
          0x84 => "\xe2\x80\x9e", // DOUBLE LOW-9 QUOTATION MARK
          0x85 => "\xe2\x80\xa6", // HORIZONTAL ELLIPSIS
          0x86 => "\xe2\x80\xa0", // DAGGER
          0x87 => "\xe2\x80\xa1", // DOUBLE DAGGER
          0x88 => "\xcb\x86",     // MODIFIER LETTER CIRCUMFLEX ACCENT
          0x89 => "\xe2\x80\xb0", // PER MILLE SIGN
          0x8A => "\xc5\xa0",     // LATIN CAPITAL LETTER S WITH CARON
          0x8B => "\xe2\x80\xb9", // SINGLE LEFT-POINTING ANGLE QUOTE
          0x8C => "\xc5\x92",     // LATIN CAPITAL LIGATURE OE
          0x8E => "\xc5\xbd",     // LATIN CAPITAL LETTER Z WITH CARON
          0x91 => "\xe2\x80\x98", // LEFT SINGLE QUOTATION MARK
          0x92 => "\xe2\x80\x99", // RIGHT SINGLE QUOTATION MARK
          0x93 => "\xe2\x80\x9c", // LEFT DOUBLE QUOTATION MARK
          0x94 => "\xe2\x80\x9d", // RIGHT DOUBLE QUOTATION MARK
          0x95 => "\xe2\x80\xa2", // BULLET
          0x96 => "\xe2\x80\x93", // EN DASH
          0x97 => "\xe2\x80\x94", // EM DASH
          0x98 => "\xcb\x9c",     // SMALL TILDE
          0x99 => "\xe2\x84\xa2", // TRADE MARK SIGN
          0x9A => "\xc5\xa1",     // LATIN SMALL LETTER S WITH CARON
          0x9B => "\xe2\x80\xba", // SINGLE RIGHT-POINTING ANGLE QUOTE
          0x9C => "\xc5\x93",     // LATIN SMALL LIGATURE OE
          0x9E => "\xc5\xbe",     // LATIN SMALL LETTER Z WITH CARON
          0x9F => "\xc5\xb8",     // LATIN CAPITAL LETTER Y WITH DIAERESIS
      );

      $array = array();
      for ($byte = 0x20; $byte <= 0xFF; ++$byte) {
        $key = sprintf('%%%02X', $byte);

        if ($byte < 0x7F) {
          // printable ASCII maps to itself
          $array[$key] = chr($byte);
        } elseif ($byte === 0x7F || $byte === 0xA0 || $byte === 0xAD) {
          // DEL, NO-BREAK SPACE and SOFT HYPHEN are mapped to the empty string.
          // NOTE(review): kept as in the previous literal table - confirm this is intended.
          $array[$key] = '';
        } elseif ($byte < 0xA0) {
          // bytes that Windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) become ''
          $array[$key] = isset($win1252Specials[$byte]) ? $win1252Specials[$byte] : '';
        } else {
          // 0xA1-0xFF share their code point with ISO-8859-1: encode as 2-byte UTF-8
          $array[$key] = chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
        }
      }
    }

    return $array;
  }
5878
5879 45
  /**
5880
   * Decodes an UTF-8 string to ISO-8859-1.
5881 45
   *
5882
   * @param string $str <p>The input string.</p>
5883
   *
5884
   * @return string
5885 1
   */
5886 1
  public static function utf8_decode($str)
  {
    // search / replace lists derived from self::$utf8ToWin1252, built on first use
    static $utf8ToWin1252Keys = null;
    static $utf8ToWin1252Values = null;

    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    // normalize the input to UTF-8 before decoding
    $utf8 = self::to_utf8($str);

    if (null === $utf8ToWin1252Keys) {
      $utf8ToWin1252Keys = array_keys(self::$utf8ToWin1252);
      $utf8ToWin1252Values = array_values(self::$utf8ToWin1252);
    }

    // apply the utf8 -> win1252 table first, then let the polyfill decode the rest
    $replaced = str_replace($utf8ToWin1252Keys, $utf8ToWin1252Values, $utf8);

    return Xml::utf8_decode($replaced);
  }
5907 1
5908 1
  /**
5909 43
   * Encodes an ISO-8859-1 string to UTF-8.
5910
   *
5911
   * @param string $str <p>The input string.</p>
5912 43
   *
5913
   * @return string
5914
   */
5915
  public static function utf8_encode($str)
  {
    // search / replace lists derived from self::$cp1252ToUtf8, built on first use
    static $cp1252ToUtf8Keys = null;
    static $cp1252ToUtf8Values = null;

    $encoded = \utf8_encode($str);

    // Without a 0xC2 byte the replacement step is skipped
    // (presumably every key of self::$cp1252ToUtf8 starts with 0xC2 - verify).
    if (strpos($encoded, "\xC2") === false) {
      return $encoded;
    }

    if (null === $cp1252ToUtf8Keys) {
      $cp1252ToUtf8Keys = array_keys(self::$cp1252ToUtf8);
      $cp1252ToUtf8Values = array_values(self::$cp1252ToUtf8);
    }

    return str_replace($cp1252ToUtf8Keys, $cp1252ToUtf8Values, $encoded);
  }
5934
5935
  /**
5936
   * fix -> utf8-win1252 chars
5937
   *
5938
   * @param string $str <p>The input string.</p>
5939
   *
5940
   * @return string
5941 1
   *
5942
   * @deprecated use "UTF8::fix_simple_utf8()"
5943 1
   */
5944 1
  public static function utf8_fix_win1252_chars($str)
  {
    // deprecated alias: the argument is handed to fix_simple_utf8() unchanged.
    $fixed = self::fix_simple_utf8($str);

    return $fixed;
  }
5948
5949
  /**
5950
   * Returns an array with all utf8 whitespace characters.
5951
   *
5952
   * @see   : http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
5953
   *
5954
   * @author: Derek E. [email protected]
5955
   *
5956
   * @return array <p>
5957
   *               An array with all known whitespace characters as values and the type of whitespace as keys
5958
   *               as defined in above URL.
5959
   *               </p>
5960
   */
5961
  public static function whitespace_table()
  {
    // plain accessor for the static $whitespaceTable property
    $table = self::$whitespaceTable;

    return $table;
  }
5965
5966
  /**
5967
   * Limit the number of words in a string.
5968
   *
5969
   * @param string $str      <p>The input string.</p>
5970
   * @param int    $words    <p>The limit of words as integer.</p>
5971 1
   * @param string $strAddOn <p>Suffix appended when the string was truncated.</p>
5972
   *
5973 1
   * @return string
5974 1
   */
5975
  public static function words_limit($str, $words = 100, $strAddOn = '...')
  {
    $str = (string)$str;

    if ($str === '') {
      return '';
    }

    $words = (int)$words;

    if ($words < 1) {
      return '';
    }

    // leading whitespace, then up to $words runs of non-whitespace, each with its trailing whitespace
    $pattern = '/^\s*+(?:\S++\s*+){1,' . $words . '}/u';
    preg_match($pattern, $str, $matches);

    // only when the match is shorter than the input was something cut off
    $wasTruncated = isset($matches[0])
                    &&
                    self::strlen($str) !== self::strlen($matches[0]);

    if ($wasTruncated) {
      return self::rtrim($matches[0]) . $strAddOn;
    }

    return $str;
  }
6001
6002
  /**
6003
   * Wraps a string to a given number of characters
6004
   *
6005
   * @link  http://php.net/manual/en/function.wordwrap.php
6006
   *
6007
   * @param string $str   <p>The input string.</p>
6008 6
   * @param int    $width [optional] <p>The column width.</p>
6009
   * @param string $break [optional] <p>The line is broken using the optional break parameter.</p>
6010 6
   * @param bool   $cut   [optional] <p>
6011 1
   *                      If the cut is set to true, the string is
6012
   *                      always wrapped at or before the specified width. So if you have
6013
   *                      a word that is larger than the given width, it is broken apart.
6014 1
   *                      </p>
6015 1
   *
6016 1
   * @return string <p>The given string wrapped at the specified column.</p>
6017 1
   */
6018
  public static function wordwrap($str, $width = 75, $break = "\n", $cut = false)
  {
    $str = (string)$str;
    $break = (string)$break;

    if ($str === '' || $break === '') {
      return '';
    }

    $segments = explode($break, $str);
    $segmentCount = count($segments);

    if ($segmentCount === 1 && $segments[0] === '') {
      return '';
    }

    // $chars holds the real characters, $mask holds one single-byte placeholder
    // per entry of $chars: '#' for an existing break, ' ' for a space and '?'
    // for anything else. The native wordwrap() then only sees single-byte input.
    $chars = array();
    $mask = '';
    foreach ($segments as $index => $segment) {

      if ($index > 0) {
        $chars[] = $break;
        $mask .= '#';
      }

      foreach (self::split($segment) as $char) {
        $chars[] = $char;
        $mask .= ($char === ' ') ? ' ' : '?';
      }
    }

    // after wrapping, every '#' in the mask marks a position where a break belongs
    $mask = wordwrap($mask, $width, '#', $cut);

    $wrapped = '';
    $charIndex = 0;  // next unconsumed entry of $chars
    $maskPos = -1;   // current position inside $mask
    $markerPos = -1; // position of the '#' currently processed

    while (false !== ($markerPos = self::strpos($mask, '#', $markerPos + 1))) {

      // copy the characters in front of the marker
      ++$maskPos;
      while ($maskPos < $markerPos) {
        $wrapped .= $chars[$charIndex];
        unset($chars[$charIndex]);
        ++$charIndex;
        ++$maskPos;
      }

      // an original break or a space at the marker position is dropped;
      // any other character there is kept for the next line
      if ($break === $chars[$charIndex] || ' ' === $chars[$charIndex]) {
        unset($chars[$charIndex]);
        ++$charIndex;
      }

      $wrapped .= $break;
    }

    // whatever is left belongs to the last line
    return $wrapped . implode('', $chars);
  }
6073
6074
  /**
6075
   * Returns an array of Unicode White Space characters.
6076
   *
6077
   * @return array <p>An array with numeric code point as key and White Space Character as value.</p>
6078
   */
6079
  public static function ws()
  {
    // plain accessor for the static $whitespace property
    $whitespace = self::$whitespace;

    return $whitespace;
  }
6083
6084
}
6085