Urlizer::validUtf8()   D
last analyzed

Complexity

Conditions 20
Paths 12

Size

Total Lines 104
Code Lines 57

Duplication

Lines 21
Ratio 20.19 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
c 2
b 1
f 0
dl 21
loc 104
rs 4.7294
cc 20
eloc 57
nc 12
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Mykees\MediaBundle\Util;
4
5
/*
6
 * This class is from DoctrineExtensions Bundle
7
 * https://github.com/l3pp4rd/DoctrineExtensions/blob/master/lib/Gedmo/Sluggable/Util/Urlizer.php
8
 */
9
10
/**
11
 * This is the part taken from Doctrine 1.2.3
12
 * Doctrine inflector has static methods for inflecting text
13
 *
14
 * The methods in these classes are from several different sources collected
15
 * across several different php projects and several different authors. The
16
 * original author names and emails are not known
17
 *
18
 * Uses 3rd party libraries and functions:
19
 *         http://sourceforge.net/projects/phputf8
20
 *
21
 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
22
 * @since       1.0
23
 * @version     $Revision: 3189 $
24
 * @author      Konsta Vesterinen <[email protected]>
25
 * @author      Jonathan H. Wage <[email protected]>
26
 * @author         <[email protected]>
27
 */
28
class Urlizer
{
    /**
     * Heuristically check whether a string looks like UTF-8.
     *
     * Only the byte structure is inspected: every lead byte must be followed
     * by the number of 10bbbbbb continuation bytes it announces. Lead bytes
     * announcing 5- and 6-byte sequences are accepted, and neither overlong
     * forms nor code point ranges are checked; use validUtf8() for a strict
     * test.
     *
     * By bmorel at ssi dot fr
     *
     * @param  string $string Byte string to inspect
     * @return boolean true when every multi-byte sequence is well formed
     */
    public static function seemsUtf8($string)
    {
        $stringLength = strlen($string);

        for ($i = 0; $i < $stringLength; $i++) {
            $byte = ord($string[$i]);

            if ($byte < 0x80) {
                continue; // 0bbbbbbb
            }

            // Number of continuation bytes announced by the lead byte.
            if (($byte & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($byte & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($byte & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($byte & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($byte & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // The next $n bytes must all exist and match 10bbbbbb.
            for ($j = 0; $j < $n; $j++) {
                if (++$i === $stringLength || (ord($string[$i]) & 0xC0) !== 0x80) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Replace accented characters with plain ASCII approximations.
     *
     * Strings without any byte in the 0x80-0xFF range are returned untouched.
     * Otherwise the string is treated as UTF-8 when seemsUtf8() accepts it,
     * and as ISO-8859-1 when it does not.
     *
     * @param  string $string  String to unaccent
     * @return string $string  Unaccented string
     */
    public static function unaccent($string)
    {
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        if (self::seemsUtf8($string)) {
            // Later entries override earlier ones with the same key, so the
            // literal German/Norwegian mappings at the end of the table take
            // precedence over the generic chr() decompositions above them.
            $chars = array(
            // Decompositions for Latin-1 Supplement
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
            chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
            // Euro Sign
            chr(226).chr(130).chr(172) => 'E',
            // GBP (Pound) Sign
            chr(194).chr(163) => '',
            'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
            'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
            // Norwegian characters
            'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
            );

            return strtr($string, $chars);
        }

        // Assume ISO-8859-1 if not UTF-8.
        // Single-byte characters mapped one-to-one: the n-th byte of
        // $singleIn is replaced by the n-th character of $singleOut.
        $singleIn = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);
        $singleOut = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

        $string = strtr($string, $singleIn, $singleOut);

        // Characters that expand to two ASCII letters.
        $doubleIn  = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
        $doubleOut = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');

        return str_replace($doubleIn, $doubleOut, $string);
    }

    /**
     * Build a URL-friendly slug by unaccenting and then cleaning up the text.
     *
     * Does not transliterate correctly eastern languages
     *
     * @param string $text
     * @param string $separator Character placed between words
     * @return string
     */
    public static function urlize($text, $separator = '-')
    {
        return self::postProcessText(self::unaccent($text), $separator);
    }

    /**
     * Build a URL-friendly slug from UTF-8 text.
     *
     * The transliteration tables behind the original utf8ToAscii() helper are
     * not part of this class, so calling it raised a fatal "undefined method"
     * error for every valid non-ASCII UTF-8 input. Valid UTF-8 input is now
     * reduced with unaccent() instead; text that is pure ASCII or not valid
     * UTF-8 goes straight to the clean-up step, as before.
     *
     * @param string $text
     * @param string $separator Character placed between words
     * @return string $text
     */
    public static function transliterate($text, $separator = '-')
    {
        if (preg_match('/[\x80-\xff]/', $text) && self::validUtf8($text)) {
            $text = self::unaccent($text);
        }

        return self::postProcessText($text, $separator);
    }

    /**
     * Tests a string as to whether it's valid UTF-8 and supported by the
     * Unicode standard
     * Note: this function has been modified to simple return true or false
     *
     * Rejected: stray continuation bytes, bytes that cannot start a sequence,
     * non-shortest (overlong) forms, 5- and 6-byte sequences, surrogate code
     * points, code points above 0x10FFFF, and strings that end in the middle
     * of a multi-octet sequence.
     *
     * @param string $str UTF-8 encoded string
     * @return boolean true if valid
     * @see http://hsivonen.iki.fi/php-utf8/
     */
    public static function validUtf8($str)
    {
        $mState = 0;     // expected number of octets still to come in the current sequence
        $mUcs4  = 0;     // code point assembled so far
        $mBytes = 1;     // total number of octets announced for the current sequence

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square-bracket offset: the curly-brace form is a parse error on PHP 8.
            $in = ord($str[$i]);

            if (0 === $mState) {
                // Expect either a US-ASCII character or the first octet of a
                // multi-octet sequence.
                if (0 === (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $mBytes = 1;
                } elseif (0xC0 === (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4  = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 === (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4  = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 === (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4  = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 === (0xFC & $in)) {
                    // First octet of 5 octet sequence. Always illegal, but we
                    // carry on to the end of the sequence and let the
                    // ($mBytes > 4) check below reject it.
                    $mUcs4  = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC === (0xFE & $in)) {
                    // First octet of 6 octet sequence, handled like the 5 octet case.
                    $mUcs4  = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Neither US-ASCII nor a legal first octet of a
                    // multi-octet sequence.
                    return false;
                }

                continue;
            }

            // Inside a multi-octet sequence: only a continuation octet is legal.
            if (0x80 !== (0xC0 & $in)) {
                return false;
            }

            // Merge the six payload bits into their position in the code point.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 === --$mState) {
                // End of the sequence: $mUcs4 holds the complete code point.
                if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    // From Unicode 3.1, non-shortest form is illegal (above),
                    // and so is anything longer than 4 octets
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }

                // Reset for the next character.
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
            }
        }

        // A non-zero state here means the string ended mid-sequence.
        return 0 === $mState;
    }

    /**
     * Cleans up the text and adds separator
     *
     * The text is lower-cased, every non-word character becomes a space, and
     * each run of characters other than ASCII letters, digits, "^" and "/"
     * is collapsed into one separator. Leading and trailing separator
     * characters are trimmed.
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    private static function postProcessText($text, $separator)
    {
        if (function_exists('mb_strtolower')) {
            $text = mb_strtolower($text);
        } else {
            $text = strtolower($text);
        }

        // Remove all none word characters
        $text = preg_replace('/\W/', ' ', $text);

        // More stripping, applied in the same order as the original nested
        // expression (innermost call first).
        $text = preg_replace('/::/', '/', $text);
        $text = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $text);
        $text = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $text);

        // Replace spaces (and anything else not allowed) with the separator
        $text = preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $text);
        $text = strtolower($text);

        return trim($text, $separator);
    }
}
370