Urlizer::unaccent()   B
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 136
Code Lines 120

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 136
rs 8.2857
cc 3
eloc 120
nc 3
nop 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Mykees\TagBundle\Util;
4
5
    /*
6
     * This class is from DoctrineExtensions Bundle
7
     * https://github.com/l3pp4rd/DoctrineExtensions/blob/master/lib/Gedmo/Sluggable/Util/Urlizer.php
8
     */
9
10
/**
11
 * This is the part taken from Doctrine 1.2.3
12
 * Doctrine inflector has static methods for inflecting text
13
 *
14
 * The methods in these classes are from several different sources collected
15
 * across several different php projects and several different authors. The
16
 * original author names and emails are not known
17
 *
18
 * Uses 3rd party libraries and functions:
19
 *         http://sourceforge.net/projects/phputf8
20
 *
21
 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
22
 * @since       1.0
23
 * @version     $Revision: 3189 $
24
 * @author      Konsta Vesterinen <[email protected]>
25
 * @author      Jonathan H. Wage <[email protected]>
26
 * @author         <[email protected]>
27
 */
28
class Urlizer
{
    /**
     * Check whether a string looks like it is UTF-8 encoded.
     *
     * Only the byte structure is inspected: every lead byte must be followed
     * by the number of continuation bytes (10bbbbbb) it announces. Overlong
     * forms, surrogates and out-of-range code points are NOT rejected here;
     * use validUtf8() for a strict check.
     *
     * By bmorel at ssi dot fr
     *
     * @param  string  $string
     * @return boolean true when every multi-byte sequence is structurally complete
     */
    public static function seemsUtf8($string)
    {
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $byte = ord($string[$i]);

            if ($byte < 0x80) {
                continue; // 0bbbbbbb
            }

            if (($byte & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($byte & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($byte & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($byte & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($byte & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // $n bytes matching 10bbbbbb must follow.
            for ($j = 0; $j < $n; $j++) {
                if (++$i === $length || (ord($string[$i]) & 0xC0) !== 0x80) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Remove any illegal characters, accents, etc.
     *
     * Strings without bytes in the 0x80-0xFF range are returned untouched.
     * Otherwise the string is handled as UTF-8 when seemsUtf8() accepts it,
     * and as a single-byte (ISO-8859-1) string when it does not.
     *
     * @param  string $string  String to unaccent
     * @return string $string  Unaccented string
     */
    public static function unaccent($string)
    {
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        if (self::seemsUtf8($string)) {
            return strtr($string, self::utf8ReplacementMap());
        }

        // Assume ISO-8859-1 if not UTF-8
        return self::unaccentLatin1($string);
    }

    /**
     * Build the UTF-8 byte sequence => ASCII replacement table used by unaccent().
     *
     * Later entries override earlier ones with the same key, so the German and
     * Norwegian digraph mappings at the end take precedence over the plain
     * Latin-1 Supplement decompositions.
     *
     * @return array
     */
    private static function utf8ReplacementMap()
    {
        return array(
            // Decompositions for Latin-1 Supplement
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
            chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
            // Euro Sign
            chr(226).chr(130).chr(172) => 'E',
            // GBP (Pound) Sign
            chr(194).chr(163) => '',
            // German digraphs (override the single-letter entries above)
            'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
            'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
            // Norwegian characters
            'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
        );
    }

    /**
     * Unaccent a string that is not UTF-8 by treating it as a single-byte string.
     *
     * Single bytes are mapped one-to-one first, then the bytes that expand to
     * two ASCII letters are replaced.
     *
     * @param  string $string
     * @return string
     */
    private static function unaccentLatin1($string)
    {
        // One-to-one byte mapping: the n-th byte of $singleIn becomes the
        // n-th character of $singleOut (both are 65 bytes long).
        $singleIn = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);

        $singleOut = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

        $string = strtr($string, $singleIn, $singleOut);

        // Bytes that expand to two ASCII letters.
        $doubleIn = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
        $doubleOut = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');

        return str_replace($doubleIn, $doubleOut, $string);
    }

    /**
     * Build a URL-friendly slug from a text.
     *
     * Does not transliterate correctly eastern languages
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    public static function urlize($text, $separator = '-')
    {
        return self::postProcessText(self::unaccent($text), $separator);
    }

    /**
     * Convert UTF-8 text to an ASCII slug.
     *
     * This class does not define the utf8ToAscii() transliteration helper
     * the original code called, so that call always ended in a fatal
     * "undefined method" error. Valid UTF-8 input is now folded with
     * unaccent() instead; characters outside its table are not transliterated
     * and end up replaced by the separator.
     *
     * @param string $text
     * @param string $separator
     * @return string $text
     */
    public static function transliterate($text, $separator = '-')
    {
        if (preg_match('/[\x80-\xff]/', $text) && self::validUtf8($text)) {
            $text = self::unaccent($text);
        }

        return self::postProcessText($text, $separator);
    }

    /**
     * Tests a string as to whether it's valid UTF-8 and supported by the
     * Unicode standard
     * Note: this function has been modified to simple return true or false
     *
     * Rejects overlong encodings, surrogates, code points above 0x10FFFF,
     * 5/6-octet sequences, stray continuation octets and strings that end in
     * the middle of a multi-octet sequence.
     *
     * @author <[email protected]>
     * @param string $str UTF-8 encoded string
     * @return boolean true if valid
     * @see http://hsivonen.iki.fi/php-utf8/
     */
    public static function validUtf8($str)
    {
        $mState = 0;     // expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // Unicode character being assembled
        $mBytes = 1;     // expected number of octets in the current sequence

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square-bracket offset access: the curly-brace form is a parse
            // error since PHP 8.0.
            $in = ord($str[$i]);

            if (0 === $mState) {
                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 === (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $mBytes = 1;
                } elseif (0xC0 === (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 === (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 === (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 === (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC === (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    return false;
                }

                continue;
            }

            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence.
            if (0x80 !== (0xC0 & $in)) {
                // Incomplete multi-octet sequence.
                return false;
            }

            // Legal continuation: merge its 6 payload bits into the code point.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 === --$mState) {
                // End of the multi-octet sequence. mUcs4 now contains the final
                // Unicode codepoint; check for illegal sequences and codepoints.
                if (
                    // From Unicode 3.1, non-shortest form is illegal
                    ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }

                // Reset the state for the next character.
                $mUcs4  = 0;
                $mBytes = 1;
            }
        }

        // A non-zero state here means the string ended in the middle of a
        // multi-octet sequence, which is not valid UTF-8.
        return 0 === $mState;
    }

    /**
     * Cleans up the text and adds separator
     *
     * The text is lower-cased, every non-word character becomes a space and
     * each remaining run of characters outside the allowed set is replaced by
     * the separator; leading and trailing separator characters are trimmed.
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    private static function postProcessText($text, $separator)
    {
        if (function_exists('mb_strtolower')) {
            $text = mb_strtolower($text);
        } else {
            $text = strtolower($text);
        }

        // Remove all non-word characters
        $text = preg_replace('/\W/', ' ', $text);

        // Inflector-style steps, kept in their original order. The text has
        // already been lower-cased and stripped of non-word characters at this
        // point, so the first three patterns are not expected to match.
        $text = preg_replace('/::/', '/', $text);
        $text = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $text);
        $text = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $text);

        // More stripping. Replace spaces (and anything else left over) with
        // the separator.
        $text = preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $text);
        $text = strtolower($text);

        return trim($text, $separator);
    }
}
370