Urlizer::validUtf8() - Code Metrics - Mykees/MkMediaBundle - Measure and Improve Code Quality continuously with Scrutinizer

Urlizer::validUtf8() D
last analyzed 2016-01-31 20:31 UTC

↳ Parent: Urlizer

Complexity

Conditions	20
Paths	12

Size

Total Lines	104
Code Lines	57

Duplication

Lines	21
Ratio	20.19 %

Importance

Changes	2
Bugs	1	Features	0

Metric	Value
c	2
b	1
f	0
dl	21
loc	104
rs	4.7294
cc	20
eloc	57
nc	12
nop	1

How to fix Long Method Complexity

<?php

namespace Mykees\MediaBundle\Util;

/*
 * This class is from DoctrineExtensions Bundle
 * https://github.com/l3pp4rd/DoctrineExtensions/blob/master/lib/Gedmo/Sluggable/Util/Urlizer.php
 */

/**
 * This is the part taken from Doctrine 1.2.3
 * Doctrine inflector has static methods for inflecting text
 *
 * The methods in these classes are from several different sources collected
 * across several different php projects and several different authors. The
 * original author names and emails are not known
 *
 * Uses 3rd party libraries and functions:
 *         http://sourceforge.net/projects/phputf8
 *
 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
 * @since       1.0
 * @version     $Revision: 3189 $
 * @author      Konsta Vesterinen <[email protected]>
 * @author      Jonathan H. Wage <[email protected]>
 * @author         <[email protected]>
 */
class Urlizer
{
    /**
     * Heuristically check whether a string looks like UTF-8.
     *
     * Only the byte structure is verified (a lead byte followed by the
     * announced number of 10bbbbbb continuation bytes). Overlong forms,
     * surrogates and out-of-range code points are NOT rejected here; use
     * validUtf8() for a strict check.
     *
     * By bmorel at ssi dot fr
     *
     * @param  string $string
     * @return boolean True if every byte sequence is structurally UTF-8
     */
    public static function seemsUtf8($string)
    {
        $stringLength = strlen($string);
        for ($i = 0; $i < $stringLength; $i++) {
            $byte = ord($string[$i]);

            if ($byte < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($byte & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($byte & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($byte & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($byte & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($byte & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // $n bytes matching 10bbbbbb must follow
            for ($j = 0; $j < $n; $j++) {
                if (++$i === $stringLength || (ord($string[$i]) & 0xC0) !== 0x80) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Replace accented Latin characters with ASCII approximations.
     *
     * Pure-ASCII input is returned untouched. Input that looks like UTF-8
     * (see seemsUtf8()) is mapped through a multi-byte table; anything else
     * is assumed to be ISO-8859-1 and mapped byte by byte. Characters that
     * are not in the tables are left as they are.
     *
     * @param  string $string  String to unaccent
     * @return string $string  Unaccented string
     */
    public static function unaccent($string)
    {
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        if (self::seemsUtf8($string)) {
            // Note: when a key appears twice in this literal the LAST entry
            // wins, so the German/Norwegian overrides at the end take
            // precedence over the generic Latin-1 decompositions.
            $chars = array(
            // Decompositions for Latin-1 Supplement
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
            chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
            // Euro Sign
            chr(226).chr(130).chr(172) => 'E',
            // GBP (Pound) Sign
            chr(194).chr(163) => '',
            'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
            'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
            // Norwegian characters
            // NOTE(review): 'æ' maps to 'a' here but to 'ae' in the
            // ISO-8859-1 branch below; left unchanged so existing slugs
            // stay stable - confirm which one is intended.
            'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
            );

            return strtr($string, $chars);
        }

        // Assume ISO-8859-1 if not UTF-8.
        // $singleIn and $singleOut are parallel: the n-th byte of $singleIn
        // is replaced by the n-th character of $singleOut (65 entries each).
        $singleIn = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);
        $singleOut = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";
        $string = strtr($string, $singleIn, $singleOut);

        // Bytes that expand to two ASCII letters cannot go through the
        // byte-for-byte strtr() above, so they are replaced separately.
        $doubleIn = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
        $doubleOut = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');

        return str_replace($doubleIn, $doubleOut, $string);
    }


    /**
     * Build a URL-friendly slug from a text.
     *
     * Accents are stripped with unaccent(), then the text is lower-cased
     * and every run of non-alphanumeric characters becomes $separator.
     * Does not transliterate correctly eastern languages.
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    public static function urlize($text, $separator = '-')
    {
        return self::postProcessText(self::unaccent($text), $separator);
    }

    /**
     * Build a URL-friendly slug, converting non-ASCII UTF-8 input first.
     *
     * The upstream implementation delegated to a utf8ToAscii() method backed
     * by transliteration tables. That method is not part of this class, so
     * calling it ended in a fatal "undefined method" error for any valid
     * non-ASCII UTF-8 input. The conversion is therefore done with
     * unaccent(), which only covers the Latin characters in its tables.
     *
     * Text that contains high bytes but is not valid UTF-8 is passed to the
     * clean-up step unchanged, as before.
     *
     * @param string $text
     * @param string $separator
     * @return string $text
     */
    public static function transliterate($text, $separator = '-')
    {
        if (preg_match('/[\x80-\xff]/', $text) && self::validUtf8($text)) {
            $text = self::unaccent($text);
        }

        return self::postProcessText($text, $separator);
    }

    /**
     * Tests a string as to whether it's valid UTF-8 and supported by the
     * Unicode standard.
     *
     * Rejects stray continuation bytes, overlong (non-shortest) encodings,
     * 5- and 6-byte sequences, UTF-16 surrogates, code points above
     * U+10FFFF and input that ends in the middle of a multi-byte sequence.
     *
     * Note: this function has been modified to simply return true or false
     * @author <[email protected]>
     * @param string $str UTF-8 encoded string
     * @return boolean true if valid
     * @see http://hsivonen.iki.fi/php-utf8/
     */
    public static function validUtf8($str)
    {
        $mState = 0;     // expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // Unicode code point assembled so far
        $mBytes = 1;     // expected number of octets in the current sequence

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square brackets: the {} offset syntax is a parse error on PHP 8.
            $in = ord($str[$i]);

            if (0 === $mState) {
                // When mState is zero we expect either a US-ASCII character
                // or the first octet of a multi-octet sequence.
                if (0 === (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $mBytes = 1;
                } elseif (0xC0 === (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 === (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 === (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 === (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC === (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    return false;
                }

                continue;
            }

            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence.
            if (0x80 !== (0xC0 & $in)) {
                // Incomplete multi-octet sequence.
                return false;
            }

            // Legal continuation: merge its six payload bits into place.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 === --$mState) {
                // End of the multi-octet sequence. mUcs4 now contains the
                // final Unicode codepoint; check for illegal sequences and
                // codepoints.
                if (// From Unicode 3.1, non-shortest form is illegal
                    ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }

                // Reset the cache for the next character.
                $mUcs4  = 0;
                $mBytes = 1;
            }
        }

        // A non-zero state here means the string stopped in the middle of a
        // multi-octet sequence, which is not valid UTF-8.
        return 0 === $mState;
    }

    /**
     * Cleans up the text and adds separator
     *
     * Lower-cases the text, turns every non-word character into a space and
     * collapses each run of characters outside [A-Za-z0-9^/] into one
     * $separator. Leading and trailing separators are trimmed.
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    private static function postProcessText($text, $separator)
    {
        if (function_exists('mb_strtolower')) {
            $text = mb_strtolower($text);
        } else {
            $text = strtolower($text);
        }

        // Remove all none word characters
        $text = preg_replace('/\W/', ' ', $text);

        // More stripping, kept in the upstream order. Because the text has
        // already been lower-cased and stripped of ':' above, the first
        // three replacements are not expected to match anything here.
        $text = preg_replace('/::/', '/', $text);
        $text = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $text);
        $text = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $text);

        // Replace spaces (and anything else left over) with the separator
        $text = strtolower(preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $text));

        return trim($text, $separator);
    }
}


1		<?php
2
3		namespace Mykees\MediaBundle\Util;
4
5		/*
6		* This class is from DoctrineExtensions Bundle
7		* https://github.com/l3pp4rd/DoctrineExtensions/blob/master/lib/Gedmo/Sluggable/Util/Urlizer.php
8		*/
9
10		/**
11		* This is the part taken from Doctrine 1.2.3
12		* Doctrine inflector has static methods for inflecting text
13		*
14		* The methods in these classes are from several different sources collected
15		* across several different php projects and several different authors. The
16		* original author names and emails are not known
17		*
18		* Uses 3rd party libraries and functions:
19		* http://sourceforge.net/projects/phputf8
20		*
21		* @license http://www.opensource.org/licenses/lgpl-license.php LGPL
22		* @since 1.0
23		* @version $Revision: 3189 $
24		* @author Konsta Vesterinen <[email protected]>
25		* @author Jonathan H. Wage <[email protected]>
26		* @author <[email protected]>
27		*/
28		class Urlizer
29		{
30		/**
31		* Check if a string has utf7 characters in it
32		*
33		* By bmorel at ssi dot fr
34		*
35		* @param string $string
36		* @return boolean $bool
37		*/
38		public static function seemsUtf8($string)
39		{
40		$stringLength = strlen($string);
41		for ($i = 0; $i < $stringLength; $i++) {
42		if (ord($string[$i]) < 0x80) continue; # 0bbbbbbb
43		elseif ((ord($string[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
44		elseif ((ord($string[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
45		elseif ((ord($string[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
46		elseif ((ord($string[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
47		elseif ((ord($string[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
48		else return false; # Does not match any model
49		for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
50		if ((++$i == strlen($string)) \|\| ((ord($string[$i]) & 0xC0) != 0x80))
51		return false;
52		}
53		}
54		return true;
55		}
56
57		/**
58		* Remove any illegal characters, accents, etc.
59		*
60		* @param string $string String to unaccent
61		* @return string $string Unaccented string
62		*/
63		public static function unaccent($string)
64		{
65		if (!preg_match('/[\x80-\xff]/', $string)) {
66		return $string;
67		}
68
69		if (self::seemsUtf8($string)) {
70		$chars = array(
71		// Decompositions for Latin-1 Supplement
72		chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
73		chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
74		chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
75		chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
76		chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
77		chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
78		chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
79		chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
80		chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
81		chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
82		chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
83		chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
84		chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
85		chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
86		chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
87		chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
88		chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
89		chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
90		chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
91		chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
92		chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
93		chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
94		chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
95		chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
96		chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
97		chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
98		chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
99		chr(195).chr(191) => 'y',
100		// Decompositions for Latin Extended-A
101		chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
102		chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
103		chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
104		chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
105		chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
106		chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
107		chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
108		chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
109		chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
110		chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
111		chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
112		chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
113		chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
114		chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
115		chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
116		chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
117		chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
118		chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
119		chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
120		chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
121		chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
122		chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
123		chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
124		chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
125		chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
126		chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
127		chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
128		chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
129		chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
130		chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
131		chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
132		chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
133		chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
134		chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
135		chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
136		chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
137		chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
138		chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
139		chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
140		chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
141		chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
142		chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
143		chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
144		chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
145		chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
146		chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
147		chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
148		chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
149		chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
150		chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
151		chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
152		chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
153		chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
154		chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
155		chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
156		chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
157		chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
158		chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
159		chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
160		chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
161		chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
162		chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
163		chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
164		chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
165		// Euro Sign
166		chr(226).chr(130).chr(172) => 'E',
167		// GBP (Pound) Sign
168		chr(194).chr(163) => '',
169		'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
170		'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
171		// Norwegian characters
172		'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
173		);
174
175		$string = strtr($string, $chars);
176		} else {
177		// Assume ISO-8859-1 if not UTF-8
178		$chars['in'] = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
		0 ignored issues – show Coding Style Comprehensibility introduced 2015-11-25 14:39 UTC by Report Bug Copy Issue Report `$chars` was never initialized. Although not strictly required by PHP, it is generally a good practice to add `$chars = array();` before regardless. Adding an explicit array definition is generally preferable to implicit array definition as it guarantees a stable state of the code. Let’s take a look at an example: foreach ($collection as $item) { $myArray['foo'] = $item->getFoo(); if ($item->hasBar()) { $myArray['bar'] = $item->getBar(); } // do something with $myArray } As you can see in this example, the array `$myArray` is initialized the first time when the foreach loop is entered. You can also see that the value of the `bar` key is only written conditionally; thus, its value might result from a previous iteration. This might or might not be intended. To make your intention clear, your code more readable and to avoid accidental bugs, we recommend adding an explicit initialization `$myArray = array()` either outside or inside the foreach loop. Loading history...
179		.chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
180		.chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
181		.chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
182		.chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
183		.chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
184		.chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
185		.chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
186		.chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
187		.chr(252).chr(253).chr(255);
188
189		$chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";
190
191		$string = strtr($string, $chars['in'], $chars['out']);
192		$doubleChars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
		0 ignored issues – show Coding Style Comprehensibility introduced 2015-11-25 14:39 UTC by Report Bug Copy Issue Report `$doubleChars` was never initialized. Although not strictly required by PHP, it is generally a good practice to add `$doubleChars = array();` before regardless. Adding an explicit array definition is generally preferable to implicit array definition as it guarantees a stable state of the code. Let’s take a look at an example: foreach ($collection as $item) { $myArray['foo'] = $item->getFoo(); if ($item->hasBar()) { $myArray['bar'] = $item->getBar(); } // do something with $myArray } As you can see in this example, the array `$myArray` is initialized the first time when the foreach loop is entered. You can also see that the value of the `bar` key is only written conditionally; thus, its value might result from a previous iteration. This might or might not be intended. To make your intention clear, your code more readable and to avoid accidental bugs, we recommend adding an explicit initialization `$myArray = array()` either outside or inside the foreach loop. Loading history...
193		$doubleChars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');
194		$string = str_replace($doubleChars['in'], $doubleChars['out'], $string);
195		}
196
197		return $string;
198		}
199
200
201		/**
202		* Does not transliterate correctly eastern languages
203		*
204		* @param string $text
205		* @param string $separator
206		* @return string
207		*/
208		public static function urlize($text, $separator = '-')
209		{
210		$text = self::unaccent($text);
211		return self::postProcessText($text, $separator);
212		}
213
214		/**
215		* Uses transliteration tables to convert any kind of utf8 character
216		*
217		* @param string $text
218		* @param string $separator
219		* @return string $text
220		*/
221		public static function transliterate($text, $separator = '-')
222		{
223		if (preg_match('/[\x80-\xff]/', $text) && self::validUtf8($text)) {
224		$text = self::utf8ToAscii($text);
		0 ignored issues – show Bug introduced 2015-11-25 14:39 UTC by Report Bug Copy Issue Report The method `utf8ToAscii()` does not seem to exist on `object<Mykees\MediaBundle\Util\Urlizer>`. This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces. This is most likely a typographical error or the method has been renamed. Loading history...
225		}
226		return self::postProcessText($text, $separator);
227		}
228
229		/**
230		* Tests a string as to whether it's valid UTF-8 and supported by the
231		* Unicode standard
232		* Note: this function has been modified to simple return true or false
233		* @author <[email protected]>
234		* @param string UTF-8 encoded string
235		* @return boolean true if valid
236		* @see http://hsivonen.iki.fi/php-utf8/
237		*/
public static function validUtf8($str)
{
    // Validates $str as UTF-8 by walking it octet by octet with a small
    // state machine. Returns true only when every octet belongs to a
    // complete, shortest-form sequence that encodes a Unicode scalar value
    // (no surrogates, nothing above U+10FFFF). An empty string is valid.
    $str = (string) $str;

    $mState = 0; // continuation octets still expected for the current character
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // total number of octets announced by the current lead octet

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // Square-bracket offset: the curly-brace form was removed in PHP 8.0.
        $in = ord($str[$i]);

        if (0 === $mState) {
            // Between characters: expect US-ASCII or the lead octet of a
            // multi-octet sequence.
            if (0 === (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            } elseif (0xC0 === (0xE0 & $in)) {
                // First octet of a 2 octet sequence (110bbbbb).
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 === (0xF0 & $in)) {
                // First octet of a 3 octet sequence (1110bbbb).
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 === (0xF8 & $in)) {
                // First octet of a 4 octet sequence (11110bbb).
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } else {
                // Stray continuation octet, a 5/6 octet lead (always either
                // non-shortest form or beyond U+10FFFF), or 0xFE/0xFF.
                return false;
            }

            continue;
        }

        // Inside a multi-octet sequence: only a continuation octet
        // (10bbbbbb) is acceptable here.
        if (0x80 !== (0xC0 & $in)) {
            return false;
        }

        // Each continuation octet contributes 6 bits; the last one lands
        // in the lowest bits of the code point.
        $mUcs4 |= ($in & 0x3F) << (($mState - 1) * 6);

        if (0 !== --$mState) {
            continue;
        }

        // End of the sequence: $mUcs4 now holds the decoded code point.
        if (
            // From Unicode 3.1, non-shortest form is illegal
            ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
            ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
            ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
            // From Unicode 3.2, surrogate characters are illegal
            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
            // Codepoints outside the Unicode range are illegal
            ($mUcs4 > 0x10FFFF)
        ) {
            return false;
        }

        // Reset the cache for the next character.
        $mUcs4 = 0;
        $mBytes = 1;
    }

    // A non-zero state means the input ended in the middle of a
    // multi-octet sequence, which is not valid UTF-8.
    return 0 === $mState;
}
342
343		/**
344		* Cleans up the text and adds separator
345		*
346		* @param string $text
347		* @param string $separator
348		* @return string
349		*/
private static function postProcessText($text, $separator)
{
    // Lower-case the input, preferring the multibyte-aware function when
    // the mbstring extension provides it.
    $lowered = function_exists('mb_strtolower') ? mb_strtolower($text) : strtolower($text);

    // Replace every non-word character with a space.
    $spaced = preg_replace('/\W/', ' ', $lowered);

    // Namespace-style "::" becomes "/".
    $slug = preg_replace('/::/', '/', $spaced);

    // Insert "_" at camel-case boundaries ("ABCd" -> "AB_Cd", "aB" -> "a_B").
    $slug = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $slug);
    $slug = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $slug);

    // Collapse each run of characters other than ASCII letters, digits,
    // "^" and "/" into the separator.
    $slug = preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $slug);

    // Drop separator characters left over at either end.
    return trim(strtolower($slug), $separator);
}
369		}
370

Mykees / MkMediaBundle

Urlizer::validUtf8() D last analyzed 2016-01-31 20:31 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like

Urlizer::validUtf8() D
last analyzed 2016-01-31 20:31 UTC