Transliterator - Code Metrics - gplcart/transliterator - Measure and Improve Code Quality continuously with Scrutinizer

Transliterator A
last analyzed 2018-03-10 14:47 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	190
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	0

Importance

Changes

Metric	Value
wmc	35
lcom	1
cbo	0
dl	0
loc	190
rs	9
c	0
b	0
f	0

4 Methods

Rating	Name	Size	Complexity
D	transliterate()	90	19
A	getAreas()	6	1
D	getTailBytes()	32	9
B	replace()	29	6

<?php

/**
 * @package Transliterator
 * @author Iurii Makukh
 * @copyright Copyright (c) 2017, Iurii Makukh
 * @license https://www.gnu.org/licenses/gpl-3.0.en.html GPL-3.0+
 */

namespace gplcart\modules\transliterator\helpers;

/**
 * Helper class for Transliterator module
 * Based on code from https://www.drupal.org/project/transliteration
 */
class Transliterator
{

    /**
     * Transliterates a string to ASCII.
     *
     * Pure-ASCII input is returned unchanged. Otherwise the string is chopped
     * into ASCII and non-ASCII areas; every well-formed multi-byte sequence is
     * decoded to its code point and passed to replace(), while malformed bytes
     * are replaced with $unknown.
     *
     * @param string $string Text to transliterate
     * @param string $unknown Replacement for invalid bytes and unmapped characters
     * @param null|string $source_langcode Optional language code forwarded to replace()
     * @return string
     */
    public function transliterate($string, $unknown = '?', $source_langcode = null)
    {
        // Plain ASCII needs no work at all, so skip building the lookup tables.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        $tail_bytes = $this->getTailBytes();
        $areas = $this->getAreas($string);
        $result = '';

        foreach ($areas[0] as $str) {

            if ($str[0] < "\x80") {
                // ASCII chunk: copy it through verbatim.
                $result .= $str;
                continue;
            }

            // Non-ASCII chunk: walk it byte by byte and validate every sequence.
            $head = '';

            // $len counts the bytes left (plus one) so that "--$len" doubles as
            // the loop condition; $i is the index of the byte being examined.
            $len = strlen($str) + 1;
            for ($i = -1; --$len;) {
                $c = $str[++$i];
                if ($remaining = $tail_bytes[$c]) {
                    // Head byte of a multi-byte sequence: collect its tail bytes.
                    $sequence = $head = $c;
                    do {
                        if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
                            // Legal tail byte.
                            $sequence .= $c;
                        } else {
                            // Either the chunk ended early or the byte is not a
                            // tail byte; the sequence is invalid in both cases.
                            $result .= $unknown;
                            if ($len === 0) {
                                // Premature end of chunk: nothing left to examine.
                                break 2;
                            }
                            // Back up and reprocess this byte; it may itself be
                            // a legal ASCII byte or a sequence head.
                            --$i;
                            ++$len;
                            continue 2;
                        }
                    } while (--$remaining);

                    $ord = $this->getCodePoint($sequence);
                    $result .= $this->replace($ord, $unknown, $source_langcode);
                    $head = '';
                } elseif ($c < "\x80") {
                    // ASCII byte.
                    $result .= $c;
                    $head = '';
                } elseif ($c < "\xc0") {
                    // Tail byte without a preceding head byte.
                    if ($head === '') {
                        $result .= $unknown;
                    }
                } else {
                    // Bytes 0xfe and 0xff never start a sequence.
                    $result .= $unknown;
                    $head = '';
                }
            }
        }

        return $result;
    }

    /**
     * Decodes a validated multi-byte sequence into its code point.
     *
     * The head byte contributes its value minus the length marker
     * (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc); each tail byte contributes its low six bits.
     *
     * @param string $sequence Head byte followed by one to five tail bytes
     * @return int
     */
    protected function getCodePoint($sequence)
    {
        $n = ord($sequence[0]);

        if ($n <= 0xdf) {
            $ord = $n - 0xc0;
        } elseif ($n <= 0xef) {
            $ord = $n - 0xe0;
        } elseif ($n <= 0xf7) {
            $ord = $n - 0xf0;
        } elseif ($n <= 0xfb) {
            $ord = $n - 0xf8;
        } else {
            $ord = $n - 0xfc;
        }

        $length = strlen($sequence);
        for ($k = 1; $k < $length; $k++) {
            $ord = $ord * 64 + (ord($sequence[$k]) - 128);
        }

        return $ord;
    }

    /**
     * Chops the text into pure-ASCII and non-ASCII areas
     *
     * A non-ASCII area starts with a byte >= 0x80 and also swallows any following
     * bytes that are not ASCII letters (A-Z, a-z), as defined by the pattern below.
     *
     * @param string $string
     * @return array Result of preg_match_all(); index 0 holds the list of areas
     */
    protected function getAreas($string)
    {
        $matches = array();
        preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);
        return $matches;
    }

    /**
     * Each UTF-8 head byte is followed by a certain number of tail bytes.
     * This method returns a map of every possible byte to the number of tail
     * bytes that must follow it (0 for bytes that do not start a sequence).
     * The table is built once and cached in a static variable.
     *
     * @staticvar null|array $tail_bytes
     * @return array
     */
    protected function getTailBytes()
    {
        static $tail_bytes;

        if (isset($tail_bytes)) {
            return $tail_bytes;
        }

        $tail_bytes = array();

        for ($n = 0; $n < 256; $n++) {
            if ($n < 0xc0) {
                // ASCII and tail bytes start nothing.
                $remaining = 0;
            } elseif ($n < 0xe0) {
                $remaining = 1;
            } elseif ($n < 0xf0) {
                $remaining = 2;
            } elseif ($n < 0xf8) {
                $remaining = 3;
            } elseif ($n < 0xfc) {
                $remaining = 4;
            } elseif ($n < 0xfe) {
                $remaining = 5;
            } else {
                // 0xfe and 0xff are not head bytes.
                $remaining = 0;
            }

            $tail_bytes[chr($n)] = $remaining;
        }

        return $tail_bytes;
    }

    /**
     * Replaces a Unicode character using the transliteration database.
     *
     * Mappings are loaded per bank (code point >> 8) from
     * ../data/xNN.php and cached per bank and language in a static variable.
     * The data file is presumably expected to populate $base and, optionally,
     * $variant[<langcode>] - verify against the data files.
     *
     * @param int $ord An ordinal Unicode character code.
     * @param string $unknown Replacement string for characters that do not have a suitable ASCII equivalent.
     * @param null|string $langcode Optional ISO 639 language code
     * @return string ASCII replacement character.
     */
    protected function replace($ord, $unknown = '?', $langcode = null)
    {
        static $map = array();

        // A null language code ends up as the '' array key anyway; make that
        // explicit so no null offsets are used.
        $langcode = (string) $langcode;

        $bank = $ord >> 8;
        if (!isset($map[$bank][$langcode])) {

            $file = __DIR__ . '/../data/' . sprintf('x%02x', $bank) . '.php';

            if (file_exists($file)) {

                $variant = $base = array();

                include $file;

                if ($langcode !== 'en' && isset($variant[$langcode])) {
                    // Merge in language specific mappings.
                    $map[$bank][$langcode] = $variant[$langcode] + $base;
                } else {
                    $map[$bank][$langcode] = $base;
                }
            } else {
                // No data for this bank: cache the miss as an empty map.
                $map[$bank][$langcode] = array();
            }
        }

        // Position of the character inside its bank.
        $ord = $ord & 255;
        return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
    }

}


1			<?php
2
3			/**
4			* @package Transliterator
5			* @author Iurii Makukh
6			* @copyright Copyright (c) 2017, Iurii Makukh
7			* @license https://www.gnu.org/licenses/gpl-3.0.en.html GPL-3.0+
8			*/
9
10			namespace gplcart\modules\transliterator\helpers;
11
12			/**
13			* Helper class for Transliterator module
14			* Based on code from https://www.drupal.org/project/transliteration
15			*/
16			class Transliterator
17			{
18
19			/**
20			* Transliterates a string
21			* @param string $string
22			* @param string $unknown
23			* @param null|string $source_langcode
24			* @return string
25			*/
26			public function transliterate($string, $unknown = '?', $source_langcode = null)
27			{
28			// ASCII is always valid NFC! If we're only ever given plain ASCII, we can
29			// avoid the overhead of initializing the decomposition tables by skipping
30			// out early.
31			if (!preg_match('/[\x80-\xff]/', $string)) {
32			return $string;
33			}
34
35			$tail_bytes = $this->getTailBytes();
36			$areas = $this->getAreas($string);
37			$result = '';
38			foreach ($areas[0] as $str) {
39
40			if ($str[0] < "\x80") {
41			// ASCII chunk: guaranteed to be valid UTF-8 and in normal form C, so
42			// skip over it.
43			$result .= $str;
44			continue;
45			}
46
47			// We'll have to examine the chunk byte by byte to ensure that it consists
48			// of valid UTF-8 sequences, and to see if any of them might not be normalized.
49			// Since PHP is not the fastest language on earth, some of this code is a
50			// little ugly with inner loop optimizations.
51			$head = '';
52			$chunk = strlen($str);
53
54			// Counting down is faster. I'm so sorry.
55			$len = $chunk + 1;
56			for ($i = -1; --$len;) {
57			$c = $str[++$i];
58			if ($remaining = $tail_bytes[$c]) {
59			// UTF-8 head byte!
60			$sequence = $head = $c;
61			do {
62			// Look for the defined number of tail bytes...
63			if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
64			// Legal tail bytes are nice.
65			$sequence .= $c;
66			} else {
67			if ($len == 0) {
68			// Premature end of string! Drop a replacement character into
69			// output to represent the invalid UTF-8 sequence.
70			$result .= $unknown;
71			break 2;
72			} else {
73			// Illegal tail byte; abandon the sequence.
74			$result .= $unknown;
75			// Back up and reprocess this byte; it may itself be a legal
76			// ASCII or UTF-8 sequence head.
77			--$i;
78			++$len;
79			continue 2;
80			}
81			}
82			} while (--$remaining);
83			$n = ord($head);
84			if ($n <= 0xdf) {
85			$ord = ($n - 192) * 64 + (ord($sequence[1]) - 128);
86			} elseif ($n <= 0xef) {
87			$ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128);
88			} elseif ($n <= 0xf7) {
89			$ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128);
90			} elseif ($n <= 0xfb) {
91			$ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128);
92			} elseif ($n <= 0xfd) {
93			$ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128);
94			}
95			$result .= $this->replace($ord, $unknown, $source_langcode);
			0 ignored issues – show Bug introduced 2017-04-14 13:00 UTC by Report Bug Copy Issue Report The variable `$ord` does not seem to be defined for all execution paths leading up to this point. If you define a variable conditionally, it can happen that it is not defined for all execution paths. Let’s take a look at an example: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } // $x is potentially undefined here. echo $x; } In the above example, the variable `$x` is defined if you pass “foo” or “bar” as argument for `$a`. However, since the `switch` statement has no default case statement, if you pass any other value, the variable `$x` would be undefined. Available Fixes Check for existence of the variable explicitly: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } if (isset($x)) { // Make sure it's always set. echo $x; } } Define a default value for the variable: function myFunction($a) { $x = ''; // Set a default which gets overridden for certain paths. switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } echo $x; } Add a value for the missing path: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; // We add support for the missing case. default: $x = ''; break; } echo $x; } Loading history...
96			$head = '';
97			} elseif ($c < "\x80") {
98			// ASCII byte.
99			$result .= $c;
100			$head = '';
101			} elseif ($c < "\xc0") {
102			// Illegal tail bytes.
103			if ($head == '') {
104			$result .= $unknown;
105			}
106			} else {
107			// Miscellaneous freaks.
108			$result .= $unknown;
109			$head = '';
110			}
111			}
112			}
113
114			return $result;
115			}
116
117			/**
118			* Chops the text into pure-ASCII and non-ASCII areas
119			* @param string $string
120			* @return array
121			*/
122			protected function getAreas($string)
123			{
124			$matches = array();
125			preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);
126			return $matches;
127			}
128
129			/**
130			* Each UTF-8 head byte is followed by a certain number of tail bytes.
131			* This method returns an array of tail bytes
132			* @staticvar null|array $tail_bytes
133			* @return array
134			*/
135			protected function getTailBytes()
136			{
137			static $tail_bytes;
138
139			if (isset($tail_bytes)) {
140			return $tail_bytes;
141			}
142
143			$tail_bytes = array();
144
145			for ($n = 0; $n < 256; $n++) {
146			if ($n < 0xc0) {
147			$remaining = 0;
148			} elseif ($n < 0xe0) {
149			$remaining = 1;
150			} elseif ($n < 0xf0) {
151			$remaining = 2;
152			} elseif ($n < 0xf8) {
153			$remaining = 3;
154			} elseif ($n < 0xfc) {
155			$remaining = 4;
156			} elseif ($n < 0xfe) {
157			$remaining = 5;
158			} else {
159			$remaining = 0;
160			}
161
162			$tail_bytes[chr($n)] = $remaining;
163			}
164
165			return $tail_bytes;
166			}
167
168			/**
169			* Replaces a Unicode character using the transliteration database.
170			* @param string $ord An ordinal Unicode character code.
171			* @param string $unknown Replacement string for characters that do not have a suitable ASCII equivalent.
172			* @param string $langcode Optional ISO 639 language code
173			* @return string ASCII replacement character.
174			*/
175			protected function replace($ord, $unknown = '?', $langcode)
176			{
177			static $map = array();
178
179			$bank = $ord >> 8;
180			if (!isset($map[$bank][$langcode])) {
181
182			$file = __DIR__ . '/../data/' . sprintf('x%02x', $bank) . '.php';
183
184			if (file_exists($file)) {
185
186			$variant = $base = array();
187
188			include $file;
189
190			if ($langcode != 'en' && isset($variant[$langcode])) {
191			// Merge in language specific mappings.
192			$map[$bank][$langcode] = $variant[$langcode] + $base;
193			} else {
194			$map[$bank][$langcode] = $base;
195			}
196			} else {
197			$map[$bank][$langcode] = array();
198			}
199			}
200
201			$ord = $ord & 255;
202			return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
203			}
204
205			}
206

gplcart / transliterator

Transliterator A last analyzed 2018-03-10 14:47 UTC

Complexity

Size/Duplication

Coupling/Cohesion

Importance

4 Methods

Available Fixes

Duplication Side-by-Side

Filter issues like

Transliterator A
last analyzed 2018-03-10 14:47 UTC