Transliterator   A
last analyzed

Complexity

Total Complexity 35

Size/Duplication

Total Lines 190
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 0

Importance

Changes 0
Metric Value
wmc 35
lcom 1
cbo 0
dl 0
loc 190
rs 9
c 0
b 0
f 0

4 Methods

Rating   Name   Duplication   Size   Complexity  
D transliterate() 0 90 19
A getAreas() 0 6 1
D getTailBytes() 0 32 9
B replace() 0 29 6
1
<?php
2
3
/**
4
 * @package Transliterator
5
 * @author Iurii Makukh
6
 * @copyright Copyright (c) 2017, Iurii Makukh
7
 * @license https://www.gnu.org/licenses/gpl-3.0.en.html GPL-3.0+
8
 */
9
10
namespace gplcart\modules\transliterator\helpers;
11
12
/**
13
 * Helper class for Transliterator module
14
 * Based on code from https://www.drupal.org/project/transliteration
15
 */
16
class Transliterator
{

    /**
     * Transliterates a string to ASCII.
     *
     * The input is split into pure-ASCII and non-ASCII areas. ASCII areas are
     * copied unchanged; non-ASCII areas are scanned byte by byte, every
     * well-formed UTF-8 sequence is decoded to its code point and passed to
     * replace(), and every malformed byte or truncated sequence is substituted
     * with $unknown.
     *
     * @param string $string Text to transliterate
     * @param string $unknown Replacement for invalid sequences and unmapped characters
     * @param null|string $source_langcode Optional language code forwarded to replace()
     * @return string
     */
    public function transliterate($string, $unknown = '?', $source_langcode = null)
    {
        // A string without any high-bit byte is plain ASCII and is returned
        // as is, which also avoids building the tail byte table.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        $tail_bytes = $this->getTailBytes();
        $areas = $this->getAreas($string);
        $result = '';

        foreach ($areas[0] as $str) {

            if ($str[0] < "\x80") {
                // ASCII area: nothing to decode, copy it through.
                $result .= $str;
                continue;
            }

            // Non-ASCII area: walk it byte by byte. $len counts the bytes
            // that are still unread (plus one, because the loop condition
            // pre-decrements) and $i is the index of the byte just read.
            $head = '';
            $len = strlen($str) + 1;

            for ($i = -1; --$len;) {
                $c = $str[++$i];

                if ($remaining = $tail_bytes[$c]) {
                    // UTF-8 head byte: collect the announced number of tail bytes.
                    $sequence = $head = $c;

                    do {
                        if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
                            // Legal tail byte.
                            $sequence .= $c;
                        } else {
                            if ($len == 0) {
                                // The area ended in the middle of a sequence:
                                // emit one replacement and stop scanning this area.
                                $result .= $unknown;
                                break 2;
                            } else {
                                // Illegal tail byte: abandon the sequence, then back
                                // up so the offending byte is examined again, since
                                // it may itself be ASCII or a new head byte.
                                $result .= $unknown;
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while (--$remaining);

                    // Every head byte accepted by getTailBytes() has a known
                    // sequence length, so getCodePoint() yields an integer here.
                    // The null check keeps the code point defined on every
                    // execution path instead of relying on that invariant.
                    $ord = $this->getCodePoint($sequence);
                    $result .= $ord === null ? $unknown : $this->replace($ord, $unknown, $source_langcode);
                    $head = '';
                } elseif ($c < "\x80") {
                    // ASCII byte.
                    $result .= $c;
                    $head = '';
                } elseif ($c < "\xc0") {
                    // Tail byte without a preceding head byte.
                    if ($head == '') {
                        $result .= $unknown;
                    }
                } else {
                    // Bytes 0xfe and 0xff never start a sequence.
                    $result .= $unknown;
                    $head = '';
                }
            }
        }

        return $result;
    }

    /**
     * Decodes a complete UTF-8 byte sequence (head byte plus tail bytes)
     * into its code point.
     *
     * The payload bits of the head byte are selected by the sequence length,
     * then each tail byte contributes its lower six bits.
     *
     * @param string $sequence Two to six bytes: one head byte followed by its tail bytes
     * @return null|int The code point, or null if the sequence length is not supported
     */
    protected function getCodePoint($sequence)
    {
        // Payload mask of the head byte, keyed by total sequence length.
        static $head_masks = array(2 => 0x1f, 3 => 0x0f, 4 => 0x07, 5 => 0x03, 6 => 0x01);

        $length = strlen($sequence);

        if (!isset($head_masks[$length])) {
            return null;
        }

        $ord = ord($sequence[0]) & $head_masks[$length];

        for ($i = 1; $i < $length; $i++) {
            $ord = ($ord << 6) | (ord($sequence[$i]) & 0x3f);
        }

        return $ord;
    }

    /**
     * Chops the text into pure-ASCII and non-ASCII areas.
     *
     * A non-ASCII area starts with a high-bit byte and continues over any
     * following bytes that are not ASCII letters.
     *
     * @param string $string
     * @return array Result of preg_match_all(); the areas are in element 0
     */
    protected function getAreas($string)
    {
        $matches = array();
        preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);
        return $matches;
    }

    /**
     * Each UTF-8 head byte is followed by a certain number of tail bytes.
     * This method returns a table mapping every possible byte (as a
     * one-character string) to the number of tail bytes it announces.
     * Bytes that are not head bytes map to 0.
     *
     * @staticvar null|array $tail_bytes Table built on the first call and reused afterwards
     * @return array
     */
    protected function getTailBytes()
    {
        static $tail_bytes;

        if (isset($tail_bytes)) {
            return $tail_bytes;
        }

        $tail_bytes = array();

        for ($n = 0; $n < 256; $n++) {
            if ($n < 0xc0) {
                // ASCII and tail bytes.
                $remaining = 0;
            } elseif ($n < 0xe0) {
                $remaining = 1;
            } elseif ($n < 0xf0) {
                $remaining = 2;
            } elseif ($n < 0xf8) {
                $remaining = 3;
            } elseif ($n < 0xfc) {
                $remaining = 4;
            } elseif ($n < 0xfe) {
                $remaining = 5;
            } else {
                // 0xfe and 0xff.
                $remaining = 0;
            }

            $tail_bytes[chr($n)] = $remaining;
        }

        return $tail_bytes;
    }

    /**
     * Replaces a Unicode character using the transliteration database.
     *
     * The code point is split into a bank (upper bits) and an index (lower
     * eight bits). The bank selects a data file in ../data/ which is included
     * once per bank and language code and cached for subsequent calls.
     *
     * @param int $ord An ordinal Unicode character code.
     * @param string $unknown Replacement string for characters that do not have a suitable ASCII equivalent.
     * @param null|string $langcode Optional ISO 639 language code
     * @return string ASCII replacement character.
     */
    protected function replace($ord, $unknown = '?', $langcode = null)
    {
        static $map = array();

        // A missing language code is cached under the empty string, which is
        // the key PHP derives from null anyway; the cast makes that explicit.
        $langcode = (string) $langcode;
        $bank = $ord >> 8;

        if (!isset($map[$bank][$langcode])) {

            $file = __DIR__ . '/../data/' . sprintf('x%02x', $bank) . '.php';

            if (file_exists($file)) {

                // Defaults in case the included file does not set them.
                $variant = $base = array();

                include $file;

                if ($langcode !== 'en' && isset($variant[$langcode])) {
                    // Merge in language specific mappings.
                    $map[$bank][$langcode] = $variant[$langcode] + $base;
                } else {
                    $map[$bank][$langcode] = $base;
                }
            } else {
                // No data for this bank: remember that to avoid repeated file checks.
                $map[$bank][$langcode] = array();
            }
        }

        $index = $ord & 255;
        return isset($map[$bank][$langcode][$index]) ? $map[$bank][$langcode][$index] : $unknown;
    }

}
206