Completed
Push — master ( a47b74...34d315 )
by Garrett
02:18
created

UStrObj::cpToUtf8Char()   D

Complexity

Conditions 9
Paths 7

Size

Total Lines 36
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 5
Bugs 0 Features 0
Metric Value
c 5
b 0
f 0
dl 0
loc 36
rs 4.909
nc 7
cc 9
eloc 23
nop 1
1
<?php
2
3
namespace StringObject;
4
5
class UStrObj extends AnyStrObj
0 ignored issues
show
Bug introduced by
There is at least one abstract method in this class. Maybe declare it as abstract, or implement the remaining methods: compareTo, escape, isAscii, isEmpty, nextToken, remove, repeat, replace, resetToken, times, translate, trim, unescape, uuDecode, uuEncode
Loading history...
6
{
7
    // Identifiers for Unicode normalization state. Numerically NFKC equals
    // NFK | NFC (4 | 1) and NFKD equals NFK | NFD (4 | 2).
    // NOTE(review): presumably NFK is meant as a "compatibility" bit flag
    // combined with NFC/NFD rather than a form of its own; confirm.
    const NOT_NORMALIZED = 0;
8
    const NFC = 1;
9
    const NFD = 2;
10
    const NFK = 4;
11
    const NFKC = 5;
12
    const NFKD = 6;
13
14
    // Decoded characters; filled lazily by loadToArray() with
    // [UTF-8 string, integer code point] pairs.
    protected $chars = [];
15
    // Not referenced by any method visible in this part of the class.
    protected $uhandler;
16
    // Defaults to NOT_NORMALIZED; presumably holds one of the constants
    // above (no visible method changes it; confirm against the rest of the project).
    protected $normform = self::NOT_NORMALIZED;
17
18
    // Per-length parameters for multibyte UTF-8 sequences, keyed by the
    // sequence length in bytes:
    //   'mask'  - selects the payload bits of the lead byte (loadToArray()
    //             ANDs the lead byte with it);
    //   'start' - lowest code point accepted for that length; loadToArray()
    //             treats a decoded value below it as overlong.
    // Lengths 5 and 6 are listed because charLength() can return them.
    protected static $spec = [
19
        2 => ['mask' => 0b00011111, 'start' => 0x80],
20
        3 => ['mask' => 0b00001111, 'start' => 0x800],
21
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
22
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
23
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
24
    ];
25
    // Maps single bytes 0x80-0x9F to Unicode code points. loadToArray()
    // consults it for bytes that are not part of a valid UTF-8 sequence
    // (its comment describes this as mapping from cp1252). Entries marked
    // "invalid" map to U+FFFD, the replacement character.
    protected static $winc1umap = [
26
        0x80 => 0x20AC,
27
        0x81 => 0xFFFD, // invalid
28
        0x82 => 0x201A,
29
        0x83 => 0x0192,
30
        0x84 => 0x201E,
31
        0x85 => 0x2026,
32
        0x86 => 0x2020,
33
        0x87 => 0x2021,
34
        0x88 => 0x02C6,
35
        0x89 => 0x2030,
36
        0x8A => 0x0160,
37
        0x8B => 0x2039,
38
        0x8C => 0x0152,
39
        0x8D => 0xFFFD, // invalid
40
        0x8E => 0x017D,
41
        0x8F => 0xFFFD, // invalid
42
        0x90 => 0xFFFD, // invalid
43
        0x91 => 0x2018,
44
        0x92 => 0x2019,
45
        0x93 => 0x201C,
46
        0x94 => 0x201D,
47
        0x95 => 0x2022,
48
        0x96 => 0x2013,
49
        0x97 => 0x2014,
50
        0x98 => 0x02DC,
51
        0x99 => 0x2122,
52
        0x9A => 0x0161,
53
        0x9B => 0x203A,
54
        0x9C => 0x0153,
55
        0x9D => 0xFFFD, // invalid
56
        0x9E => 0x017E,
57
        0x9F => 0x0178,
58
    ];
59
60 View Code Duplication
    /**
     * Convert the string to an array.
     *
     * - Empty delimiter (the default): returns the decoded character list,
     *   i.e. $this->chars, whose entries are [UTF-8 string, code point] pairs.
     * - Integer delimiter: returns \str_split() chunks of that many BYTES of
     *   the raw string (multibyte characters may be cut in the middle).
     * - Any other delimiter: returns \explode() of the raw string, honouring
     *   $limit when it is not null.
     *
     * The string '0' is a valid delimiter; it is not treated as "empty".
     *
     * NOTE(review): static analysis reports this method as duplicated
     * elsewhere in the project; consider extracting a shared implementation.
     *
     * @param string|int $delim Delimiter string, chunk size in bytes, or empty.
     * @param int|null   $limit Optional limit forwarded to \explode().
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        $this->loadToArray();

        // empty('0') is true in PHP, but '0' is a perfectly good delimiter,
        // so it must not fall into the "no delimiter" branch.
        if (empty($delim) && $delim !== '0') {
            return $this->chars;
        }
        if (\is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }
75
76
    /**
     * Return the character at the given position of the decoded string.
     *
     * Triggers decoding of the raw string on first use.
     *
     * @param int $index Position within the decoded character list.
     * @return string The character as a UTF-8 string.
     */
    public function charAt($index)
    {
        $this->loadToArray();

        // each entry is a [character, code point] pair
        $entry = $this->chars[$index];

        return $entry[0];
    }
84
85
    /**
     * Return the code point of the character at the given position of the
     * decoded string.
     *
     * Triggers decoding of the raw string on first use.
     *
     * @param int $index Position within the decoded character list.
     * @return int The character's code point.
     */
    public function charCodeAt($index)
    {
        $this->loadToArray();

        // each entry is a [character, code point] pair
        $entry = $this->chars[$index];

        return $entry[1];
    }
93
94
    /**
     * Encode a Unicode code point as a UTF-8 byte string.
     *
     * Special cases:
     *  - U+FEFF (byte order mark) is encoded as the empty string;
     *  - surrogates (U+D800..U+DFFF), values above U+10FFFF and negative
     *    values are encoded as U+FFFD, the replacement character.
     *
     * @param int $cpt Code point to encode.
     * @return string The UTF-8 encoding of $cpt ('' for U+FEFF).
     */
    protected static function cpToUtf8Char($cpt)
    {
        // a negative value is not a code point; chr() would silently wrap it
        if ($cpt < 0) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        // 7-bit ASCII is encoded as itself
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            // two bytes: 110xxxxx 10xxxxxx
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The lead byte must carry bits 18-20 of the code point; a fixed
            // 0xF4 is only correct for U+100000..U+10FFFF.
            $data = [
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return \implode('', \array_map('chr', $data));
    }
133
    /**
     * Determine how many bytes a UTF-8 sequence occupies, judging by its
     * first byte.
     *
     * Lead-byte patterns for 2 to 6 byte sequences are recognised; every
     * other byte value (ASCII, continuation bytes, 0xFE, 0xFF) yields 1.
     *
     * @param int $byte Value of the first byte (0-255).
     * @return int Sequence length in bytes, from 1 to 6.
     */
    protected static function charLength($byte)
    {
        // [mask, expected bits after masking, sequence length], longest first
        $signatures = [
            [0b11111110, 0b11111100, 6],
            [0b11111100, 0b11111000, 5],
            [0b11111000, 0b11110000, 4],
            [0b11110000, 0b11100000, 3],
            [0b11100000, 0b11000000, 2],
        ];

        foreach ($signatures as $signature) {
            if (($byte & $signature[0]) === $signature[1]) {
                return $signature[2];
            }
        }

        return 1;
    }
155
156
    /**
     * Decode $this->raw into $this->chars.
     *
     * Does nothing when $this->chars is already non-empty. Otherwise walks
     * the raw bytes and appends one [UTF-8 string, code point] pair per
     * character:
     *  - ASCII bytes are stored as they are;
     *  - well-formed multibyte UTF-8 sequences are stored unchanged with
     *    their decoded code point;
     *  - a sequence that is truncated, has a bad continuation byte, is
     *    overlong, encodes a surrogate or exceeds U+10FFFF is rejected: its
     *    first byte is re-read as a single byte, mapped through $winc1umap
     *    when listed there (otherwise its byte value is used as the code
     *    point), and decoding resumes with the following byte;
     *  - U+FEFF is dropped at the very start of the string and stored as
     *    U+2060 (word joiner) anywhere else.
     *
     * @return void
     */
    private function loadToArray()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // set when the sequence starting at the current byte was rejected

        // state of the multibyte sequence currently being decoded
        $bytes = 1;
        $cache = '';
        $ordcache = 0;
        $originOffset = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            // bracket offsets work on every PHP version; the curly-brace
            // form was removed in PHP 8.0
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // valid UTF-8 multibyte start
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's ++ lands on the lead byte again
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate, too large
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                // explicit range test: "$x & MASK === VALUE" would compare
                // first (=== binds tighter than &) and never match
                $surrogate = ($ordcache >= 0xD800 && $ordcache <= 0xDFFF);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1; // re-read the lead byte as a single byte
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
234
}
235