Completed
Push — master ( 34d315...a20938 )
by Garrett
09:58
created

UString   B

Complexity

Total Complexity 38

Size/Duplication

Total Lines 240
Duplicated Lines 6.25 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 2
Bugs 0 Features 0
Metric Value
wmc 38
c 2
b 0
f 0
lcom 1
cbo 1
dl 15
loc 240
rs 8.3999

8 Methods

Rating   Name   Duplication   Size   Complexity  
A toArray() 15 15 4
A charAt() 0 5 1
A charCodeAt() 0 5 1
A detectForm() 0 4 1
A normalize() 0 4 1
C cpToUtf8Char() 0 36 8
B charLength() 0 19 6
C parse() 0 78 16

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
3
namespace StringObject;
4
5
/**
 * String wrapper that lazily decodes its raw bytes into a list of characters.
 *
 * Decoding is done by parse(): well-formed UTF-8 multibyte sequences are kept
 * as they are, while bytes that do not form a valid sequence are converted
 * one by one (0x80-0x9F through the Windows-1252 table, everything else by
 * using the byte value as the code point).
 *
 * The raw byte string is read from $this->raw, which is not declared in this
 * class — presumably it is provided by AnyString; verify against the parent.
 *
 * NOTE(review): static analysis reported that this class leaves abstract
 * methods unimplemented (compareTo, escape, isAscii, isEmpty, nextToken,
 * remove, repeat, replace, replaceWhole, resetToken, times, translate, trim,
 * unescape, uuDecode, uuEncode). Confirm against AnyString whether the class
 * should be declared abstract or should implement them.
 */
class UString extends AnyString
{
    // Normalization form identifiers. NFKC and NFKD equal NFK combined
    // (bitwise OR) with NFC and NFD respectively.
    const NOT_NORMALIZED = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    /**
     * Decoded characters, filled by parse(). Each entry is a two-element
     * array: [0] the character as a UTF-8 byte string, [1] its code point.
     */
    protected $chars = [];

    // Not used by any method visible in this class.
    protected $uhandler;

    // Current normalization form; one of the constants above.
    protected $normform = self::NOT_NORMALIZED;

    /**
     * Per sequence length (in bytes): 'mask' selects the payload bits of the
     * lead byte, 'start' is the smallest code point that needs that many
     * bytes (anything below it is an overlong encoding).
     */
    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];

    /**
     * Windows-1252 bytes 0x80-0x9F mapped to Unicode code points.
     * Bytes with no assignment map to U+FFFD.
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Split the string into an array.
     *
     * - No delimiter (null, '', 0, false, []): returns the decoded character
     *   list, i.e. entries of the form [UTF-8 character, code point].
     * - Integer delimiter: splits the raw byte string into chunks of that
     *   many bytes (str_split works on bytes, not on decoded characters).
     * - Any other delimiter: explodes the raw byte string on it, honouring
     *   $limit when one is given.
     *
     * NOTE(review): static analysis reported this method as duplicated
     * elsewhere in the project; consider sharing one implementation.
     *
     * @param string|int $delim
     * @param int|null   $limit passed to explode() when not null
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        $this->parse();

        // empty() alone would also swallow the string '0', which is a
        // perfectly valid delimiter for explode().
        if (empty($delim) && $delim !== '0') {
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Return the character at the given position in the decoded list.
     *
     * @param int $index zero-based character index
     * @return string the character as a UTF-8 byte string
     */
    public function charAt($index)
    {
        $this->parse();
        return $this->chars[$index][0];
    }

    /**
     * Return the code point of the character at the given position.
     *
     * @param int $index zero-based character index
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->parse();
        return $this->chars[$index][1];
    }

    /**
     * Not implemented: the body is empty and nothing is returned.
     */
    public function detectForm()
    {

    }

    /**
     * Not implemented: the body is empty and nothing is returned.
     */
    public function normalize()
    {

    }

    /**
     * Encode a single code point as a UTF-8 byte string.
     *
     * Special cases: U+FEFF yields an empty string; surrogates
     * (U+D800..U+DFFF) and values above U+10FFFF yield the encoding of
     * U+FFFD.
     *
     * @param int $cpt code point
     * @return string
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111)
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = [
                // the lead byte carries bits 18-20 of the code point; it
                // must not be a fixed value (U+10000 starts with 0xF0)
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return implode(array_map('chr', $data));
    }

    /**
     * Number of bytes in the sequence announced by a lead byte.
     *
     * Returns 1 for every byte that does not match a 2- to 6-byte lead
     * pattern, which includes ASCII, continuation bytes, 0xFE and 0xFF.
     *
     * @param integer $byte byte value
     * @return int sequence length, 1 to 6
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }

    /**
     * Decode $this->raw into $this->chars, unless $chars is already filled.
     *
     * Walks the raw string byte by byte. A lead byte opens a multibyte
     * sequence; if any following byte is not a continuation byte, or the
     * finished code point is overlong, a surrogate or above U+10FFFF, the
     * loop rewinds to the lead byte and converts it as a single byte instead.
     * A BOM (U+FEFF) is dropped at offset 0 and stored as U+2060 elsewhere.
     */
    private function parse()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // true while re-reading a lead byte whose sequence was rejected

        // state of the multibyte sequence in progress; only meaningful
        // while $inside is true
        $bytes = 1;
        $cache = '';
        $ordcache = 0;
        $originOffset = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // valid UTF-8 multibyte start
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's ++ lands back on the lead byte
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate or too large
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                // explicit range test: "$x & MASK === VALUE" would evaluate
                // the === first and never detect a surrogate
                $surrogate = ($ordcache >= 0xD800 && $ordcache <= 0xDFFF);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
}
245