Completed
Push — master ( f7fae0...1e758a )
by Garrett
02:45
created

UString::length()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 5
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
3
namespace StringObject;
4
5
/**
 * String object that exposes its content as a list of Unicode code points.
 *
 * The raw bytes are read from $this->raw, which is not declared in this class
 * (presumably inherited from AnyString -- confirm against the parent). They are
 * decoded lazily by parse() into $this->chars. Bytes that do not form valid
 * UTF-8 are reinterpreted one by one as Windows-1252 / ISO-8859-1 characters
 * instead of being dropped.
 *
 * NOTE(review): static analysis reported that AnyString declares abstract
 * methods (compareTo, escape, isAscii, ...) that are not implemented here.
 * Confirm against the parent class whether this class must be declared
 * abstract or still needs those implementations.
 */
class UString extends AnyString
{
    const NOT_NORMALIZED = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    /** @var array[] decoded characters, each stored as [UTF-8 bytes, code point]; filled by parse() */
    protected $chars = [];
    /** @var mixed not used anywhere in this class */
    protected $uhandler;
    /** @var int one of the normalization-form constants above */
    protected $normform = self::NOT_NORMALIZED;

    /**
     * Per sequence length (in bytes): the mask that extracts the payload bits
     * of the lead byte, and the smallest code point that legitimately needs
     * that many bytes (anything smaller is an overlong encoding).
     */
    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];

    /**
     * Maps the bytes 0x80-0x9F to code points when a byte has to be read as
     * Windows-1252; slots without a character map to U+FFFD.
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Splits the string into an array.
     *
     * - empty $delim: returns the decoded character list, i.e. one
     *   [UTF-8 bytes, code point] pair per character;
     * - integer $delim: chunks the raw string with str_split();
     * - any other $delim: explodes the raw string on it, honouring $limit
     *   when one is given.
     *
     * NOTE(review): the integer and string paths operate on the raw bytes,
     * not on the decoded characters. Also, empty() treats the string '0' as
     * "no delimiter" -- confirm both are intended.
     *
     * @param string|int $delim delimiter string, chunk length, or empty
     * @param int|null   $limit passed through to explode() when not null
     *
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        $this->parse();

        if (empty($delim)) {
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the UTF-8 bytes of the character at the given position.
     *
     * @param int $index zero-based position in the decoded character list
     *
     * @return string
     */
    public function charAt($index)
    {
        $this->parse();
        return $this->chars[$index][0];
    }

    /**
     * Returns the code point of the character at the given position.
     *
     * @param int $index zero-based position in the decoded character list
     *
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->parse();
        return $this->chars[$index][1];
    }

    /**
     * Not implemented yet: the body is empty, so this returns null.
     */
    public function detectForm()
    {

    }

    /**
     * Returns the number of decoded characters (not the number of bytes).
     *
     * @return int
     */
    public function length()
    {
        $this->parse();
        return \count($this->chars);
    }

    /**
     * Not implemented yet: the body is empty and $target is ignored.
     *
     * @param int $target one of the normalization-form constants
     */
    public function normalize($target = self::NFC)
    {

    }

    /**
     * Encodes a single code point as UTF-8.
     *
     * U+FEFF yields an empty string; surrogates (U+D800-U+DFFF) and values
     * above U+10FFFF yield the bytes of U+FFFD.
     *
     * @param int $cpt code point to encode
     *
     * @return string
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = [
                // The lead byte carries bits 18-20 of the code point. It used
                // to be hard-coded to 0xF4, which is only correct for
                // U+100000 and above.
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return \implode('', \array_map('chr', $data));
    }

    /**
     * Returns the sequence length announced by a lead byte.
     *
     * Bytes that do not match any multibyte lead pattern (ASCII as well as
     * continuation bytes) report a length of 1.
     *
     * @param integer $byte byte value, 0-255
     *
     * @return int a value from 1 to 6
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }

    /**
     * Decodes $this->raw into $this->chars; does nothing if $this->chars is
     * already populated.
     *
     * Well-formed UTF-8 sequences are stored as they are. When a sequence
     * turns out to be malformed (bad continuation byte, overlong form,
     * surrogate, or above U+10FFFF) the scan rewinds to its first byte and
     * that byte is converted on its own: 0x80-0x9F through the Windows-1252
     * table, everything else by using the byte value as the code point.
     * A BOM (U+FEFF) at offset 0 is dropped; elsewhere it is stored as
     * U+2060 (word joiner).
     */
    private function parse()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // set when the byte being revisited must not start a sequence

        // State of the multibyte sequence currently being read.
        $bytes = 1;
        $cache = '';
        $ordcache = 0;
        $originOffset = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // valid UTF-8 multibyte start
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's $offset++ lands on the lead byte again
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate, too large, BOM, or C0/C1
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                // Parentheses are required: === binds tighter than &.
                $surrogate = (($ordcache & 0xFFFFF800) === 0xD800);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
}
251