| Conditions | 16 |
| Paths | 12 |
| Total Lines | 78 |
| Code Lines | 44 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method
If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
| 1 | <?php |
||
/**
 * Decodes the bytes of $this->raw into $this->chars.
 *
 * Each appended entry is a two-element array: [character string, integer code point].
 * The input is scanned byte by byte:
 *  - bytes below self::$spec[2]['start'] are stored unchanged as single-byte chars;
 *  - well-formed multibyte UTF-8 sequences are stored as-is with their decoded code point;
 *  - any byte that does not belong to a well-formed sequence is looked up in
 *    self::$winc1umap (falling back to its own byte value) and stored via
 *    self::cpToUtf8Char() (presumably the UTF-8 encoding of that code point;
 *    confirm against its definition).
 *
 * A sequence is rejected, and its bytes re-parsed one at a time as single bytes,
 * when a continuation byte is missing or the decoded value is overlong,
 * a surrogate (U+D800..U+DFFF), or above U+10FFFF.
 *
 * A BOM (U+FEFF) at offset 0 is dropped; anywhere else it is stored as
 * WORD JOINER (U+2060).
 *
 * Does nothing when $this->chars is already non-empty.
 *
 * @return void
 */
private function parse()
{
    if (!empty($this->chars)) {
        return;
    }

    $len = \strlen($this->raw);
    $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
    $invalid = false; // true while re-parsing the lead byte of a rejected sequence

    // State of the multibyte sequence currently being decoded. These are always
    // (re)assigned when a sequence starts; initialised here only so they are
    // never undefined.
    $bytes = 1;
    $cache = '';
    $ordcache = 0;
    $originOffset = 0;

    for ($offset = 0; $offset < $len; $offset++) {
        // Square-bracket offset: the curly-brace form ($str{$i}) was deprecated
        // in PHP 7.4 and is a parse error since PHP 8.0.
        $char = $this->raw[$offset];
        $ord = \ord($char);

        if ($inside === false) {
            $bytes = self::charLength($ord);

            if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                // valid UTF-8 multibyte start
                $inside = true;
                $cache = $char;
                // payload bits of the lead byte, shifted to the top of the code point
                $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                $originOffset = $offset;
            } elseif ($ord < self::$spec[2]['start']) {
                // ASCII 7-bit char
                $this->chars[] = [$char, $ord];
            } else {
                // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                // the rejected lead byte has been consumed; resume normal parsing
                $invalid = false;
            }
            continue;
        }

        // $inside === true, i.e. *should be* continuation character
        if (($ord & 0b11000000) !== 0b10000000) {
            // actually, it's not one, so now the whole UTF-8 char is invalid
            // go back and force it to parse as ISO or 1252
            $inside = false;
            $invalid = true;
            $offset = $originOffset - 1; // the loop's $offset++ lands back on the lead byte
            continue;
        }

        // put this byte's data where it needs to go
        $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
        $cache .= $char;

        if ($originOffset + ($bytes - 1) === $offset) {
            // we're done parsing this char, now let's verify
            $inside = false;

            // check for overlong, surrogate, too large, BOM, or C0/C1
            $overlong = ($ordcache < self::$spec[$bytes]['start']);
            // The mask must be parenthesised: `===` binds tighter than `&`, so the
            // unparenthesised form evaluated `$ordcache & false` and never matched.
            $surrogate = (($ordcache & 0xFFFFF800) === 0xD800);
            $toobig = ($ordcache > 0x10FFFF);

            if ($overlong || $surrogate || $toobig) {
                $invalid = true;
                $offset = $originOffset - 1;
                continue;
            }

            if ($ordcache === 0xFEFF) { // BOM
                if ($originOffset !== 0) {
                    // if not at beginning, store as word joiner U+2060
                    $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                }
                // otherwise discard
                continue;
            }

            // verification passed, now store it
            $this->chars[] = [$cache, $ordcache];
        }
    }
}
||
| 250 | } |
||
| 251 |