UStrObj::findUtf8CharAt() - Code Metrics - Inspection of "refactored; making UTF-8 a subclass of StrObj" - garrettw/stringobject - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 3daf45...3860d5 )

by Garrett

created 2015-12-04 23:08 UTC

UStrObj::findUtf8CharAt() C

↳ Parent: UStrObj

Complexity

Conditions	17
Paths	17

Size

Total Lines	69
Code Lines	34

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	69
rs	5.6439
cc	17
eloc	34
nc	17
nop	1

How to fix Long Method Complexity

<?php

namespace StringObject;

/**
 * String object that reads its raw byte string as UTF-8, falling back to a
 * Windows-1252 style single-byte interpretation for bytes that do not form a
 * well-formed UTF-8 sequence.
 *
 * The raw bytes are read from $this->raw, which is not declared in this class
 * (presumably inherited from StrObj -- confirm against the parent).
 */
class UStrObj extends StrObj
{
    /**
     * Lazily built cache of the characters in $this->raw. Each entry is either
     * a complete multi-byte sequence or a single byte.
     *
     * @var string[]
     */
    protected $chars = [];

    /**
     * Handler supplied to the constructor; stored but not used inside this class.
     *
     * @var mixed
     */
    protected $uhandler;

    /**
     * Bit mask selecting the payload bits of a lead byte, keyed by the total
     * sequence length in bytes.
     *
     * @var int[]
     */
    protected static $masks = [
        2 => 0b00011111,
        3 => 0b00001111,
        4 => 0b00000111,
    ];

    /**
     * Single byte value (128-159) => Unicode code point. Bytes 129, 141, 143,
     * 144 and 157 intentionally have no entry.
     *
     * @var int[]
     */
    protected static $winc1umap = [
        128 => 0x20AC,
        130 => 0x201A,
        131 => 0x0192,
        132 => 0x201E,
        133 => 0x2026,
        134 => 0x2020,
        135 => 0x2021,
        136 => 0x02C6,
        137 => 0x2030,
        138 => 0x0160,
        139 => 0x2039,
        140 => 0x0152,
        142 => 0x017D,
        145 => 0x2018,
        146 => 0x2019,
        147 => 0x201C,
        148 => 0x201D,
        149 => 0x2022,
        150 => 0x2013,
        151 => 0x2014,
        152 => 0x02DC,
        153 => 0x2122,
        154 => 0x0161,
        155 => 0x203A,
        156 => 0x0153,
        158 => 0x017E,
        159 => 0x0178,
    ];

    /**
     * Two-byte sequence 0xC2 0x80..0x9F (as one integer) => Unicode code point;
     * same targets as $winc1umap. Not referenced within this class.
     *
     * @var int[]
     */
    protected static $c1umap = [
        0xC280 => 0x20AC,
        0xC282 => 0x201A,
        0xC283 => 0x0192,
        0xC284 => 0x201E,
        0xC285 => 0x2026,
        0xC286 => 0x2020,
        0xC287 => 0x2021,
        0xC288 => 0x02C6,
        0xC289 => 0x2030,
        0xC28A => 0x0160,
        0xC28B => 0x2039,
        0xC28C => 0x0152,
        0xC28E => 0x017D,
        0xC291 => 0x2018,
        0xC292 => 0x2019,
        0xC293 => 0x201C,
        0xC294 => 0x201D,
        0xC295 => 0x2022,
        0xC296 => 0x2013,
        0xC297 => 0x2014,
        0xC298 => 0x02DC,
        0xC299 => 0x2122,
        0xC29A => 0x0161,
        0xC29B => 0x203A,
        0xC29C => 0x0153,
        0xC29E => 0x017E,
        0xC29F => 0x0178,
    ];

    /**
     * @param mixed $thing    Value forwarded unchanged to the parent constructor.
     * @param mixed $uhandler Handler stored in $this->uhandler.
     */
    public function __construct($thing, $uhandler)
    {
        parent::__construct($thing);
        $this->uhandler = $uhandler;
    }

    /**
     * Splits the string into an array.
     *
     * - Empty delimiter: returns the list of characters.
     * - Integer delimiter: returns byte chunks of that size (str_split on the raw bytes).
     * - Any other delimiter: explodes the raw bytes on it, honouring $limit when given.
     *
     * @param string|int $delim Delimiter, chunk size, or empty for per-character split.
     * @param int|null   $limit Optional limit passed to explode().
     * @return string[]
     */
    public function toArray($delim = '', $limit = null)
    {
        // empty('0') is true, but '0' is a perfectly valid delimiter to split on.
        if ($delim !== '0' && empty($delim)) {
            $this->loadToArray();
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the character at the given character index.
     *
     * @param int $index Zero-based index into the character list.
     * @return string
     */
    public function charAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index];
    }

    /**
     * Returns the code point of the character at the given character index.
     *
     * A single byte in 128-159 is translated through $winc1umap when it has an
     * entry; any other single byte is returned as its own value. Multi-byte
     * characters are decoded from their UTF-8 bit layout. No overlong or range
     * validation is applied to the decoded value.
     *
     * @param int $index Zero-based index into the character list.
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->loadToArray();
        $char = $this->chars[$index];
        $count = \strlen($char);

        if ($count === 1) {
            return self::winC1ToCodepoint(\ord($char));
        }

        $bytes = \array_map('ord', \str_split($char));

        // Lead byte contributes its low payload bits, each following byte six more.
        $codepoint = $bytes[0] & self::$masks[$count];
        for ($i = 1; $i < $count; $i++) {
            $codepoint = ($codepoint << 6) + ($bytes[$i] & 0b00111111);
        }

        return $codepoint;
    }

    /**
     * Fills $this->chars from $this->raw on first use.
     *
     * A lead byte followed by the right number of continuation bytes
     * (0x80-0xBF) becomes one entry; anything else is stored byte by byte.
     */
    private function loadToArray()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $offset = 0;
        while ($offset < $len) {
            // charLength() works on the integer byte value, not the one-byte string.
            $bytes = self::charLength(\ord($this->raw[$offset]));
            $valid = ($offset + $bytes <= $len);

            for ($pos = 1; $pos < $bytes && $valid; $pos++) {
                $ord = \ord($this->raw[$offset + $pos]);
                if ($ord < 0b10000000 || $ord > 0b10111111) {
                    // not a continuation byte
                    $valid = false;
                }
            }

            if ($bytes === 1 || !$valid) {
                $this->chars[] = $this->raw[$offset++];
                continue;
            }

            $this->chars[] = \substr($this->raw, $offset, $bytes);
            $offset += $bytes;
        }
    }

    /**
     * Decodes the character that covers the byte at $offset.
     *
     * @param int $offset Byte offset into $this->raw.
     * @return array [start offset, length in bytes, code point]
     */
    protected function parseUtf8CharAt($offset)
    {
        list($start, $length, $valid, $current) = $this->findUtf8CharAt($offset);

        if ($length === 1) {
            return [$start, $length, self::winC1ToCodepoint($current)];
        }

        $byte = \ord($this->raw[$start]);

        if (!$valid) {
            if ($length === 2 && $byte === 0b11000000) {
                // overlong ascii
                return [$start + 1, 1, ($offset === $start) ? \ord($this->raw[$start + 1]) : $byte];
            }
            return [$offset, 1, $current];
        }

        // A valid result with length > 1 always has a length of 2, 3 or 4.
        $bigcode = $byte & self::$masks[$length];
        for ($next = 1; $next < $length; $next++) {
            $bigcode <<= 6;
            $bigcode += \ord($this->raw[$start + $next]) & 0b00111111;
        }

        if ($bigcode > 0x10FFFF) {
            // beyond the Unicode range: fall back to the single byte at $offset
            return [$offset, 1, $current];
        }
        return [$start, $length, $bigcode];
    }

    /**
     * Determines if the byte at the given offset is part of a valid UTF8 char,
     * and returns its actual starting offset, length in bytes, validity,
     * and the byte at the original offset.
     *
     * @param int $offset Byte offset into $this->raw.
     * @return array [start offset, length in bytes, bool valid, int byte at $offset]
     */
    protected function findUtf8CharAt($offset)
    {
        $byte = \ord($this->raw[$offset]);

        if ($byte <= 0b01111111) {
            // ASCII passthru, 1 byte long
            return [$offset, 1, true, $byte];
        }

        if ($byte <= 0b10111111) {
            // either part of a UTF8 char, or an invalid UTF8 codepoint.
            // try to find start of UTF8 char
            $original = $offset;
            while ($offset > 0 && $original - $offset < 4) {
                $prev = \ord($this->raw[--$offset]);

                if ($prev <= 0b01111111) {
                    // prev is plain ASCII so current char can't be valid
                    return [$original, 1, false, $byte];
                }

                if ($prev <= 0b10111111) {
                    // prev is also part of a UTF8 char, so keep looking
                    continue;
                }

                if ($prev === 0xC0 || $prev === 0xC1) {
                    // prev is an invalid UTF8 starter for overlong ASCII
                    return [$offset, 2, false, $byte];
                }

                if ($prev <= 0b11110100) {
                    // prev is valid start byte, validate length to check this char
                    $length = self::charLength($prev);

                    if ($original < $offset + $length) {
                        return [$offset, $length, true, $byte];
                    }
                }
                return [$original, 1, false, $byte];
            }
            return [$original, 1, false, $byte];
        }

        if ($byte <= 0b11110100) {
            // valid UTF8 start byte, find the rest, determine if length is valid
            $actual = $length = self::charLength($byte);
            $total = $this->length();

            for ($i = 1; $i < $length; $i++) {
                if ($offset + $i >= $total) {
                    // string ends early: exactly $i bytes of the sequence exist
                    $actual = $i;
                    break;
                }
                $last = \ord($this->raw[$offset + $i]);
                if ($last < 0b10000000 || $last > 0b10111111) {
                    $actual = $i;
                    break;
                }
            }

            if ($actual !== $length) {
                return [$offset, $actual, false, $byte];
            }
            return [$offset, $length, true, $byte];
        }

        // if 245 to 255, Windows-1252 passthru
        return [$offset, 1, false, $byte];
    }

    /**
     * Returns the sequence length announced by a lead byte's high bits:
     * 11110xxx => 4, 1110xxxx => 3, 110xxxxx => 2, anything else => 1.
     *
     * @param integer $byte Byte value (0-255).
     * @return integer
     */
    protected static function charLength($byte)
    {
        if ($byte >> 3 === 0b00011110) {
            return 4;
        }
        if ($byte >> 4 === 0b00001110) {
            return 3;
        }
        if ($byte >> 5 === 0b00000110) {
            return 2;
        }
        return 1;
    }

    /**
     * Translates a single byte through $winc1umap, returning the byte itself
     * when the map has no entry for it.
     *
     * @param integer $byte Byte value (0-255).
     * @return integer
     */
    private static function winC1ToCodepoint($byte)
    {
        return isset(self::$winc1umap[$byte]) ? self::$winc1umap[$byte] : $byte;
    }
}


1		<?php
2
3		namespace StringObject;
4
5		class UStrObj extends StrObj
6		{
7		protected $chars = [];
8		protected $uhandler;
9
10		protected static $masks = [
11		2 => 0b00011111,
12		3 => 0b00001111,
13		4 => 0b00000111,
14		];
15		protected static $winc1umap = [
16		128 => 0x20AC,
17		130 => 0x201A,
18		131 => 0x0192,
19		132 => 0x201E,
20		133 => 0x2026,
21		134 => 0x2020,
22		135 => 0x2021,
23		136 => 0x02C6,
24		137 => 0x2030,
25		138 => 0x0160,
26		139 => 0x2039,
27		140 => 0x0152,
28		142 => 0x017D,
29		145 => 0x2018,
30		146 => 0x2019,
31		147 => 0x201C,
32		148 => 0x201D,
33		149 => 0x2022,
34		150 => 0x2013,
35		151 => 0x2014,
36		152 => 0x02DC,
37		153 => 0x2122,
38		154 => 0x0161,
39		155 => 0x203A,
40		156 => 0x0153,
41		158 => 0x017E,
42		159 => 0x0178,
43		];
44
45		protected static $c1umap = [
46		0xC280 => 0x20AC,
47		0xC282 => 0x201A,
48		0xC283 => 0x0192,
49		0xC284 => 0x201E,
50		0xC285 => 0x2026,
51		0xC286 => 0x2020,
52		0xC287 => 0x2021,
53		0xC288 => 0x02C6,
54		0xC289 => 0x2030,
55		0xC28A => 0x0160,
56		0xC28B => 0x2039,
57		0xC28C => 0x0152,
58		0xC28E => 0x017D,
59		0xC291 => 0x2018,
60		0xC292 => 0x2019,
61		0xC293 => 0x201C,
62		0xC294 => 0x201D,
63		0xC295 => 0x2022,
64		0xC296 => 0x2013,
65		0xC297 => 0x2014,
66		0xC298 => 0x02DC,
67		0xC299 => 0x2122,
68		0xC29A => 0x0161,
69		0xC29B => 0x203A,
70		0xC29C => 0x0153,
71		0xC29E => 0x017E,
72		0xC29F => 0x0178,
73		];
74
75		public function __construct($thing, $uhandler)
76		{
77		parent::__construct($thing);
78		$this->uhandler = $uhandler;
79		}
80
81	View Code Duplication	public function toArray($delim = '', $limit = null)
		0 ignored issues – show Duplication introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report This method seems to be duplicated in your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
82		{
83		$this->loadToArray();
84
85		if (empty($delim)) {
86		return $this->chars;
87		}
88		if (is_int($delim)) {
89		return \str_split($this->raw, $delim);
90		}
91		if ($limit === null) {
92		return \explode($delim, $this->raw);
93		}
94		return \explode($delim, $this->raw, $limit);
95		}
96
97		public function charAt($index)
		0 ignored issues – show Documentation introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report The return type could not be reliably inferred; please add a `@return` annotation. Our type inference engine is quite powerful, but sometimes the code does not provide enough clues to go by. In these cases we request you to add a `@return` annotation as described here. Loading history...
98		{
99		$this->loadToArray();
100		return $this->chars[$index];
101		}
102
103		public function charCodeAt($index)
		0 ignored issues – show Documentation introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report The return type could not be reliably inferred; please add a `@return` annotation. Our type inference engine is quite powerful, but sometimes the code does not provide enough clues to go by. In these cases we request you to add a `@return` annotation as described here. Loading history...
104		{
105		$this->loadToArray();
106		$bytes = \array_map('ord', \str_split($this->chars[$index]));
107		$count = \strlen($this->chars[$index]);
108
109		if ($count === 1) {
110		if ($bytes[0] > 0b01111111 && $bytes[0] < 0b10100000) {
111		return self::$winc1umap[$bytes[0]];
112		}
113		return $bytes[0];
114		}
115
116		$overlong = false;
		0 ignored issues – show Unused Code introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report `$overlong` is not used, you could remove the assignment. This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently. $myVar = 'Value'; $higher = false; if (rand(1, 6) > 3) { $higher = true; } else { $higher = false; } Both the `$myVar` assignment in line 1 and the `$higher` assignment in line 2 are dead. The first because `$myVar` is never used and the second because `$higher` is always overwritten for every possible time line. Loading history...
117
118		foreach ($bytes as $i => $data) {
119		if ($i === 0) {
120		$codepoint = ($data & self::$masks[$count]);
121		$overlong = ($codepoint === 0);
		0 ignored issues – show Unused Code introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report `$overlong` is not used, you could remove the assignment. This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently. $myVar = 'Value'; $higher = false; if (rand(1, 6) > 3) { $higher = true; } else { $higher = false; } Both the `$myVar` assignment in line 1 and the `$higher` assignment in line 2 are dead. The first because `$myVar` is never used and the second because `$higher` is always overwritten for every possible time line. Loading history...
122		continue;
123		}
124		$codepoint <<= 6;
		0 ignored issues – show Bug introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report The variable `$codepoint` does not seem to be defined for all execution paths leading up to this point. If you define a variable conditionally, it can happen that it is not defined for all execution paths. Let’s take a look at an example: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } // $x is potentially undefined here. echo $x; } In the above example, the variable `$x` is defined if you pass “foo” or “bar” as argument for `$a`. However, since the `switch` statement has no default case statement, if you pass any other value, the variable `$x` would be undefined. Available Fixes Check for existence of the variable explicitly: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } if (isset($x)) { // Make sure it's always set. echo $x; } } Define a default value for the variable: function myFunction($a) { $x = ''; // Set a default which gets overridden for certain paths. switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } echo $x; } Add a value for the missing path: function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; // We add support for the missing case. default: $x = ''; break; } echo $x; } Loading history...
125		$codepoint += $data & 0b00111111;
126		}
127
128		if ($codepoint > 0x10FFFF) {
		0 ignored issues – show Unused Code introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report This `if` statement is empty and can be removed. This check looks for the bodies of `if` statements that have no statements or where all statements have been commented out. This may be the result of changes for debugging or the code may simply be obsolete. These `if` bodies can be removed. If you have an empty if but statements in the `else` branch, consider inverting the condition. if (rand(1, 6) > 3) { //print "Check failed"; } else { print "Check succeeded"; } could be turned into if (rand(1, 6) <= 3) { print "Check succeeded"; } This is much more concise to read. Loading history...
129		// invalid
130		}
131
132		return $codepoint;
133		}
134
135		private function loadToArray()
136		{
137		if (!empty($this->chars)) {
138		return;
139		}
140
141		$offset = 0;
142		$len = \strlen($this->raw);
143		while ($offset < $len) {
144		$data = $this->raw{$offset};
145		$bytes = self::charLength($data);
146		$valid = ($offset + $bytes <= $len);
147
148		for ($pos = 2; $pos <= $bytes && $valid === true; $pos++) {
149		$byte = $this->raw{$offset + $pos - 1};
150		$ord = \ord($byte);
151
152		if ($ord < 128 && $ord > 191) {
153		$valid = false;
154		}
155		$data .= $byte;
156		}
157
158		if ($bytes === 1 || $valid === false) {
159		$this->chars[] = $this->raw{$offset++};
160		continue;
161		}
162
163		$this->chars[] = $data;
164		$offset += $bytes;
165		}
166		}
167
168		protected function parseUtf8CharAt($offset)
169		{
170		list($start, $length, $valid, $current) = $this->findUtf8CharAt($offset);
171
172		if ($length === 1) {
173		if ($current > 0b01111111 && $current < 0b10100000) {
174		return [$start, $length, self::$winc1umap[$current]];
175		}
176		return [$start, $length, $current];
177		}
178
179		$byte = \ord($this->raw{$start});
180
181		if ($valid === false) {
182		if ($length === 2 && $byte === 0b11000000) {
183		// overlong ascii
184		return [$start + 1, 1, ($offset === $start) ? \ord($this->raw{$start + 1}) : $byte];
185		}
186		return [$offset, 1, $current];
187		}
188
189		if ($valid === true) {
190		$bigcode = $byte & 0b00011111;
191
192		if ($length === 3) {
193		$bigcode = $byte & 0b00001111;
194		} elseif ($length === 4) {
195		$bigcode = $byte & 0b00000111;
196		}
197
198		for ($next = 1; $next < $length; $next++) {
199		$bigcode <<= 6;
200		$bigcode += \ord($this->raw{$start + $next}) & 0b00111111;
201		}
202
203		if ($bigcode > 0x10FFFF) {
204		return [$offset, 1, $current];
205		}
206		return [$start, $length, $bigcode];
207		}
208		}
209
210		/**
211		* Determines if the byte at the given offset is part of a valid UTF8 char,
212		* and returns its actual starting offset, length in bytes, validity,
213		* and the byte at the original offset.
214		*/
215		protected function findUtf8CharAt($offset)
216		{
217		$byte = \ord($this->raw{$offset});
218
219		if ($byte <= 0b01111111) {
220		// ASCII passthru, 1 byte long
221		return [$offset, 1, true, $byte];
222		}
223
224		if ($byte <= 0b10111111) {
225		// either part of a UTF8 char, or an invalid UTF8 codepoint.
226		// try to find start of UTF8 char
227		$original = $offset;
228		while ($offset > 0 && $original - $offset < 4) {
229		$prev = \ord($this->raw{--$offset});
230
231		if ($prev <= 0b01111111) {
232		// prev is plain ASCII so current char can't be valid
233		return [$original, 1, false, $byte];
234		}
235
236		if ($prev <= 0b10111111) {
237		// prev is also part of a UTF8 char, so keep looking
238		continue;
239		}
240
241		if ($prev == 0xC0 || $prev == 0xC1) {
242		// prev is an invalid UTF8 starter for overlong ASCII
243		return [$offset, 2, false, $byte];
244		}
245
246		if ($prev <= 0b11110100) {
247		// prev is valid start byte, validate length to check this char
248		$length = self::charLength($prev);
249
250		if ($original < $offset + $length) {
251		return [$offset, $length, true, $byte];
252		}
253		}
254		return [$original, 1, false, $byte];
255		}
256		return [$original, 1, false, $byte];
257		}
258
259		if ($byte <= 0b11110100) {
260		// valid UTF8 start byte, find the rest, determine if length is valid
261		$actual = $length = self::charLength($byte);
262
263		for ($i = 1; $i < $length; $i++) {
264		if ($offset + $i >= $this->length()) {
265		$actual = $i - 1;
266		break;
267		}
268		$last = \ord($this->raw{$offset + $i});
269		if ($last < 0b10000000 || $last > 0b10111111) {
270		$actual = $i;
271		break;
272		}
273		}
274
275		if ($actual !== $length) {
276		return [$offset, $actual, false, $byte];
277		}
278		return [$offset, $length, true, $byte];
279		}
280
281		// if 245 to 255, Windows-1252 passthru
282		return [$offset, 1, false, $byte];
283		}
284
285		/**
286		* @param integer $byte
287		*/
288		protected static function charLength($byte)
289		{
290		if ($byte >> 3 === 0b00011110) {
291		return 4;
292		}
293		if ($byte >> 4 === 0b00001110) {
294		return 3;
295		}
296		if ($byte >> 5 === 0b00000110) {
297		return 2;
298		}
299		return 1;
300		}
301		}
302

garrettw / stringobject

Push — master ( 3daf45...3860d5 )

UStrObj::findUtf8CharAt() C

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Available Fixes

Duplication Side-by-Side

Filter issues like