UString::parse() - Code Metrics - Inspection of "more scrutinizer cleanup" - garrettw/stringobject - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f7fae0...1e758a )

by Garrett

created 2016-05-12 14:10 UTC

UString::parse() C

↳ Parent: UString

Complexity

Conditions	16
Paths	12

Size

Total Lines	78
Code Lines	44

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	78
rs	5.142
cc	16
eloc	44
nc	12
nop	0

How to fix Long Method Complexity

<?php

namespace StringObject;

/**
 * Unicode-aware string that lazily decodes its raw bytes into characters.
 *
 * The raw byte string is read from $this->raw (not declared in this class;
 * presumably inherited from AnyString). On first use it is decoded into a
 * list of [UTF-8 character, code point] pairs. Bytes that do not form
 * well-formed UTF-8 are reinterpreted one byte at a time: 0x80-0x9F through
 * the Windows-1252 map below, anything else as the code point equal to the
 * byte value.
 */
class UString extends AnyString
{
    // Normalization form identifiers. NFKC and NFKD equal NFK | NFC and
    // NFK | NFD, so NFK presumably acts as a "compatibility" flag bit.
    const NOT_NORMALIZED = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    /** @var array[] Decoded characters as [UTF-8 bytes, code point] pairs; filled by parse(). */
    protected $chars = [];

    /** @var mixed Not read or written by any code in this class. */
    protected $uhandler;

    /** @var int One of the normalization constants; never changed by code in this class. */
    protected $normform = self::NOT_NORMALIZED;

    /**
     * Keyed by sequence length in bytes. 'mask' selects the payload bits of
     * the lead byte; 'start' is the smallest code point that needs that many
     * bytes (a decoded value below it is an overlong encoding).
     *
     * @var array[]
     */
    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];

    /**
     * Code points used for stray bytes 0x80-0x9F (cp1252 interpretation).
     * Bytes marked "invalid" map to U+FFFD, the replacement character.
     *
     * @var int[]
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Splits the string into an array.
     *
     * - No delimiter (any empty value other than the string '0'): returns the
     *   decoded character list, each entry being [UTF-8 bytes, code point].
     * - Integer delimiter: returns chunks of that many BYTES of the raw
     *   string (str_split), so multibyte characters may be cut; $limit is
     *   ignored in this mode.
     * - Otherwise: explodes the raw string on $delim, honouring $limit when
     *   it is not null.
     *
     * @param string|int $delim
     * @param int|null   $limit
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        // '0' is a legitimate delimiter even though empty('0') is true.
        if ($delim !== '0' && empty($delim)) {
            $this->parse();
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the UTF-8 encoded character at the given character index.
     *
     * @param int $index Zero-based index into the decoded character list.
     * @return string
     */
    public function charAt($index)
    {
        $this->parse();
        return $this->chars[$index][0];
    }

    /**
     * Returns the code point of the character at the given character index.
     *
     * @param int $index Zero-based index into the decoded character list.
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->parse();
        return $this->chars[$index][1];
    }

    /**
     * Not implemented yet: the body is empty, so this returns null.
     */
    public function detectForm()
    {

    }

    /**
     * Returns the number of decoded characters (not bytes).
     *
     * @return int
     */
    public function length()
    {
        $this->parse();
        return \count($this->chars);
    }

    /**
     * Not implemented yet: the body is empty and $target is ignored, so this
     * returns null.
     *
     * @param int $target One of the normalization constants of this class.
     */
    public function normalize($target = self::NFC)
    {

    }

    /**
     * Encodes a single code point as a UTF-8 byte string.
     *
     * U+FEFF (BOM) yields an empty string; surrogates (U+D800-U+DFFF) and
     * values above U+10FFFF yield the encoding of U+FFFD.
     *
     * @param int $cpt Code point to encode.
     * @return string
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = [
                // The lead byte carries bits 18-20 of the code point; a fixed
                // 0xF4 would only be right for U+100000 and above.
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return implode(array_map('chr', $data));
    }

    /**
     * Returns the sequence length announced by a lead byte's high bits.
     *
     * Any byte that does not match a 2- to 6-byte lead pattern (ASCII,
     * continuation bytes, 0xFE, 0xFF) reports a length of 1.
     *
     * @param integer $byte Byte value, 0-255.
     * @return int
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }

    /**
     * Tells whether a fully decoded multibyte sequence must be rejected:
     * overlong encoding, surrogate code point, or beyond U+10FFFF.
     *
     * @param int $cpt   Code point assembled from the sequence.
     * @param int $bytes Length of the sequence in bytes (2-6).
     * @return bool
     */
    private static function isRejectedCodePoint($cpt, $bytes)
    {
        if ($cpt < self::$spec[$bytes]['start']) {
            return true; // overlong
        }
        if ($cpt >= 0xD800 && $cpt <= 0xDFFF) {
            return true; // surrogate
        }
        return $cpt > 0x10FFFF; // too big
    }

    /**
     * Decodes $this->raw into $this->chars, unless $this->chars is already
     * non-empty.
     *
     * Well-formed UTF-8 sequences are stored as-is. When a sequence turns
     * out to be malformed, the scan rewinds to its first byte and that byte
     * is stored through the single-byte fallback instead. A BOM (U+FEFF) is
     * dropped at offset 0 and replaced by U+2060 (word joiner) elsewhere.
     */
    private function parse()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // set after a rewind: forces the lead byte down the fallback path
        $bytes = 1;
        $cache = '';
        $ordcache = 0;
        $originOffset = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // plausible UTF-8 multibyte start with enough bytes left
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // stray high byte: map 0x80-0x9F from cp1252, otherwise
                    // take the byte value itself as the code point
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // it is not one, so the whole sequence is invalid;
                // go back and force its first byte through the fallback
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's ++ lands on $originOffset
                continue;
            }

            // put this byte's 6 payload bits where they need to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) !== $offset) {
                // more continuation bytes expected
                continue;
            }

            // we're done parsing this char, now let's verify
            $inside = false;

            if (self::isRejectedCodePoint($ordcache, $bytes)) {
                $invalid = true;
                $offset = $originOffset - 1;
                continue;
            }

            if ($ordcache === 0xFEFF) { // BOM
                if ($originOffset !== 0) {
                    // if not at beginning, store as word joiner U+2060
                    $this->chars[] = ["\xE2\x81\xA0", 0x2060];
                }
                // otherwise discard
                continue;
            }

            // verification passed, now store it
            $this->chars[] = [$cache, $ordcache];
        }
    }
}


1		<?php
2
3		namespace StringObject;
4
5		class UString extends AnyString
		0 ignored issues – show Bug introduced 2016-03-20 22:38 UTC by Report Bug Copy Issue Report There is at least one abstract method in this class. Maybe declare it as abstract, or implement the remaining methods: compareTo, escape, isAscii, isEmpty, nextToken, remove, repeat, replace, resetToken, times, translate, trim, unescape, uuDecode, uuEncode Loading history...
6		{
7		const NOT_NORMALIZED = 0;
8		const NFC = 1;
9		const NFD = 2;
10		const NFK = 4;
11		const NFKC = 5;
12		const NFKD = 6;
13
14		protected $chars = [];
15		protected $uhandler;
16		protected $normform = self::NOT_NORMALIZED;
17
18		protected static $spec = [
19		2 => ['mask' => 0b00011111, 'start' => 0x80],
20		3 => ['mask' => 0b00001111, 'start' => 0x800],
21		4 => ['mask' => 0b00000111, 'start' => 0x10000],
22		5 => ['mask' => 0b00000011, 'start' => 0x200000],
23		6 => ['mask' => 0b00000001, 'start' => 0x4000000],
24		];
25		protected static $winc1umap = [
26		0x80 => 0x20AC,
27		0x81 => 0xFFFD, // invalid
28		0x82 => 0x201A,
29		0x83 => 0x0192,
30		0x84 => 0x201E,
31		0x85 => 0x2026,
32		0x86 => 0x2020,
33		0x87 => 0x2021,
34		0x88 => 0x02C6,
35		0x89 => 0x2030,
36		0x8A => 0x0160,
37		0x8B => 0x2039,
38		0x8C => 0x0152,
39		0x8D => 0xFFFD, // invalid
40		0x8E => 0x017D,
41		0x8F => 0xFFFD, // invalid
42		0x90 => 0xFFFD, // invalid
43		0x91 => 0x2018,
44		0x92 => 0x2019,
45		0x93 => 0x201C,
46		0x94 => 0x201D,
47		0x95 => 0x2022,
48		0x96 => 0x2013,
49		0x97 => 0x2014,
50		0x98 => 0x02DC,
51		0x99 => 0x2122,
52		0x9A => 0x0161,
53		0x9B => 0x203A,
54		0x9C => 0x0153,
55		0x9D => 0xFFFD, // invalid
56		0x9E => 0x017E,
57		0x9F => 0x0178,
58		];
59
60	View Code Duplication	public function toArray($delim = '', $limit = null)
		0 ignored issues – show Duplication introduced 2015-12-04 23:19 UTC by Report Bug Copy Issue Report This method seems to be duplicated in your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
61		{
62		$this->parse();
63
64		if (empty($delim)) {
65		return $this->chars;
66		}
67		if (is_int($delim)) {
68		return \str_split($this->raw, $delim);
69		}
70		if ($limit === null) {
71		return \explode($delim, $this->raw);
72		}
73		return \explode($delim, $this->raw, $limit);
74		}
75
76		/**
77		* @return string
78		*/
79		public function charAt($index)
80		{
81		$this->parse();
82		return $this->chars[$index][0];
83		}
84
85		/**
86		* @return int
87		*/
88		public function charCodeAt($index)
89		{
90		$this->parse();
91		return $this->chars[$index][1];
92		}
93
94		public function detectForm()
95		{
96
97		}
98
99		public function length()
100		{
101		$this->parse();
102		return \count($this->chars);
103		}
104
105		public function normalize($target = self::NFC)
		0 ignored issues – show Unused Code introduced 2016-05-12 14:13 UTC by Report Bug Copy Issue Report The parameter `$target` is not used and could be removed. This check looks for parameters that have been defined for a function or method, but which are not used in the method body. Loading history...
106		{
107
108		}
109
110		/**
111		*
112		*/
113		protected static function cpToUtf8Char($cpt)
114		{
115		if ($cpt < self::$spec[2]['start']) {
116		return \chr($cpt);
117		}
118
119		if ($cpt == 0xFEFF) {
120		return '';
121		}
122
123		if (($cpt >= 0xD800 && $cpt <= 0xDFFF) \|\| $cpt > 0x10FFFF) {
124		return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
125		}
126
127		if ($cpt < self::$spec[3]['start']) {
128		$data = [
129		0b11000000 \| ($cpt >> 6),
130		0b10000000 \| ($cpt & 0b00111111)
131		];
132		} elseif ($cpt < self::$spec[4]['start']) {
133		$data = [
134		0b11100000 \| ($cpt >> 12),
135		0b10000000 \| (($cpt >> 6) & 0b00111111),
136		0b10000000 \| ($cpt & 0b00111111),
137		];
138		} else {
139		$data = [
140		0b11110100,
141		0b10000000 \| (($cpt >> 12) & 0b00111111),
142		0b10000000 \| (($cpt >> 6) & 0b00111111),
143		0b10000000 \| ($cpt & 0b00111111),
144		];
145		}
146
147		return implode(array_map('chr', $data));
148		}
149		/**
150		* @param integer $byte
151		*/
152		protected static function charLength($byte)
153		{
154		if (($byte & 0b11111110) === 0b11111100) {
155		return 6;
156		}
157		if (($byte & 0b11111100) === 0b11111000) {
158		return 5;
159		}
160		if (($byte & 0b11111000) === 0b11110000) {
161		return 4;
162		}
163		if (($byte & 0b11110000) === 0b11100000) {
164		return 3;
165		}
166		if (($byte & 0b11100000) === 0b11000000) {
167		return 2;
168		}
169		return 1;
170		}
171
172		private function parse()
173		{
174		if (!empty($this->chars)) {
175		return;
176		}
177
178		$len = \strlen($this->raw);
179		$inside = false; // are we "inside" of evaluating a valid UTF-8 char?
180		$invalid = false;
181
182		for ($offset = 0; $offset < $len; $offset++) {
183		$char = $this->raw{$offset};
184		$ord = \ord($char);
185
186		if ($inside === false) {
187		$bytes = self::charLength($ord);
188
189		if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
190		// valid UTF-8 multibyte start
191		$inside = true;
192		$cache = $char;
193		$ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
194		$originOffset = $offset;
195		} elseif ($ord < self::$spec[2]['start']) {
196		// ASCII 7-bit char
197		$this->chars[] = [$char, $ord];
198		} else {
199		// either C0/C1 block or higher; map from cp1252 to utf8 or just convert
200		$ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
201		$this->chars[] = [self::cpToUtf8Char($ord), $ord];
202		$invalid = false;
203		}
204		continue;
205		}
206
207		// $inside === true, i.e. should be continuation character
208		if (($ord & 0b11000000) !== 0b10000000) {
209		// actually, it's not one, so now the whole UTF-8 char is invalid
210		// go back and force it to parse as ISO or 1252
211		$inside = false;
212		$invalid = true;
213		$offset = $originOffset - 1;
214		continue;
215		}
216
217		// put this byte's data where it needs to go
218		$ordcache \|= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
219		$cache .= $char;
220
221		if ($originOffset + ($bytes - 1) === $offset) {
222		// we're done parsing this char, now let's verify
223		$inside = false;
224
225		// check for overlong, surrogate, too large, BOM, or C0/C1
226		$overlong = ($ordcache < self::$spec[$bytes]['start']);
227		$surrogate = ($ordcache & 0xFFFFF800 === 0xD800);
228		$toobig = ($ordcache > 0x10FFFF);
229
230		if ($overlong \|\| $surrogate \|\| $toobig) {
231		$invalid = true;
232		$offset = $originOffset - 1;
233		continue;
234		}
235
236		if ($ordcache === 0xFEFF) { // BOM
237		if ($originOffset !== 0) {
238		// if not at beginning, store as word joiner U+2060
239		$this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
240		}
241		// otherwise discard
242		continue;
243		}
244
245		// verification passed, now store it
246		$this->chars[] = [$cache, $ordcache];
247		}
248		}
249		}
250		}
251

garrettw / stringobject

Push — master ( f7fae0...1e758a )

UString::parse() C

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like