UStrObj::loadToArray() - Code Metrics - Inspection of "more work on unicode support, not done yet" - garrettw/stringobject - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 1d9387...a47b74 )

by Garrett

created 2016-03-20 22:29 UTC

UStrObj::loadToArray() C

↳ Parent: UStrObj

Complexity

Conditions	16
Paths	12

Size

Total Lines	78
Code Lines	44

Duplication

Lines	0
Ratio	0 %

Importance

Changes	4
Bugs	0	Features	0

Metric	Value
c	4
b	0
f	0
dl	0
loc	78
rs	5.142
cc	16
eloc	44
nc	12
nop	0

How to fix Long Method Complexity

<?php

namespace StringObject;

/**
 * String object that exposes its content as a list of UTF-8 characters.
 *
 * The raw byte string (read from $this->raw, which is not declared in this
 * class and is presumably provided by AnyStrObj) is lazily decoded into
 * $chars by loadToArray(). Byte sequences that are not valid UTF-8 are
 * re-read one byte at a time as Windows-1252 / ISO-8859-1 and converted.
 */
class UStrObj extends AnyStrObj
{
    // Normalization form identifiers. NOTE(review): NFK looks like a
    // "compatibility" flag bit (NFKC = NFC | NFK, NFKD = NFD | NFK) — confirm.
    const NOT_NORMALIZED = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    /** @var array[] decoded characters, each entry is [utf8 string, code point] */
    protected $chars = [];
    protected $uhandler;
    protected $normform = self::NOT_NORMALIZED;

    /**
     * Per sequence length (in bytes): the mask that extracts the payload bits
     * of the lead byte, and the smallest code point that legitimately needs
     * that many bytes (anything below it is an overlong encoding).
     */
    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];

    /**
     * Windows-1252 bytes 0x80-0x9F mapped to their Unicode code points.
     * Bytes that are unassigned in cp1252 map to U+FFFD.
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Splits the string into an array.
     *
     * - Empty delimiter: returns the decoded character list, where each
     *   entry is a [utf8 string, code point] pair.
     * - Integer delimiter: splits the raw bytes into chunks of that length.
     *   NOTE(review): str_split() is byte-wise, so chunks can cut through a
     *   multi-byte character — confirm whether that is intended.
     * - String delimiter: explode()s the raw string, honouring $limit.
     *
     * @param string|int $delim
     * @param int|null   $limit only used with a string delimiter
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        // '0' is falsy for empty() but is a perfectly valid delimiter.
        if (empty($delim) && $delim !== '0') {
            $this->loadToArray();
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the UTF-8 encoded character at the given character index.
     *
     * @param int $index zero-based index into the decoded character list
     * @return string
     */
    public function charAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index][0];
    }

    /**
     * Returns the Unicode code point of the character at the given index.
     *
     * @param int $index zero-based index into the decoded character list
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->loadToArray();
        return $this->chars[$index][1];
    }

    /**
     * Encodes a Unicode code point as a UTF-8 byte string.
     *
     * U+FEFF yields an empty string; surrogates (U+D800-U+DFFF) and values
     * above U+10FFFF yield the encoding of U+FFFD.
     *
     * @param int $cpt code point
     * @return string
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        $invalid = [0xEF, 0xBF, 0xBD]; // U+FFFD

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt >= 0xD800 && $cpt <= 0xDFFF) {
            $data = $invalid;
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt <= 0x10FFFF) {
            $data = [
                // the lead byte carries bits 18-20 of the code point
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = $invalid;
        }

        return implode(array_map('chr', $data));
    }

    /**
     * Returns the sequence length announced by a UTF-8 lead byte.
     *
     * Bytes that are not a multi-byte lead (ASCII, continuation bytes,
     * 0xFE, 0xFF) report a length of 1.
     *
     * @param integer $byte
     * @return int between 1 and 6
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }

    /**
     * Decodes $this->raw into $this->chars (once; later calls are no-ops
     * while $this->chars is non-empty).
     *
     * Well-formed UTF-8 sequences are stored as-is. A sequence that turns
     * out to be malformed, overlong, a surrogate or above U+10FFFF is
     * rewound: its lead byte is re-read as a single Windows-1252 /
     * ISO-8859-1 byte and parsing resumes on the byte after it.
     *
     * @return void
     */
    private function loadToArray()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // set while rewinding so the lead byte is not retried as UTF-8

        // state of the multi-byte sequence currently being assembled
        $bytes = 1;
        $cache = '';
        $ordcache = 0;
        $originOffset = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // valid UTF-8 multibyte start
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's $offset++ lands on the lead byte
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate, too large, BOM, or C0/C1
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                // explicit range check: "$x & MASK === VALUE" would compare
                // first (=== binds tighter than &) and never match
                $surrogate = ($ordcache >= 0xD800 && $ordcache <= 0xDFFF);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
}


1		<?php
2
3		namespace StringObject;
4
5		class UStrObj extends AnyStrObj
		0 ignored issues – show | Bug | introduced 2016-03-20 22:38 UTC | Report Bug | Copy Issue Report | There is at least one abstract method in this class. Maybe declare it as abstract, or implement the remaining methods: compareTo, escape, isAscii, isEmpty, nextToken, remove, repeat, replace, resetToken, times, translate, trim, unescape, uuDecode, uuEncode. | Loading history...
6		{
7		const NOT_NORMALIZED = 0;
8		const NFC = 1;
9		const NFD = 2;
10		const NFK = 4;
11		const NFKC = 5;
12		const NFKD = 6;
13
14		protected $chars = [];
15		protected $uhandler;
16		protected $normform = self::NOT_NORMALIZED;
17
18		protected static $spec = [
19		2 => ['mask' => 0b00011111, 'start' => 0x80],
20		3 => ['mask' => 0b00001111, 'start' => 0x800],
21		4 => ['mask' => 0b00000111, 'start' => 0x10000],
22		5 => ['mask' => 0b00000011, 'start' => 0x200000],
23		6 => ['mask' => 0b00000001, 'start' => 0x4000000],
24		];
25		protected static $winc1umap = [
26		0x80 => 0x20AC,
27		0x81 => 0xFFFD, // invalid
28		0x82 => 0x201A,
29		0x83 => 0x0192,
30		0x84 => 0x201E,
31		0x85 => 0x2026,
32		0x86 => 0x2020,
33		0x87 => 0x2021,
34		0x88 => 0x02C6,
35		0x89 => 0x2030,
36		0x8A => 0x0160,
37		0x8B => 0x2039,
38		0x8C => 0x0152,
39		0x8D => 0xFFFD, // invalid
40		0x8E => 0x017D,
41		0x8F => 0xFFFD, // invalid
42		0x90 => 0xFFFD, // invalid
43		0x91 => 0x2018,
44		0x92 => 0x2019,
45		0x93 => 0x201C,
46		0x94 => 0x201D,
47		0x95 => 0x2022,
48		0x96 => 0x2013,
49		0x97 => 0x2014,
50		0x98 => 0x02DC,
51		0x99 => 0x2122,
52		0x9A => 0x0161,
53		0x9B => 0x203A,
54		0x9C => 0x0153,
55		0x9D => 0xFFFD, // invalid
56		0x9E => 0x017E,
57		0x9F => 0x0178,
58		];
59
60	View Code Duplication	public function toArray($delim = '', $limit = null)
		0 ignored issues – show | Duplication | introduced 2015-12-04 23:19 UTC | Report Bug | Copy Issue Report | This method seems to be duplicated in your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. | Loading history...
61		{
62		$this->loadToArray();
63
64		if (empty($delim)) {
65		return $this->chars;
66		}
67		if (is_int($delim)) {
68		return \str_split($this->raw, $delim);
69		}
70		if ($limit === null) {
71		return \explode($delim, $this->raw);
72		}
73		return \explode($delim, $this->raw, $limit);
74		}
75
76		/**
77		* @return string
78		*/
79		public function charAt($index)
80		{
81		$this->loadToArray();
82		return $this->chars[$index][0];
83		}
84
85		/**
86		* @return int
87		*/
88		public function charCodeAt($index)
89		{
90		$this->loadToArray();
91		return $this->chars[$index][1];
92		}
93
94		/**
95		*
96		*/
97		protected static function cpToUtf8Char($cpt)
98		{
99		if ($cpt < self::$spec[2]['start']) {
100		return \chr($cpt);
101		}
102
103		if ($cpt == 0xFEFF) {
104		return '';
105		}
106
107		$invalid = [0xEF, 0xBF, 0xBD]; // U+FFFD
108
109		if ($cpt < self::$spec[3]['start']) {
110		$data = [
111		0b11000000 \| ($cpt >> 6),
112		0b10000000 \| ($cpt & 0b00111111)
113		];
114		} elseif ($cpt >= 0xD800 && $cpt <= 0xDFFF) {
115		$data = $invalid;
116		} elseif ($cpt < self::$spec[4]['start']) {
117		$data = [
118		0b11100000 \| ($cpt >> 12),
119		0b10000000 \| (($cpt >> 6) & 0b00111111),
120		0b10000000 \| ($cpt & 0b00111111),
121		];
122		} elseif ($cpt <= 0x10FFFF) {
123		$data = [
124		0b11110100,
125		0b10000000 \| (($cpt >> 12) & 0b00111111),
126		0b10000000 \| (($cpt >> 6) & 0b00111111),
127		0b10000000 \| ($cpt & 0b00111111),
128		];
129		} else {
130		$data = $invalid;
131		}
132
133		return implode(array_map('chr', $data));
134		}
135		/**
136		* @param integer $byte
137		*/
138		protected static function charLength($byte)
139		{
140		if (($byte & 0b11111110) === 0b11111100) {
141		return 6;
142		}
143		if (($byte & 0b11111100) === 0b11111000) {
144		return 5;
145		}
146		if (($byte & 0b11111000) === 0b11110000) {
147		return 4;
148		}
149		if (($byte & 0b11110000) === 0b11100000) {
150		return 3;
151		}
152		if (($byte & 0b11100000) === 0b11000000) {
153		return 2;
154		}
155		return 1;
156		}
157
158		private function loadToArray()
159		{
160		if (!empty($this->chars)) {
161		return;
162		}
163
164		$len = \strlen($this->raw);
165		$inside = false; // are we "inside" of evaluating a valid UTF-8 char?
166		$invalid = false;
167
168		for ($offset = 0; $offset < $len; $offset++) {
169		$char = $this->raw{$offset};
170		$ord = \ord($char);
171
172		if ($inside === false) {
173		$bytes = self::charLength($ord);
174
175		if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
176		// valid UTF-8 multibyte start
177		$inside = true;
178		$cache = $char;
179		$ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
180		$originOffset = $offset;
181		} elseif ($ord < self::$spec[2]['start']) {
182		// ASCII 7-bit char
183		$this->chars[] = [$char, $ord];
184		} else {
185		// either C0/C1 block or higher; map from cp1252 to utf8 or just convert
186		$ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
187		$this->chars[] = [self::cpToUtf8Char($ord), $ord];
188		$invalid = false;
189		}
190		continue;
191		}
192
193		// $inside === true, i.e. should be continuation character
194		if (($ord & 0b11000000) !== 0b10000000) {
195		// actually, it's not one, so now the whole UTF-8 char is invalid
196		// go back and force it to parse as ISO or 1252
197		$inside = false;
198		$invalid = true;
199		$offset = $originOffset - 1;
200		continue;
201		}
202
203		// put this byte's data where it needs to go
204		$ordcache \|= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
205		$cache .= $char;
206
207		if ($originOffset + ($bytes - 1) === $offset) {
208		// we're done parsing this char, now let's verify
209		$inside = false;
210
211		// check for overlong, surrogate, too large, BOM, or C0/C1
212		$overlong = ($ordcache < self::$spec[$bytes]['start']);
213		$surrogate = ($ordcache & 0xFFFFF800 === 0xD800);
214		$toobig = ($ordcache > 0x10FFFF);
215
216		if ($overlong \|\| $surrogate \|\| $toobig) {
217		$invalid = true;
218		$offset = $originOffset - 1;
219		continue;
220		}
221
222		if ($ordcache === 0xFEFF) { // BOM
223		if ($originOffset !== 0) {
224		// if not at beginning, store as word joiner U+2060
225		$this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
226		}
227		// otherwise discard
228		continue;
229		}
230
231		// verification passed, now store it
232		$this->chars[] = [$cache, $ordcache];
233		}
234		}
235		}
236
237		}
238

garrettw / stringobject

Push — master ( 1d9387...a47b74 )

UStrObj::loadToArray() C

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like