UString::cpToUtf8Char() - Code Metrics - Inspection of "cleaned up flag values and usage in AString" - garrettw/stringobject - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 34d315...a20938 )

by Garrett

created 2016-05-11 21:06 UTC

UString::cpToUtf8Char() C

↳ Parent: UString

Complexity

Conditions	8
Paths	6

Size

Total Lines	36
Code Lines	23

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	36
rs	5.3846
cc	8
eloc	23
nc	6
nop	1

<?php

namespace StringObject;

/**
 * String object that exposes its raw bytes as a sequence of Unicode characters.
 *
 * The byte string in $this->raw (not declared in this class; presumably
 * inherited from AnyString) is decoded lazily by parse() into a list of
 * [UTF-8 bytes, code point] pairs. Byte runs that are not valid UTF-8 are
 * re-read one byte at a time as Windows-1252 / ISO-8859-1 and converted.
 *
 * NOTE(review): static analysis of this revision reported inherited abstract
 * methods that are not implemented here - confirm against AnyString.
 */
class UString extends AnyString
{
    // Normalization form identifiers. The values compose as bit flags:
    // NFKC === (NFC | NFK) and NFKD === (NFD | NFK).
    const NOT_NORMALIZED = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    /** @var array[] decoded characters, each entry being [string $utf8, int $codepoint] */
    protected $chars = [];

    /** @var mixed not used anywhere in this class; purpose to be confirmed */
    protected $uhandler;

    /** @var int presumably one of the normalization constants above */
    protected $normform = self::NOT_NORMALIZED;

    /**
     * Per sequence length (in bytes): 'mask' selects the payload bits of the
     * lead byte, 'start' is the smallest code point that needs that many bytes
     * (anything lower encoded at this length is an overlong form).
     *
     * @var array[]
     */
    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];

    /**
     * Windows-1252 bytes 0x80-0x9F mapped to Unicode code points. Bytes that
     * are undefined in Windows-1252 map to U+FFFD (replacement character).
     *
     * @var int[]
     */
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    /**
     * Splits the string into an array.
     *
     * - No delimiter ('' / null / 0 / false): returns the decoded character
     *   list, each entry being [string $utf8, int $codepoint].
     * - Integer delimiter: returns chunks of the raw string of that many
     *   BYTES (str_split() is byte-oriented, so multibyte characters can be
     *   cut in half).
     * - Any other delimiter: explode() of the raw string, honouring $limit
     *   when it is not null.
     *
     * @param string|int $delim
     * @param int|null   $limit
     * @return array
     */
    public function toArray($delim = '', $limit = null)
    {
        $this->parse();

        // empty('0') is true, but "0" is a perfectly valid delimiter, so it
        // must not be mistaken for "no delimiter given".
        if (empty($delim) && $delim !== '0') {
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * Returns the UTF-8 encoded character at the given character index.
     *
     * @param int $index zero-based index into the decoded character list
     * @return string
     */
    public function charAt($index)
    {
        $this->parse();
        return $this->chars[$index][0];
    }

    /**
     * Returns the Unicode code point of the character at the given index.
     *
     * @param int $index zero-based index into the decoded character list
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->parse();
        return $this->chars[$index][1];
    }

    /**
     * Placeholder: not implemented yet (empty body, returns null).
     */
    public function detectForm()
    {

    }

    /**
     * Placeholder: not implemented yet (empty body, returns null).
     */
    public function normalize()
    {

    }

    /**
     * Encodes a single Unicode code point as a UTF-8 byte string.
     *
     * Special cases: U+FEFF (BOM) yields an empty string; surrogates
     * (U+D800..U+DFFF) and values above U+10FFFF yield the encoding of
     * U+FFFD, the replacement character.
     *
     * @param int $cpt code point
     * @return string
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            // The lead byte carries bits 18-20 of the code point. It must be
            // computed, not fixed: only U+100000..U+10FFFF start with 0xF4.
            $data = [
                0b11110000 | ($cpt >> 18),
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return \implode('', \array_map('chr', $data));
    }

    /**
     * Returns the sequence length announced by a lead byte's bit pattern.
     *
     * Bytes that match no multibyte lead pattern (ASCII, continuation bytes,
     * 0xFE, 0xFF) report a length of 1.
     *
     * @param int $byte
     * @return int between 1 and 6
     */
    protected static function charLength($byte)
    {
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        return 1;
    }

    /**
     * Decodes $this->raw into $this->chars, once.
     *
     * Walks the raw bytes as UTF-8. When a multibyte sequence turns out to be
     * malformed (bad continuation byte, overlong form, surrogate, or above
     * U+10FFFF) the scan rewinds to its lead byte and re-reads that byte as a
     * single Windows-1252 / ISO-8859-1 character. A BOM (U+FEFF) is dropped at
     * offset 0 and stored as U+2060 (word joiner) anywhere else.
     *
     * Does nothing if $this->chars is already populated.
     */
    private function parse()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false; // was the sequence starting at the current byte rejected?

        // State of the multibyte sequence currently being assembled.
        $bytes = 1;
        $originOffset = 0;
        $cache = '';
        $ordcache = 0;

        for ($offset = 0; $offset < $len; $offset++) {
            // Square brackets: the {} string offset syntax no longer parses on PHP 8.
            $char = $this->raw[$offset];
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // plausible UTF-8 multibyte start with enough bytes left
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                } elseif ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                } else {
                    // single high byte: map from cp1252 to Unicode where a
                    // mapping exists, otherwise treat the byte value as the
                    // code point, then encode as UTF-8
                    $ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
                    $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                    $invalid = false;
                }
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1; // the loop's $offset++ lands on the lead byte
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate, or too large
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                // Plain range test: "$x & MASK === VALUE" parses as
                // "$x & (MASK === VALUE)" and would never be true.
                $surrogate = ($ordcache >= 0xD800 && $ordcache <= 0xDFFF);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
}


1		<?php
2
3		namespace StringObject;
4
5		class UString extends AnyString
		Issue (Bug, introduced 2016-05-11 21:16 UTC; 0 ignored issues): There is at least one abstract method in this class. Maybe declare it as abstract, or implement the remaining methods: compareTo, escape, isAscii, isEmpty, nextToken, remove, repeat, replace, replaceWhole, resetToken, times, translate, trim, unescape, uuDecode, uuEncode.
6		{
7		const NOT_NORMALIZED = 0;
8		const NFC = 1;
9		const NFD = 2;
10		const NFK = 4;
11		const NFKC = 5;
12		const NFKD = 6;
13
14		protected $chars = [];
15		protected $uhandler;
16		protected $normform = self::NOT_NORMALIZED;
17
18		protected static $spec = [
19		2 => ['mask' => 0b00011111, 'start' => 0x80],
20		3 => ['mask' => 0b00001111, 'start' => 0x800],
21		4 => ['mask' => 0b00000111, 'start' => 0x10000],
22		5 => ['mask' => 0b00000011, 'start' => 0x200000],
23		6 => ['mask' => 0b00000001, 'start' => 0x4000000],
24		];
25		protected static $winc1umap = [
26		0x80 => 0x20AC,
27		0x81 => 0xFFFD, // invalid
28		0x82 => 0x201A,
29		0x83 => 0x0192,
30		0x84 => 0x201E,
31		0x85 => 0x2026,
32		0x86 => 0x2020,
33		0x87 => 0x2021,
34		0x88 => 0x02C6,
35		0x89 => 0x2030,
36		0x8A => 0x0160,
37		0x8B => 0x2039,
38		0x8C => 0x0152,
39		0x8D => 0xFFFD, // invalid
40		0x8E => 0x017D,
41		0x8F => 0xFFFD, // invalid
42		0x90 => 0xFFFD, // invalid
43		0x91 => 0x2018,
44		0x92 => 0x2019,
45		0x93 => 0x201C,
46		0x94 => 0x201D,
47		0x95 => 0x2022,
48		0x96 => 0x2013,
49		0x97 => 0x2014,
50		0x98 => 0x02DC,
51		0x99 => 0x2122,
52		0x9A => 0x0161,
53		0x9B => 0x203A,
54		0x9C => 0x0153,
55		0x9D => 0xFFFD, // invalid
56		0x9E => 0x017E,
57		0x9F => 0x0178,
58		];
59
60	View Code Duplication	public function toArray($delim = '', $limit = null)
		Issue (Duplication, introduced 2015-12-04 23:19 UTC; 0 ignored issues): This method seems to be duplicated in your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository.
61		{
62		$this->parse();
63
64		if (empty($delim)) {
65		return $this->chars;
66		}
67		if (is_int($delim)) {
68		return \str_split($this->raw, $delim);
69		}
70		if ($limit === null) {
71		return \explode($delim, $this->raw);
72		}
73		return \explode($delim, $this->raw, $limit);
74		}
75
76		/**
77		* @return string
78		*/
79		public function charAt($index)
80		{
81		$this->parse();
82		return $this->chars[$index][0];
83		}
84
85		/**
86		* @return int
87		*/
88		public function charCodeAt($index)
89		{
90		$this->parse();
91		return $this->chars[$index][1];
92		}
93
94		public function detectForm()
95		{
96
97		}
98
99		public function normalize()
100		{
101
102		}
103
104		/**
105		*
106		*/
107		protected static function cpToUtf8Char($cpt)
108		{
109		if ($cpt < self::$spec[2]['start']) {
110		return \chr($cpt);
111		}
112
113		if ($cpt == 0xFEFF) {
114		return '';
115		}
116
117		if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
118		return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
119		}
120
121		if ($cpt < self::$spec[3]['start']) {
122		$data = [
123		0b11000000 | ($cpt >> 6),
124		0b10000000 | ($cpt & 0b00111111)
125		];
126		} elseif ($cpt < self::$spec[4]['start']) {
127		$data = [
128		0b11100000 | ($cpt >> 12),
129		0b10000000 | (($cpt >> 6) & 0b00111111),
130		0b10000000 | ($cpt & 0b00111111),
131		];
132		} else {
133		$data = [
134		0b11110100,
135		0b10000000 | (($cpt >> 12) & 0b00111111),
136		0b10000000 | (($cpt >> 6) & 0b00111111),
137		0b10000000 | ($cpt & 0b00111111),
138		];
139		}
140
141		return implode(array_map('chr', $data));
142		}
143		/**
144		* @param integer $byte
145		*/
146		protected static function charLength($byte)
147		{
148		if (($byte & 0b11111110) === 0b11111100) {
149		return 6;
150		}
151		if (($byte & 0b11111100) === 0b11111000) {
152		return 5;
153		}
154		if (($byte & 0b11111000) === 0b11110000) {
155		return 4;
156		}
157		if (($byte & 0b11110000) === 0b11100000) {
158		return 3;
159		}
160		if (($byte & 0b11100000) === 0b11000000) {
161		return 2;
162		}
163		return 1;
164		}
165
166		private function parse()
167		{
168		if (!empty($this->chars)) {
169		return;
170		}
171
172		$len = \strlen($this->raw);
173		$inside = false; // are we "inside" of evaluating a valid UTF-8 char?
174		$invalid = false;
175
176		for ($offset = 0; $offset < $len; $offset++) {
177		$char = $this->raw{$offset};
178		$ord = \ord($char);
179
180		if ($inside === false) {
181		$bytes = self::charLength($ord);
182
183		if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
184		// valid UTF-8 multibyte start
185		$inside = true;
186		$cache = $char;
187		$ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
188		$originOffset = $offset;
189		} elseif ($ord < self::$spec[2]['start']) {
190		// ASCII 7-bit char
191		$this->chars[] = [$char, $ord];
192		} else {
193		// either C0/C1 block or higher; map from cp1252 to utf8 or just convert
194		$ord = (isset(self::$winc1umap[$ord])) ? self::$winc1umap[$ord] : $ord;
195		$this->chars[] = [self::cpToUtf8Char($ord), $ord];
196		$invalid = false;
197		}
198		continue;
199		}
200
201		// $inside === true, i.e. should be continuation character
202		if (($ord & 0b11000000) !== 0b10000000) {
203		// actually, it's not one, so now the whole UTF-8 char is invalid
204		// go back and force it to parse as ISO or 1252
205		$inside = false;
206		$invalid = true;
207		$offset = $originOffset - 1;
208		continue;
209		}
210
211		// put this byte's data where it needs to go
212		$ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
213		$cache .= $char;
214
215		if ($originOffset + ($bytes - 1) === $offset) {
216		// we're done parsing this char, now let's verify
217		$inside = false;
218
219		// check for overlong, surrogate, too large, BOM, or C0/C1
220		$overlong = ($ordcache < self::$spec[$bytes]['start']);
221		$surrogate = ($ordcache & 0xFFFFF800 === 0xD800);
222		$toobig = ($ordcache > 0x10FFFF);
223
224		if ($overlong || $surrogate || $toobig) {
225		$invalid = true;
226		$offset = $originOffset - 1;
227		continue;
228		}
229
230		if ($ordcache === 0xFEFF) { // BOM
231		if ($originOffset !== 0) {
232		// if not at beginning, store as word joiner U+2060
233		$this->chars[] = [\chr(0xE2) . \chr(0x81) . \chr(0xA0), 0x2060];
234		}
235		// otherwise discard
236		continue;
237		}
238
239		// verification passed, now store it
240		$this->chars[] = [$cache, $ordcache];
241		}
242		}
243		}
244		}
245

garrettw / stringobject

Push — master ( 34d315...a20938 )

UString::cpToUtf8Char() C

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like