UtfString::getCharLength() - Code Metrics - Inspection of "Optimize offsetGet" - phpmyadmin/sql-parser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#506)

unknown

created 2023-09-08 15:50 UTC

UtfString::getCharLength() B

↳ Parent: Project

Complexity

Conditions	7
Paths	12

Size

Total Lines	33
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	15
CRAP Score	7

Importance

Changes

Metric	Value
cc	7
eloc	14
nc	12
nop	1
dl	0
loc	33
ccs	15
cts	15
cp	1
crap	7
rs	8.8333
c	0
b	0
f	0

<?php

declare(strict_types=1);

namespace PhpMyAdmin\SqlParser;

use ArrayAccess;
use Exception;
use Stringable;

use function mb_check_encoding;
use function mb_strlen;
use function mb_substr;
use function ord;
use function strlen;
use function substr;

/**
 * Implementation for UTF-8 strings.
 *
 * The subscript operator in PHP, when used with a string, returns a byte and not a character. Because in UTF-8
 * strings a character may occupy more than one byte, the subscript operator may return an invalid character.
 *
 * Because the lexer relies on the subscript operator this class had to be implemented.
 *
 * Implements read-only, array-like access for UTF-8 strings: `$utf[$i]` yields the i-th character.
 * The object remembers the byte and character position of the most recently requested character, so a
 * lookup only walks the distance between the previous offset and the requested one.
 *
 * A string that is not valid UTF-8 is treated as having zero characters; every offset is then out of range.
 *
 * In this library, this class should be used to parse UTF-8 queries.
 *
 * @implements ArrayAccess<int, string>
 */
class UtfString implements ArrayAccess, Stringable
{
    /**
     * The raw, multi-byte string.
     *
     * @var string
     */
    public $str = '';

    /**
     * The index of current byte.
     *
     * For ASCII strings, the byte index is equal to the character index.
     *
     * @var int
     */
    public $byteIdx = 0;

    /**
     * The index of current character.
     *
     * For non-ASCII strings, some characters occupy more than one byte and
     * the character index will have a lower value than the byte index.
     *
     * @var int
     */
    public $charIdx = 0;

    /**
     * The length of the string (in bytes).
     *
     * @var int
     */
    public $byteLen = 0;

    /**
     * The length of the string (in characters).
     *
     * Zero when the string is not valid UTF-8.
     *
     * @var int
     */
    public $charLen = 0;

    /**
     * Stores the string and pre-computes its byte and character lengths.
     *
     * @param string $str the string
     */
    public function __construct($str)
    {
        $this->str = $str;
        $this->byteLen = mb_strlen($str, '8bit');

        // Invalid UTF-8 is reported as an empty character sequence so that no offset is ever readable.
        $this->charLen = mb_check_encoding($str, 'UTF-8') ? mb_strlen($str, 'UTF-8') : 0;
    }

    /**
     * Checks if the given offset exists.
     *
     * @param int $offset the offset to be checked
     */
    public function offsetExists($offset): bool
    {
        return ($offset >= 0) && ($offset < $this->charLen);
    }

    /**
     * Gets the character at given offset.
     *
     * Moves the internal byte and character pointers to the requested offset as a side effect.
     *
     * @param int $offset the offset to be returned
     *
     * @return string|null the character at the offset, or null when the offset is out of range
     */
    public function offsetGet($offset): string|null
    {
        // This function is part of hot code so the aim is to do the following
        // operations as efficiently as possible.
        // UTF-8 character encoding is a variable length encoding that encodes Unicode
        // characters in 1-4 bytes. Thus we fetch 4 bytes per character from the current offset and then
        // use mb_substr to cut out the wanted characters. strlen then gives their size in bytes.
        // The encoding is always passed explicitly: the lengths computed in the constructor are UTF-8
        // lengths, so decoding must not depend on the application's mb_internal_encoding() setting.
        if (($offset < 0) || ($offset >= $this->charLen)) {
            return null;
        }

        $delta = $offset - $this->charIdx;

        if ($delta > 0) {
            // Fast forwarding: $delta characters occupy at most 4 * $delta bytes.
            $window = substr($this->str, $this->byteIdx, 4 * $delta);
            $this->byteIdx += strlen(mb_substr($window, 0, $delta, 'UTF-8'));
            $this->charIdx += $delta;
        } elseif ($delta < 0) {
            // Rewinding.
            while ($delta++ < 0) {
                // We rewind byte by byte and only count characters that are not continuation bytes,
                // i.e. ASCII characters and first octets of multibyte characters
                do {
                    $byte = ord($this->str[--$this->byteIdx]);
                } while (($byte >= 128) && ($byte < 192));

                --$this->charIdx;
            }
        }

        // Fetch the first Unicode character within the next 4 bytes in the string.
        return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1, 'UTF-8');
    }

    /**
     * Sets the value of a character.
     *
     * Writing is not supported; this method always throws.
     *
     * @param int    $offset the offset to be set
     * @param string $value  the value to be set
     *
     * @throws Exception not implemented.
     */
    public function offsetSet($offset, $value): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Unsets an index.
     *
     * Removing characters is not supported; this method always throws.
     *
     * @param int $offset the value to be unset
     *
     * @throws Exception not implemented.
     */
    public function offsetUnset($offset): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Returns the length in characters of the string.
     */
    public function length(): int
    {
        return $this->charLen;
    }

    /**
     * Returns the contained string.
     */
    public function __toString(): string
    {
        return $this->str;
    }
}


1		<?php
2
3		declare(strict_types=1);
4
5		namespace PhpMyAdmin\SqlParser;
6
7		use ArrayAccess;
8		use Exception;
9		use Stringable;
10
11		use function mb_check_encoding;
12		use function mb_strlen;
13		use function mb_substr;
14		use function ord;
15		use function strlen;
16		use function substr;
17
18		/**
19		* Implementation for UTF-8 strings.
20		*
21		* The subscript operator in PHP, when used with string will return a byte and not a character. Because in UTF-8
22		* strings a character may occupy more than one byte, the subscript operator may return an invalid character.
23		*
24		* Because the lexer relies on the subscript operator this class had to be implemented.
25		*
26		* Implements array-like access for UTF-8 strings.
27		*
28		* In this library, this class should be used to parse UTF-8 queries.
29		*
30		* @implements ArrayAccess<int, string>
31		*/
32		class UtfString implements ArrayAccess, Stringable
33		{
34		/**
35		* The raw, multi-byte string.
36		*
37		* @var string
38		*/
39		public $str = '';
40
41		/**
42		* The index of current byte.
43		*
44		* For ASCII strings, the byte index is equal to the character index.
45		*
46		* @var int
47		*/
48		public $byteIdx = 0;
49
50		/**
51		* The index of current character.
52		*
53		* For non-ASCII strings, some characters occupy more than one byte and
54		* the character index will have a lower value than the byte index.
55		*
56		* @var int
57		*/
58		public $charIdx = 0;
59
60		/**
61		* The length of the string (in bytes).
62		*
63		* @var int
64		*/
65		public $byteLen = 0;
66
67		/**
68		* The length of the string (in characters).
69		*
70		* @var int
71		*/
72		public $charLen = 0;
73
74		/**
75		* @param string $str the string
76		*/
77	28	public function __construct($str)
78		{
79	28	$this->str = $str;
80	28	$this->byteLen = mb_strlen($str, '8bit');
81	28	if (! mb_check_encoding($str, 'UTF-8')) {
82	2	$this->charLen = 0;
83		} else {
84	26	$this->charLen = mb_strlen($str, 'UTF-8');
85		}
86		}
87
88		/**
89		* Checks if the given offset exists.
90		*
91		* @param int $offset the offset to be checked
92		*/
93	2	public function offsetExists($offset): bool
94		{
95	2	return ($offset >= 0) && ($offset < $this->charLen);
96		}
97
98		/**
99		* Gets the character at given offset.
100		*
101		* @param int $offset the offset to be returned
102		*/
103	22	public function offsetGet($offset): string|null
104		{
105		// This function moves the internal byte and character pointer to the requested offset.
106		// This function is part of hot code so the aim is to do the following
107		// operations as efficiently as possible.
108		// UTF-8 character encoding is a variable length encoding that encodes Unicode
109		// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
110		// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
111	22	if (($offset < 0) || ($offset >= $this->charLen)) {
112	4	return null;
113		}
114
115	20	$delta = $offset - $this->charIdx;
116
117	20	if ($delta > 0) {
118		// Fast forwarding.
119	20	$this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));
120	20	$this->charIdx += $delta;
121	18	} elseif ($delta < 0) {
122		// Rewinding.
123	18	while ($delta++ < 0) {
124		// We rewind byte by byte and only count characters that are not continuation bytes,
125		// i.e. ASCII characters and first octets of multibyte characters
126		do {
127	18	$byte = ord($this->str[--$this->byteIdx]);
128	18	} while (($byte >= 128) && ($byte < 192));
129
130	18	--$this->charIdx;
131		}
132		}
133
134		// Fetch the first Unicode character within the next 4 bytes in the string.
135	20	return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
136		}
137
138		/**
139		* Sets the value of a character.
140		*
141		* @param int $offset the offset to be set
142		* @param string $value the value to be set
143		*
144		* @throws Exception not implemented.
145		*/
146	2	public function offsetSet($offset, $value): void
147		{
148	2	throw new Exception('Not implemented.');
149		}
150
151		/**
152		* Unsets an index.
153		*
154		* @param int $offset the value to be unset
155		*
156		* @throws Exception not implemented.
157		*/
158	2	public function offsetUnset($offset): void
159		{
160	2	throw new Exception('Not implemented.');
161		}
162
163		/**
164		* Returns the length in characters of the string.
165		*/
166	10	public function length(): int
167		{
168	10	return $this->charLen;
169		}
170
171		/**
172		* Returns the contained string.
173		*/
174	2	public function __toString(): string
175		{
176	2	return $this->str;
177		}
178		}
179

phpmyadmin / sql-parser

Pull Request — master (#506)

UtfString::getCharLength() B

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like