Passed
Push — master ( 6fd2c5...d57481 )
by Maurício
03:03 queued 13s
created

UtfString::getCharLength()   B

Complexity

Conditions 7
Paths 12

Size

Total Lines 33
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 7

Importance

Changes 0
Metric Value
cc 7
eloc 14
nc 12
nop 1
dl 0
loc 33
ccs 15
cts 15
cp 1
crap 7
rs 8.8333
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace PhpMyAdmin\SqlParser;
6
7
use ArrayAccess;
8
use Exception;
9
use Stringable;
10
11
use function mb_check_encoding;
12
use function mb_strlen;
13
use function mb_substr;
14
use function ord;
15
use function strlen;
16
use function substr;
17
18
/**
19
 * Implementation for UTF-8 strings.
20
 *
21
 * The subscript operator in PHP, when used on a string, returns a byte and not a character. Because in UTF-8
22
 * strings a character may occupy more than one byte, the subscript operator may return an invalid character.
23
 *
24
 * Because the lexer relies on the subscript operator this class had to be implemented.
25
 *
26
 * Implements array-like access for UTF-8 strings.
27
 *
28
 * In this library, this class should be used to parse UTF-8 queries.
29
 *
30
 * @implements ArrayAccess<int, string>
31
 */
32
class UtfString implements ArrayAccess, Stringable
{
    /**
     * The raw, multi-byte string.
     *
     * @var string
     */
    public $str = '';

    /**
     * The byte index of the internal cursor.
     *
     * For ASCII strings, the byte index is equal to the character index.
     *
     * @var int
     */
    public $byteIdx = 0;

    /**
     * The character index of the internal cursor.
     *
     * For non-ASCII strings, some characters occupy more than one byte and
     * the character index will have a lower value than the byte index.
     *
     * @var int
     */
    public $charIdx = 0;

    /**
     * The length of the string (in bytes).
     *
     * @var int
     */
    public $byteLen = 0;

    /**
     * The length of the string (in characters).
     *
     * This is 0 when the string is not valid UTF-8, which makes every offset
     * out of range for such strings.
     *
     * @var int
     */
    public $charLen = 0;

    /**
     * Stores the string and computes its byte and character lengths.
     *
     * @param string $str the string
     */
    public function __construct($str)
    {
        $this->str = $str;
        $this->byteLen = mb_strlen($str, '8bit');

        // Invalid UTF-8 is treated as having no characters at all.
        $this->charLen = mb_check_encoding($str, 'UTF-8') ? mb_strlen($str, 'UTF-8') : 0;
    }

    /**
     * Checks if the given offset exists.
     *
     * @param int $offset the character offset to be checked
     */
    public function offsetExists($offset): bool
    {
        return ($offset >= 0) && ($offset < $this->charLen);
    }

    /**
     * Gets the character at given offset.
     *
     * As a side effect, the internal cursor ({@see $byteIdx} and {@see $charIdx})
     * is moved to the requested offset, so that sequential accesses do not have to
     * rescan the string from its beginning.
     *
     * @param int $offset the character offset to be returned
     *
     * @return string|null the character, or null when the offset is out of range
     */
    public function offsetGet($offset): string|null
    {
        // This function is part of hot code so the aim is to do the following
        // operations as efficiently as possible.
        // UTF-8 is a variable length encoding that encodes Unicode characters in
        // 1-4 bytes. Thus we fetch at most 4 bytes per character from the current
        // byte offset and let mb_substr pick whole characters out of that window;
        // strlen then tells how many bytes those characters occupy.
        //
        // The encoding is always passed explicitly: relying on the default would
        // make the result depend on the process-wide mb_internal_encoding().
        if (($offset < 0) || ($offset >= $this->charLen)) {
            return null;
        }

        $delta = $offset - $this->charIdx;

        if ($delta > 0) {
            // Fast forwarding: $delta characters fit in at most 4 * $delta bytes.
            $window = substr($this->str, $this->byteIdx, 4 * $delta);
            $this->byteIdx += strlen(mb_substr($window, 0, $delta, 'UTF-8'));
            $this->charIdx += $delta;
        } elseif ($delta < 0) {
            // Rewinding.
            while ($delta++ < 0) {
                // We rewind byte by byte and only count characters that are not continuation bytes,
                // i.e. ASCII characters and first octets of multibyte characters.
                do {
                    $byte = ord($this->str[--$this->byteIdx]);
                } while (($byte >= 128) && ($byte < 192));

                --$this->charIdx;
            }
        }

        // Fetch the first Unicode character within the next 4 bytes in the string.
        return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1, 'UTF-8');
    }

    /**
     * Sets the value of a character.
     *
     * @param int    $offset the offset to be set
     * @param string $value  the value to be set
     *
     * @throws Exception always, because writing is not implemented.
     */
    public function offsetSet($offset, $value): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Unsets an index.
     *
     * @param int $offset the value to be unset
     *
     * @throws Exception always, because unsetting is not implemented.
     */
    public function offsetUnset($offset): void
    {
        throw new Exception('Not implemented.');
    }

    /**
     * Returns the length in characters of the string.
     */
    public function length(): int
    {
        return $this->charLen;
    }

    /**
     * Returns the contained string.
     */
    public function __toString(): string
    {
        return $this->str;
    }
}
179