Completed
Pull Request — master (#160)
by
unknown
01:52
created

Scanner::doCharsWhile()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 4.432

Importance

Changes 0
Metric Value
dl 0
loc 17
ccs 7
cts 10
cp 0.7
rs 9.7
c 0
b 0
f 0
cc 4
nc 3
nop 2
crap 4.432
1
<?php
2
3
namespace Masterminds\HTML5\Parser;
4
5
use Masterminds\HTML5\Exception;
6
7
/**
 * The scanner scans over a given data input to react appropriately to characters.
 *
 * The input is normalised once in the constructor and is then walked using a
 * byte offset ($char) into the resulting string.
 */
class Scanner
{
    // These constants are used as byte masks for strspn(); the repeated
    // trailing "0" in the digit runs has no effect on mask matching.
    const CHARS_HEX = 'abcdefABCDEF01234567890';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new Scanner.
     *
     * @param string $data     Data to parse.
     * @param string $encoding The encoding to use for the data.
     *
     * @throws Exception If the given data cannot be encoded to UTF-8.
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        if ($data instanceof InputStream) {
            @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
            $data = (string) $data;
        }

        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Check if upcoming chars match the given sequence.
     *
     * This compares the bytes starting at the current position with
     * $sequence. The pointer is not moved, so the caller will still need
     * to read the sequence itself, even if this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    public function sequenceMatches($sequence, $caseSensitive = true)
    {
        $portion = substr($this->data, $this->char, strlen($sequence));

        return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence);
    }

    /**
     * Get the current position.
     *
     * @return int The current integer byte position.
     */
    public function position()
    {
        return $this->char;
    }

    /**
     * Take a peek at the next character in the data.
     *
     * The pointer is not moved.
     *
     * @return string|false The byte following the current one, or false when
     *                      there is no such byte.
     */
    public function peek()
    {
        // Strictly less than: offset $EOF is one past the last valid byte, so
        // "<=" would read an uninitialized string offset on the final byte.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Get the next character.
     * Note: This advances the pointer.
     *
     * @return string|false The next character, or false once the pointer has
     *                      reached the end of the data.
     */
    public function next()
    {
        ++$this->char;

        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Get the current character.
     * Note, this does not advance the pointer.
     *
     * @return string|false The current character, or false at the end of the data.
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Silently consume N chars.
     *
     * The pointer is moved forward without any bounds check.
     *
     * @param int $count
     */
    public function consume($count = 1)
    {
        $this->char += $count;
    }

    /**
     * Unconsume some of the data.
     * This moves the data pointer backwards.
     *
     * If moving back by $howMany would place the pointer before the start of
     * the data, the pointer is left where it is.
     *
     * @param int $howMany The number of characters to move the pointer back.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Get the next group that contains hex characters.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group that is hex characters, or false at
     *                      the end of the data.
     */
    public function getHex()
    {
        return $this->doCharsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters, or false
     *                      at the end of the data.
     */
    public function getAsciiAlpha()
    {
        return $this->doCharsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters and
     *                      numbers, or false at the end of the data.
     */
    public function getAsciiAlphaNum()
    {
        return $this->doCharsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of numbers, or false at the end of
     *                      the data.
     */
    public function getNumeric()
    {
        return $this->doCharsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     * The bytes skipped here are: newline, tab, formfeed, space.
     *
     * @return int|false The length of the matched whitespace, or false when the
     *                   pointer is already at the end of the data.
     */
    public function whitespace()
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        $len = strspn($this->data, "\n\t\f ", $this->char);

        $this->char += $len;

        return $len;
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int The current line number.
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }

        // Count the newlines in the bytes before the pointer (clamped to the
        // data length, since the pointer may have moved past the end); lines
        // are numbered from 1.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      at the end of the data.
     */
    public function charsUntil($mask)
    {
        return $this->doCharsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      at the end of the data.
     */
    public function charsWhile($mask)
    {
        return $this->doCharsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if (0 === $this->char) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before $this->char). This needs to not have
        // one (to make it point to the next character, the one we want the
        // position of) added to it because strrpos's behaviour includes the
        // final offset byte.
        $backwardFrom = $this->char - 1 - strlen($this->data);
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Measure the bytes that follow the newline that was found.
            $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
        } else {
            // No newline was found: measure from the start of the data up to
            // the pointer.
            $findLengthOf = substr($this->data, 0, $this->char);
        }

        // The slice is in bytes; the column is reported in characters.
        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string The remaining data, or an empty string when nothing is left.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return ''; // false;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * NUL bytes are additionally replaced with the UTF-8 encoding of U+FFFD.
     *
     * @param string $data
     *
     * @return string
     */
    private function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes Bytes to stop at.
     * @param int|null $max   Maximum number of bytes to scan; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      when the pointer is already at the end of the data.
     *                      Use strict comparison to tell the two apart.
     */
    private function doCharsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit integer 0 is a valid limit and must still be passed on.
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                        current char, the pointer advances and the char is part of the
     *                        substring.
     * @param int|null $max   The max number of bytes to read; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      when the pointer is already at the end of the data.
     */
    private function doCharsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit integer 0 is a valid limit and must still be passed on.
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }
}
417