Completed
Pull Request — master (#153)
by Christophe
01:39
created

Scanner::doCharsWhile()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 4.432

Importance

Changes 0
Metric Value
dl 0
loc 17
ccs 7
cts 10
cp 0.7
rs 9.7
c 0
b 0
f 0
cc 4
nc 3
nop 2
crap 4.432
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
4
use Masterminds\HTML5\Exception;
5
6
/**
 * The scanner scans over a given data input to react appropriately to characters.
 *
 * The input is kept as a single string together with an integer byte offset
 * into it. Unless stated otherwise, every position and length handled by this
 * class is expressed in bytes, not in characters.
 */
class Scanner
{
    // Byte masks handed to strspn(). The repeated trailing '0' in CHARS_HEX and
    // CHARS_ALNUM has no effect on a mask; the values are kept unchanged.
    const CHARS_HEX = 'abcdefABCDEF01234567890';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are at in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new Scanner.
     *
     * @param string $data     Data to parse. InputStream objects are still accepted
     *                         but deprecated; they are cast to a string.
     * @param string $encoding The encoding to use for the data.
     *
     * @throws Exception If the given data cannot be encoded to UTF-8.
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        if ($data instanceof InputStream) {
            @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
            $data = (string) $data;
        }

        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Check if upcoming chars match the given sequence.
     *
     * This compares the bytes starting at the current position with
     * $sequence. The pointer is not moved, so the caller will still need to
     * read the sequence itself, even if this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    public function sequenceMatches($sequence, $caseSensitive = true)
    {
        $portion = substr($this->data, $this->char, strlen($sequence));

        return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0;
    }

    /**
     * Get the current position.
     *
     * @return int The current integer byte position.
     */
    public function position()
    {
        return $this->char;
    }

    /**
     * Take a peek at the next character in the data.
     *
     * The pointer is not moved.
     *
     * @return string|false The byte following the current one, or false when
     *                      there is no such byte.
     */
    public function peek()
    {
        // Strictly less than: index $EOF is one past the last valid byte.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Get the next character.
     *
     * Note: This advances the pointer, even when the end of the data has
     * already been reached.
     *
     * @return string|false The next character, or false at end-of-file.
     */
    public function next()
    {
        ++$this->char;

        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Get the current character.
     *
     * Note, this does not advance the pointer.
     *
     * @return string|false The current character, or false at end-of-file.
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Silently consume N chars.
     *
     * The pointer is moved forward without any bounds check.
     *
     * @param int $count
     */
    public function consume($count = 1)
    {
        $this->char += $count;
    }

    /**
     * Unconsume some of the data.
     * This moves the data pointer backwards.
     *
     * If moving back by $howMany would place the pointer before the start of
     * the data, the pointer is left where it is.
     *
     * @param int $howMany
     *            The number of characters to move the pointer back.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Get the next group of characters that are hex characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group that is hex characters, or false at
     *                      end-of-file.
     */
    public function getHex()
    {
        return $this->doCharsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters, or false
     *                      at end-of-file.
     */
    public function getAsciiAlpha()
    {
        return $this->doCharsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters and
     *                      numbers, or false at end-of-file.
     */
    public function getAsciiAlphaNum()
    {
        return $this->doCharsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of numbers, or false at end-of-file.
     */
    public function getNumeric()
    {
        return $this->doCharsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     *
     * The bytes skipped here are: newline, tab, formfeed and space.
     *
     * @return int|false The length of the matched whitespace, or false when
     *                   the pointer is already at end-of-file.
     */
    public function whitespace()
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        $len = strspn($this->data, "\n\t\f ", $this->char);

        $this->char += $len;

        return $len;
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int The current line number (1-based).
     */
    public function currentLine()
    {
        if (empty($this->EOF) || $this->char == 0) {
            return 1;
        }

        // Count the newlines in the bytes before the pointer (clamped to the
        // data length, since the pointer may have moved past the end); the
        // first line is line 1.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *
     * @return string|false The consumed bytes, or false at end-of-file.
     */
    public function charsUntil($mask)
    {
        return $this->doCharsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *
     * @return string|false The consumed bytes, or false at end-of-file.
     */
    public function charsWhile($mask)
    {
        return $this->doCharsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // next() and consume() can move the pointer past the end of the data.
        // Clamp it so the strrpos() offset below always stays inside the string.
        $position = min($this->char, $this->EOF);

        // Short circuit for the first char (and for empty input).
        if ($position == 0) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before $position). This needs to not have
        // one (to make it point to the next character, the one we want the
        // position of) added to it because strrpos's behaviour includes the
        // final offset byte.
        $backwardFrom = $position - 1 - $this->EOF;
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if ($lastLine !== false) {
            // Everything between the last newline and the pointer.
            $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            // No newline before the pointer: we are still on the first line.
            $findLengthOf = substr($this->data, 0, $position);
        }

        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string The remaining data, or an empty string when the pointer
     *                is already at end-of-file.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return ''; // false;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * NUL bytes are additionally replaced by the UTF-8 encoding of U+FFFD.
     *
     * @param string $data
     *
     * @return string
     */
    private function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() with an array tries the longest keys first, so "\r\n" is
        // replaced as a pair before the lone "\r" rule can apply.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n"
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes
     *            Bytes to stop at.
     * @param int $max
     *            Maximum number of bytes to scan. With the default (null) the
     *            scan runs to the end of the data.
     * @return string|false The consumed bytes (possibly an empty string), or
     *         false when the pointer is already at end-of-file. Use strict
     *         comparison when checking the result.
     */
    private function doCharsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length to strcspn() when a limit was actually given.
        if ($max === 0 || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes
     *            A mask of bytes to match. If ANY byte in this mask matches the
     *            current char, the pointer advances and the char is part of the
     *            substring.
     * @param int $max
     *            The max number of bytes to read. With the default (null) the
     *            scan runs to the end of the data.
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false when the pointer is already at end-of-file.
     */
    private function doCharsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length to strspn() when a limit was actually given.
        if ($max === 0 || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }
}
426