Completed
Pull Request — master (#160)
by
unknown
01:39
created

Scanner   A

Complexity

Total Complexity 39

Size/Duplication

Total Lines 414
Duplicated Lines 0 %

Coupling/Cohesion

Dependencies 1

Test Coverage

Coverage 92.16%

Importance

Changes 0
Metric Value
wmc 39
cbo 1
dl 0
loc 414
ccs 94
cts 102
cp 0.9216
rs 9.28
c 0
b 0
f 0

21 Methods

Rating   Name   Duplication   Size   Complexity  
A doCharsUntil() 0 17 4
A __construct() 0 20 2
A sequenceMatches() 0 6 2
A position() 0 4 1
A peek() 0 8 2
A next() 0 10 2
A current() 0 8 2
A consume() 0 4 1
A unconsume() 0 6 2
A getHex() 0 4 1
A getAsciiAlpha() 0 4 1
A getAsciiAlphaNum() 0 4 1
A getNumeric() 0 4 1
A whitespace() 0 12 2
A currentLine() 0 10 3
A charsUntil() 0 4 1
A charsWhile() 0 4 1
A columnOffset() 0 26 3
A remainingChars() 0 11 2
A replaceLinefeeds() 0 17 1
A doCharsWhile() 0 17 4
1
<?php
2
3
namespace Masterminds\HTML5\Parser;
4
5
use Masterminds\HTML5\Exception;
6
7
/**
 * The scanner scans over a given data input to react appropriately to characters.
 *
 * The input is kept as a single string together with a byte offset into it.
 * All positions and lengths handled by this class are byte based; only
 * columnOffset() converts a byte range into a character count.
 */
class Scanner
{
    // Byte masks handed to strspn(). The repeated trailing "0" is redundant
    // for a mask but is kept so the public constant values stay unchanged.
    const CHARS_HEX = 'abcdefABCDEF01234567890';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     *
     * next() and consume() do not clamp this value, so it may exceed $EOF.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char >= $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new Scanner.
     *
     * The data is converted to UTF-8, checked for illegal codepoints (the
     * result is stored in $errors) and has its line endings normalised
     * before scanning starts at byte 0.
     *
     * @param string $data     Data to parse
     * @param string $encoding the encoding to use for the data
     *
     * @throws Exception if the given data cannot be encoded to UTF-8
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        if ($data instanceof InputStream) {
            // Silenced so the deprecation only reaches error handlers that opt in.
            @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
            $data = (string) $data;
        }

        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Check if upcoming chars match the given sequence.
     *
     * This compares the bytes starting at the current position with
     * $sequence. If they match, this will return true. If not, return false.
     * The pointer is never moved, so the caller will still need to read
     * the sequence, even if this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    public function sequenceMatches($sequence, $caseSensitive = true)
    {
        $portion = substr($this->data, $this->char, strlen($sequence));

        return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence);
    }

    /**
     * Get the current position.
     *
     * @return int the current integer byte position
     */
    public function position()
    {
        return $this->char;
    }

    /**
     * Take a peek at the next character in the data.
     *
     * The pointer is not moved.
     *
     * @return string|false the byte following the current one, or false when
     *                      there is no such byte
     */
    public function peek()
    {
        $ahead = $this->char + 1;

        // Valid offsets are 0 .. EOF - 1, so the comparison must be strict:
        // reading $data[EOF] would be one past the end of the string.
        if ($ahead < $this->EOF) {
            return $this->data[$ahead];
        }

        return false;
    }

    /**
     * Get the next character.
     *
     * Note: This advances the pointer, even when the end of the data has
     * already been reached.
     *
     * @return string|false the next character, or false at end-of-file
     */
    public function next()
    {
        ++$this->char;

        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Get the current character.
     *
     * Note, this does not advance the pointer.
     *
     * @return string|false the current character, or false at end-of-file
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Silently consume N chars.
     *
     * The pointer is moved forward without any bounds check.
     *
     * @param int $count
     */
    public function consume($count = 1)
    {
        $this->char += $count;
    }

    /**
     * Unconsume some of the data.
     * This moves the data pointer backwards.
     *
     * If moving back would place the pointer before the start of the data,
     * the pointer is left where it is.
     *
     * @param int $howMany The number of characters to move the pointer back
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Get the next group of characters that are hex characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false the next group that is hex characters, or false at
     *                      end-of-file
     */
    public function getHex()
    {
        return $this->doCharsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false the next group of ASCII alpha characters, or false
     *                      at end-of-file
     */
    public function getAsciiAlpha()
    {
        return $this->doCharsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false the next group of ASCII alpha characters and
     *                      numbers, or false at end-of-file
     */
    public function getAsciiAlphaNum()
    {
        return $this->doCharsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false the next group of numbers, or false at end-of-file
     */
    public function getNumeric()
    {
        return $this->doCharsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     *
     * Whitespace in HTML5 is: formfeed, tab, newline, space.
     *
     * @return int|false The length in bytes of the matched whitespace, or
     *                   false when already at end-of-file
     */
    public function whitespace()
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        $len = strspn($this->data, "\n\t\f ", $this->char);

        $this->char += $len;

        return $len;
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int the current line number
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }

        // Count the newlines in front of the pointer. The pointer is clamped
        // to EOF because next() and consume() can move it past the end.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *
     * @return string|false the consumed bytes, or false at end-of-file
     */
    public function charsUntil($mask)
    {
        return $this->doCharsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *
     * @return string|false the consumed bytes, or false at end-of-file
     */
    public function charsWhile($mask)
    {
        return $this->doCharsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int the column number
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if (0 === $this->char) {
            return 0;
        }

        // next() and consume() can move the pointer past the end of the data.
        // Clamp it, as currentLine() does, so that the strrpos() offset below
        // always stays inside the string.
        $position = min($this->char, $this->EOF);
        if (0 === $position) {
            // Empty data: there is nothing to measure.
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before $position). This needs to not have
        // one (to make it point to the next character, the one we want the
        // position of) added to it because strrpos's behaviour includes the
        // final offset byte.
        $backwardFrom = $position - 1 - $this->EOF;
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Everything after the newline, up to and including byte $position - 1.
            $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            // No newline seen yet: measure from the start of the data.
            $findLengthOf = substr($this->data, 0, $position);
        }

        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string the remaining data, or an empty string when already at
     *                end-of-file
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return ''; // false;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * NUL bytes are additionally replaced by the UTF-8 encoding of U+FFFD.
     *
     * @param string $data
     *
     * @return string
     */
    private function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() with an array tries the longest keys first, so "\r\n" is
        // replaced as a unit before a lone "\r" is considered.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes Bytes to match
     * @param int    $max   Maximum number of bytes to scan
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false when already at end-of-file. Use strict
     *                      comparison to tell the two apart.
     */
    private function doCharsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when one was really given; a null length is not
        // treated as "no limit" by every PHP version.
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                      current char, the pointer advances and the char is part of the
     *                      substring.
     * @param int    $max   The max number of chars to read
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false when already at end-of-file
     */
    private function doCharsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when one was really given; a null length is not
        // treated as "no limit" by every PHP version.
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }
}
424