Completed
Push — master ( 30aab1...cadcfa )
by Asmir
07:00
created

Scanner   A

Complexity

Total Complexity 38

Size/Duplication

Total Lines 407
Duplicated Lines 0 %

Coupling/Cohesion

Dependencies 1

Test Coverage

Coverage 92.78%

Importance

Changes 0
Metric Value
wmc 38
cbo 1
dl 0
loc 407
ccs 90
cts 97
cp 0.9278
rs 9.36
c 0
b 0
f 0

21 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 20 2
A sequenceMatches() 0 5 2
A position() 0 4 1
A peek() 0 8 2
A next() 0 10 2
A current() 0 8 2
A consume() 0 4 1
A unconsume() 0 6 2
A getHex() 0 4 1
A getAsciiAlpha() 0 4 1
A getAsciiAlphaNum() 0 4 1
A getNumeric() 0 4 1
A whitespace() 0 4 1
A currentLine() 0 10 3
A charsUntil() 0 4 1
A charsWhile() 0 4 1
A columnOffset() 0 26 3
A remainingChars() 0 11 2
A replaceLinefeeds() 0 17 1
A doCharsUntil() 0 17 4
A doCharsWhile() 0 17 4
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
4
use Masterminds\HTML5\Exception;
5
6
/**
 * The scanner scans over a given data input to react appropriately to characters.
 *
 * The input is kept as a byte string together with an integer byte pointer;
 * every method of this class either reads relative to that pointer or moves it.
 */
class Scanner
{
    /*
     * Byte masks handed to strspn() by the get*() helpers below. The repeated
     * trailing "0" in the HEX and ALNUM masks is redundant for a mask; it is
     * kept so that the values of these public constants do not change.
     */
    const CHARS_HEX = 'abcdefABCDEF01234567890';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are at in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new Scanner.
     *
     * The data is converted to UTF-8, checked for illegal codepoints (the
     * result is stored in $errors) and has its line endings normalized before
     * the pointer is placed on the first byte.
     *
     * @param string $data     Data to parse. InputStream objects are still
     *                         accepted but deprecated; they are cast to string.
     * @param string $encoding The encoding to use for the data.
     *
     * @throws Exception If the given data cannot be encoded to UTF-8.
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        if ($data instanceof InputStream) {
            @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
            $data = (string) $data;
        }

        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Check if upcoming chars match the given sequence.
     *
     * This compares the bytes starting at the current position with
     * $sequence. If they match, this will return true. If not, return false.
     * The pointer is never moved, so the caller will still need to read the
     * sequence, even if this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    public function sequenceMatches($sequence, $caseSensitive = true)
    {
        $portion = substr($this->data, $this->char, strlen($sequence));

        return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0;
    }

    /**
     * Get the current position.
     *
     * @return int The current integer byte position.
     */
    public function position()
    {
        return $this->char;
    }

    /**
     * Take a peek at the next character in the data.
     *
     * Note, this does not advance the pointer.
     *
     * @return string|false The next character, or false when there is none.
     */
    public function peek()
    {
        // Strictly less-than: index $EOF is one past the last byte, so a
        // `<=` comparison would read an uninitialized string offset when
        // the pointer sits on the final character.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Get the next character.
     *
     * Note: This advances the pointer.
     *
     * @return string|false The next character, or false at end-of-file.
     */
    public function next()
    {
        ++$this->char;

        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Get the current character.
     *
     * Note, this does not advance the pointer.
     *
     * @return string|false The current character, or false at end-of-file.
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Silently consume N chars.
     *
     * No bounds check is made, so the pointer may end up past end-of-file.
     *
     * @param int $count
     */
    public function consume($count = 1)
    {
        $this->char += $count;
    }

    /**
     * Unconsume some of the data.
     * This moves the data pointer backwards.
     *
     * If moving back would place the pointer before the start of the data,
     * the request is ignored and the pointer stays where it is.
     *
     * @param int $howMany
     *            The number of characters to move the pointer back.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Get the next group that contains hex characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group that is hex characters, or false at
     *                      end-of-file.
     */
    public function getHex()
    {
        return $this->doCharsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters, or false
     *                      at end-of-file.
     */
    public function getAsciiAlpha()
    {
        return $this->doCharsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters and
     *                      numbers, or false at end-of-file.
     */
    public function getAsciiAlphaNum()
    {
        return $this->doCharsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of numbers, or false at end-of-file.
     */
    public function getNumeric()
    {
        return $this->doCharsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     *
     * Whitespace in HTML5 is: formfeed, tab, newline, space.
     *
     * @return string|false The whitespace that was consumed (possibly an
     *                      empty string), or false at end-of-file.
     */
    public function whitespace()
    {
        return $this->doCharsWhile("\n\t\f ");
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int The current line number (1-based).
     */
    public function currentLine()
    {
        if (empty($this->EOF) || $this->char == 0) {
            return 1;
        }

        // Count the newlines in front of the pointer. The pointer is clamped
        // to EOF because consume() may have moved it past the end of the data.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *
     * @return string|false The characters read (possibly an empty string), or
     *                      false at end-of-file.
     */
    public function charsUntil($mask)
    {
        return $this->doCharsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *
     * @return string|false The characters read (possibly an empty string), or
     *                      false at end-of-file.
     */
    public function charsWhile($mask)
    {
        return $this->doCharsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * The column is reported for the byte just in front of the pointer:
     * newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if ($this->char == 0) {
            return 0;
        }

        // consume() can move the pointer past the end of the data; clamp it
        // (as currentLine() does) so the offsets below stay inside $data.
        $position = min($this->char, $this->EOF);

        // With a negative offset, strrpos() only accepts matches that start
        // at or before that many bytes from the end of the string. This
        // offset therefore yields the last "\n" located at or before byte
        // $position - 1, i.e. the newline that opened the current line.
        $backwardFrom = $position - 1 - strlen($this->data);
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if ($lastLine !== false) {
            // Everything after that newline, up to but excluding the pointer.
            $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            // No newline in front of the pointer: still on the first line.
            $findLengthOf = substr($this->data, 0, $position);
        }

        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string The remaining characters, or an empty string when the
     *                pointer is already at (or past) end-of-file.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return ''; // false;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * NUL bytes are additionally replaced with the UTF-8 encoding of U+FFFD.
     *
     * @param string $data
     *
     * @return string
     */
    private function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         *
         * strtr() tries the longest key first, so "\r\n" is collapsed to a single "\n" before the lone
         * "\r" rule can apply.
         */
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes
     *                 Bytes to match.
     * @param int|null $max
     *                 Maximum number of bytes to scan; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      when the pointer is already at end-of-file. Use
     *                      strict comparison to tell the two apart.
     */
    private function doCharsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        if ($max === 0 || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes
     *                 A mask of bytes to match. If ANY byte in this mask matches the
     *                 current char, the pointer advances and the char is part of the
     *                 substring.
     * @param int|null $max
     *                 The max number of bytes to read; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or false
     *                      when the pointer is already at end-of-file. Use
     *                      strict comparison to tell the two apart.
     */
    private function doCharsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        if ($max === 0 || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }
}
416