1 | <?php |
||
10 | class Scanner |
||
11 | { |
||
12 | const CHARS_HEX = 'abcdefABCDEF01234567890'; |
||
13 | const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; |
||
14 | const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; |
||
15 | |||
16 | /** |
||
17 | * The string data we're parsing. |
||
18 | */ |
||
19 | private $data; |
||
20 | |||
21 | /** |
||
22 | * The current integer byte position we are in $data. |
||
23 | */ |
||
24 | private $char; |
||
25 | |||
26 | /** |
||
27 | * Length of $data; when $char === $EOF, we are at the end-of-file. |
||
28 | */ |
||
29 | private $EOF; |
||
30 | |||
31 | /** |
||
32 | * Parse errors. |
||
33 | */ |
||
34 | public $errors = array(); |
||
35 | |||
/**
 * Build a Scanner over the given input.
 *
 * The input is converted to UTF-8, checked for illegal codepoints (the
 * findings are stored in $this->errors), and has its linefeeds replaced
 * via replaceLinefeeds() before the read pointer is reset to the start.
 *
 * @param string $data     Data to parse. An InputStream is still accepted
 *                         but deprecated; it is cast to a string.
 * @param string $encoding The encoding to use for the data.
 *
 * @throws Exception If the given data cannot be encoded to UTF-8.
 */
public function __construct($data, $encoding = 'UTF-8')
{
    // Legacy path: accept InputStream objects, but flag them as deprecated.
    if ($data instanceof InputStream) {
        @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
        $data = (string) $data;
    }

    $utf8 = UTF8Utils::convertToUTF8($data, $encoding);

    // Whether this belongs here is debatable: most of these checks also
    // happen during parsing, and the result is only recorded, not acted on.
    $this->errors = UTF8Utils::checkForIllegalCodepoints($utf8);

    $normalized = $this->replaceLinefeeds($utf8);

    $this->data = $normalized;
    $this->EOF = strlen($normalized);
    $this->char = 0;
}
|
64 | |||
/**
 * Check whether the upcoming bytes match the given sequence.
 *
 * Compares the bytes starting at the current position against $sequence
 * and reports whether they are equal. The pointer is never moved, so the
 * caller still has to consume the sequence itself, even when this
 * returns true.
 *
 * Example: $this->scanner->sequenceMatches('</script>') tells whether
 * the input is positioned at the start of a '</script>' string.
 *
 * @param string $sequence      The byte sequence to look for.
 * @param bool   $caseSensitive Pass false to compare case-insensitively.
 *
 * @return bool True when the upcoming bytes equal $sequence.
 */
public function sequenceMatches($sequence, $caseSensitive = true)
{
    $upcoming = substr($this->data, $this->char, strlen($sequence));

    if ($caseSensitive) {
        return $upcoming === $sequence;
    }

    return 0 === strcasecmp($upcoming, $sequence);
}
||
89 | |||
/**
 * Report where the scanner currently is in the data.
 *
 * @return int The current integer byte position.
 */
public function position()
{
    $offset = $this->char;

    return $offset;
}
||
99 | |||
/**
 * Take a peek at the next character in the data.
 *
 * Looks at the byte one position past the current one without moving
 * the pointer.
 *
 * @return string|false The next character, or false when there is no
 *                      character after the current position.
 */
public function peek()
{
    $ahead = $this->char + 1;

    // $EOF is strlen($data), so the last valid offset is $EOF - 1. The
    // bound must be strict; `<=` would read one byte past the end.
    if ($ahead < $this->EOF) {
        return $this->data[$ahead];
    }

    return false;
}
||
113 | |||
/**
 * Advance the pointer by one byte and return the character found there.
 *
 * Note: the pointer is moved even when the end of the data is reached.
 *
 * @return string|false The character at the new position, or false when
 *                      the new position is at or past the end of the data.
 */
public function next()
{
    $this->char++;

    return $this->char < $this->EOF ? $this->data[$this->char] : false;
}
||
130 | |||
131 | /** |
||
132 | * Get the current character. |
||
133 | * Note, this does not advance the pointer. |
||
134 | * |
||
135 | * @return string The current character. |
||
136 | */ |
||
137 | 128 | public function current() |
|
145 | |||
146 | /** |
||
147 | * Silently consume N chars. |
||
148 | * |
||
149 | * @param int $count |
||
150 | */ |
||
151 | 42 | public function consume($count = 1) |
|
155 | |||
156 | /** |
||
157 | * Unconsume some of the data. |
||
158 | * This moves the data pointer backwards. |
||
159 | * |
||
160 | * @param int $howMany The number of characters to move the pointer back. |
||
161 | */ |
||
162 | 47 | public function unconsume($howMany = 1) |
|
168 | |||
169 | /** |
||
170 | * Get the next group of characters that are hex characters. |
||
171 | * Note, along with getting the characters the pointer in the data will be |
||
172 | * moved as well. |
||
173 | * |
||
174 | * @return string The next group that is hex characters. |
||
175 | */ |
||
176 | 3 | public function getHex() |
|
180 | |||
181 | /** |
||
182 | * Get the next group of characters that are ASCII Alpha characters. |
||
183 | * Note, along with getting the characters the pointer in the data will be |
||
184 | * moved as well. |
||
185 | * |
||
186 | * @return string The next group of ASCII alpha characters. |
||
187 | */ |
||
188 | 9 | public function getAsciiAlpha() |
|
192 | |||
193 | /** |
||
194 | * Get the next group of characters that are ASCII Alpha characters and numbers. |
||
195 | * Note, along with getting the characters the pointer in the data will be |
||
196 | * moved as well. |
||
197 | * |
||
198 | * @return string The next group of ASCII alpha characters and numbers. |
||
199 | */ |
||
200 | 15 | public function getAsciiAlphaNum() |
|
204 | |||
205 | /** |
||
206 | * Get the next group of numbers. |
||
207 | * Note, along with getting the characters the pointer in the data will be |
||
208 | * moved as well. |
||
209 | * |
||
210 | * @return string The next group of numbers. |
||
211 | */ |
||
212 | 2 | public function getNumeric() |
|
216 | |||
217 | /** |
||
218 | * Consume whitespace. |
||
219 | * Whitespace in HTML5 is: formfeed, tab, newline, space. |
||
220 | * |
||
221 | * @return int The length of the matched whitespaces. |
||
222 | */ |
||
223 | 121 | public function whitespace() |
|
235 | |||
236 | /** |
||
237 | * Returns the current line that is being consumed. |
||
238 | * |
||
239 | * @return int The current line number. |
||
240 | */ |
||
241 | 16 | public function currentLine() |
|
251 | |||
252 | /** |
||
253 | * Read chars until something in the mask is encountered. |
||
254 | * |
||
255 | * @param string $mask |
||
256 | * |
||
257 | * @return mixed |
||
258 | */ |
||
259 | 116 | public function charsUntil($mask) |
|
263 | |||
264 | /** |
||
265 | * Read chars as long as the mask matches. |
||
266 | * |
||
267 | * @param string $mask |
||
268 | * |
||
269 | * @return int |
||
270 | */ |
||
271 | 117 | public function charsWhile($mask) |
|
275 | |||
276 | /** |
||
277 | * Returns the current column of the current line that the tokenizer is at. |
||
278 | * |
||
279 | * Newlines are column 0. The first char after a newline is column 1. |
||
280 | * |
||
281 | * @return int The column number. |
||
282 | */ |
||
283 | 16 | public function columnOffset() |
|
309 | |||
310 | /** |
||
311 | * Get all characters until EOF. |
||
312 | * |
||
313 | * This consumes characters until the EOF. |
||
314 | * |
||
315 | * @return int The number of characters remaining. |
||
316 | */ |
||
317 | 1 | public function remainingChars() |
|
328 | |||
329 | /** |
||
330 | * Replace linefeed characters according to the spec. |
||
331 | * |
||
332 | * @param $data |
||
333 | * |
||
334 | * @return string |
||
335 | */ |
||
336 | 142 | private function replaceLinefeeds($data) |
|
353 | |||
354 | /** |
||
355 | * Read to a particular match (or until $max bytes are consumed). |
||
356 | * |
||
357 | * This operates on byte sequences, not characters. |
||
358 | * |
||
359 | * Matches as far as possible until we reach a certain set of bytes |
||
360 | * and returns the matched substring. |
||
361 | * |
||
362 | * @param string $bytes Bytes to match. |
||
363 | * @param int $max Maximum number of bytes to scan. |
||
364 | * |
||
365 | * @return mixed Index or false if no match is found. You should use strong |
||
366 | * equality when checking the result, since index could be 0. |
||
367 | */ |
||
368 | 116 | private function doCharsUntil($bytes, $max = null) |
|
385 | |||
386 | /** |
||
387 | * Returns the string so long as $bytes matches. |
||
388 | * |
||
389 | * Matches as far as possible with a certain set of bytes |
||
390 | * and returns the matched substring. |
||
391 | * |
||
392 | * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the |
||
393 | * current char, the pointer advances and the char is part of the |
||
394 | * substring. |
||
395 | * @param int $max The max number of chars to read. |
||
396 | * |
||
397 | * @return string |
||
398 | */ |
||
399 | 129 | private function doCharsWhile($bytes, $max = null) |
|
416 | } |
||
417 |