1 | <?php |
||
9 | class Scanner |
||
10 | { |
||
11 | const CHARS_HEX = 'abcdefABCDEF01234567890'; |
||
12 | const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; |
||
13 | const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; |
||
14 | |||
15 | /** |
||
16 | * The string data we're parsing. |
||
17 | */ |
||
18 | private $data; |
||
19 | |||
20 | /** |
||
21 | * The current integer byte position we are in $data |
||
22 | */ |
||
23 | private $char; |
||
24 | |||
25 | /** |
||
26 | * Length of $data; when $char === $EOF, we are at the end-of-file. |
||
27 | */ |
||
28 | private $EOF; |
||
29 | |||
30 | /** |
||
31 | * Parse errors. |
||
32 | */ |
||
33 | public $errors = array(); |
||
34 | |||
35 | /** |
||
36 | * Create a new Scanner. |
||
37 | * |
||
38 | * @param string $data Data to parse |
||
39 | * @param string $encoding The encoding to use for the data. |
||
40 | * |
||
41 | * @throws Exception If the given data cannot be encoded to UTF-8. |
||
42 | */ |
||
43 | 142 | public function __construct($data, $encoding = 'UTF-8') |
|
63 | |||
64 | /** |
||
65 | * Check if upcoming chars match the given sequence. |
||
66 | * |
||
67 | * This will read the stream for the $sequence. If it's |
||
68 | * found, this will return true. If not, return false. |
||
69 | * Since this unconsumes any chars it reads, the caller |
||
70 | * will still need to read the next sequence, even if |
||
71 | * this returns true. |
||
72 | * |
||
73 | * Example: $this->scanner->sequenceMatches('</script>') will |
||
74 | * see if the input stream is at the start of a |
||
75 | * '</script>' string. |
||
76 | * |
||
77 | * @param string $sequence |
||
78 | * @param bool $caseSensitive |
||
79 | * |
||
80 | * @return bool |
||
81 | */ |
||
82 | 51 | public function sequenceMatches($sequence, $caseSensitive = true) |
|
87 | |||
88 | /** |
||
89 | * Get the current position. |
||
90 | * |
||
91 | * @return int The current integer byte position. |
||
92 | */ |
||
93 | 14 | public function position() |
|
97 | |||
98 | /** |
||
99 | * Take a peek at the next character in the data. |
||
100 | * |
||
101 | * @return string The next character. |
||
102 | */ |
||
103 | 13 | public function peek() |
|
111 | |||
112 | /** |
||
113 | * Get the next character. |
||
114 | * |
||
115 | * Note: This advances the pointer. |
||
116 | * |
||
117 | * @return string The next character. |
||
118 | */ |
||
119 | 137 | public function next() |
|
129 | |||
130 | /** |
||
131 | * Get the current character. |
||
132 | * |
||
133 | * Note, this does not advance the pointer. |
||
134 | * |
||
135 | * @return string The current character. |
||
136 | */ |
||
137 | 128 | public function current() |
|
145 | |||
146 | /** |
||
147 | * Silently consume N chars. |
||
148 | * |
||
149 | * @param int $count |
||
150 | */ |
||
151 | 42 | public function consume($count = 1) |
|
155 | |||
156 | /** |
||
157 | * Unconsume some of the data. |
||
158 | * This moves the data pointer backwards. |
||
159 | * |
||
160 | * @param int $howMany |
||
161 | * The number of characters to move the pointer back. |
||
162 | */ |
||
163 | 47 | public function unconsume($howMany = 1) |
|
169 | |||
170 | /** |
||
171 | * Get the next group of characters that are hex characters. |
||
172 | * |
||
173 | * Note, along with getting the characters the pointer in the data will be |
||
174 | * moved as well. |
||
175 | * |
||
176 | * @return string The next group that is hex characters. |
||
177 | */ |
||
178 | 3 | public function getHex() |
|
182 | |||
183 | /** |
||
184 | * Get the next group of characters that are ASCII Alpha characters. |
||
185 | * |
||
186 | * Note, along with getting the characters the pointer in the data will be |
||
187 | * moved as well. |
||
188 | * |
||
189 | * @return string The next group of ASCII alpha characters. |
||
190 | */ |
||
191 | 9 | public function getAsciiAlpha() |
|
195 | |||
196 | /** |
||
197 | * Get the next group of characters that are ASCII Alpha characters and numbers. |
||
198 | * |
||
199 | * Note, along with getting the characters the pointer in the data will be |
||
200 | * moved as well. |
||
201 | * |
||
202 | * @return string The next group of ASCII alpha characters and numbers. |
||
203 | */ |
||
204 | 15 | public function getAsciiAlphaNum() |
|
208 | |||
209 | /** |
||
210 | * Get the next group of numbers. |
||
211 | * |
||
212 | * Note, along with getting the characters the pointer in the data will be |
||
213 | * moved as well. |
||
214 | * |
||
215 | * @return string The next group of numbers. |
||
216 | */ |
||
217 | 2 | public function getNumeric() |
|
221 | |||
222 | /** |
||
223 | * Consume whitespace. |
||
224 | * |
||
225 | * Whitespace in HTML5 is: formfeed, tab, newline, space. |
||
226 | */ |
||
227 | 121 | public function whitespace() |
|
231 | |||
232 | /** |
||
233 | * Returns the current line that is being consumed. |
||
234 | * |
||
235 | * @return int The current line number. |
||
236 | */ |
||
237 | 16 | public function currentLine() |
|
247 | |||
248 | /** |
||
249 | * Read chars until something in the mask is encountered. |
||
250 | * |
||
251 | * @param string $mask |
||
252 | * |
||
253 | * @return mixed |
||
254 | */ |
||
255 | 116 | public function charsUntil($mask) |
|
259 | |||
260 | /** |
||
261 | * Read chars as long as the mask matches. |
||
262 | * |
||
263 | * @param string $mask |
||
264 | * |
||
265 | * @return int |
||
266 | */ |
||
267 | 117 | public function charsWhile($mask) |
|
271 | |||
272 | /** |
||
273 | * Returns the current column of the current line that the tokenizer is at. |
||
274 | * |
||
275 | * Newlines are column 0. The first char after a newline is column 1. |
||
276 | * |
||
277 | * @return int The column number. |
||
278 | */ |
||
279 | 16 | public function columnOffset() |
|
305 | |||
306 | /** |
||
307 | * Get all characters until EOF. |
||
308 | * |
||
309 | * This consumes characters until the EOF. |
||
310 | * |
||
311 | * @return string The remaining characters, from the current position to EOF. |
||
312 | */ |
||
313 | 1 | public function remainingChars() |
|
324 | |||
325 | /** |
||
326 | * Replace linefeed characters according to the spec. |
||
327 | * |
||
328 | * @param $data |
||
329 | * |
||
330 | * @return string |
||
331 | */ |
||
332 | 142 | private function replaceLinefeeds($data) |
|
349 | |||
350 | /** |
||
351 | * Read to a particular match (or until $max bytes are consumed). |
||
352 | * |
||
353 | * This operates on byte sequences, not characters. |
||
354 | * |
||
355 | * Matches as far as possible until we reach a certain set of bytes |
||
356 | * and returns the matched substring. |
||
357 | * |
||
358 | * @param string $bytes |
||
359 | * Bytes to match. |
||
360 | * @param int $max |
||
361 | * Maximum number of bytes to scan. |
||
362 | * @return mixed Index or false if no match is found. You should use strong |
||
363 | * equality when checking the result, since index could be 0. |
||
364 | */ |
||
365 | 116 | private function doCharsUntil($bytes, $max = null) |
|
382 | |||
383 | /** |
||
384 | * Returns the string so long as $bytes matches. |
||
385 | * |
||
386 | * Matches as far as possible with a certain set of bytes |
||
387 | * and returns the matched substring. |
||
388 | * |
||
389 | * @param string $bytes |
||
390 | * A mask of bytes to match. If ANY byte in this mask matches the |
||
391 | * current char, the pointer advances and the char is part of the |
||
392 | * substring. |
||
393 | * @param int $max |
||
394 | * The max number of chars to read. |
||
395 | * |
||
396 | * @return string |
||
397 | */ |
||
398 | 130 | private function doCharsWhile($bytes, $max = null) |
|
415 | } |
||
416 |