1 | <?php |
||
10 | class Scanner |
||
11 | { |
||
/**
 * Byte masks describing character classes.
 *
 * Within this class they are handed to doCharsWhile() as the $bytes mask,
 * where a byte is accepted if it appears anywhere in the mask. Only the set
 * of distinct bytes matters, so each byte is listed exactly once.
 */
const CHARS_HEX = 'abcdefABCDEF0123456789';
const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
||
15 | |||
16 | /** |
||
17 | * The string data we're parsing. |
||
18 | */ |
||
19 | private $data; |
||
20 | |||
21 | /** |
||
22 | * The current integer byte position we are in $data. |
||
23 | */ |
||
24 | private $char; |
||
25 | |||
26 | /** |
||
27 | * Length of $data; when $char === $EOF, we are at the end-of-file. |
||
28 | */ |
||
29 | private $EOF; |
||
30 | |||
31 | /** |
||
32 | * Parse errors. |
||
33 | */ |
||
34 | public $errors = array(); |
||
35 | |||
/**
 * Build a scanner over the given input.
 *
 * The input is converted to UTF-8, run through the illegal-codepoint check
 * (whose result is stored in $errors), passed through replaceLinefeeds(),
 * and the read position is then reset to the first byte.
 *
 * @param string $data     Data to parse. InputStream objects are still
 *                         accepted but trigger a deprecation notice and are
 *                         cast to string.
 * @param string $encoding the encoding to use for the data
 *
 * @throws Exception if the given data cannot be encoded to UTF-8
 */
public function __construct($data, $encoding = 'UTF-8')
{
    if ($data instanceof InputStream) {
        // Raised with @ so the notice is suppressed under default error reporting.
        @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
        $data = (string) $data;
    }

    $utf8 = UTF8Utils::convertToUTF8($data, $encoding);

    // The result of this check is only recorded; it is questionable whether
    // it belongs here, since most of these checks also happen during parsing.
    $this->errors = UTF8Utils::checkForIllegalCodepoints($utf8);

    $normalized = $this->replaceLinefeeds($utf8);

    $this->data = $normalized;
    $this->EOF = strlen($normalized);
    $this->char = 0;
}
|
64 | |||
/**
 * Check whether the upcoming bytes match the given sequence.
 *
 * Compares the bytes starting at the current position against $sequence
 * and reports whether they are equal. The read position is not moved, so
 * the caller still has to consume the sequence itself, even when this
 * returns true.
 *
 * Example: $this->scanner->sequenceMatches('</script>') tells whether the
 * input is positioned at the start of a '</script>' string.
 *
 * @param string $sequence      the byte sequence to look for
 * @param bool   $caseSensitive when false, the comparison ignores ASCII case
 *
 * @return bool true if the input at the current position equals $sequence
 */
public function sequenceMatches($sequence, $caseSensitive = true)
{
    $window = substr($this->data, $this->char, strlen($sequence));

    if ($caseSensitive) {
        return $window === $sequence;
    }

    return 0 === strcasecmp($window, $sequence);
}
||
89 | |||
/**
 * Report where the scanner currently is.
 *
 * @return int the current integer byte position within the data
 */
public function position()
{
    $offset = $this->char;

    return $offset;
}
||
99 | |||
/**
 * Take a peek at the byte following the current one.
 *
 * The read position is not moved.
 *
 * @return string|false the byte after the current position, or false when
 *                      there is no such byte (the current byte is the last
 *                      one, or the scanner is already at the end)
 */
public function peek()
{
    $ahead = $this->char + 1;

    // Strictly less than: $EOF is the length of $data, so offset $EOF is
    // already one past the last valid byte and must not be read.
    if ($ahead < $this->EOF) {
        return $this->data[$ahead];
    }

    return false;
}
||
113 | |||
/**
 * Advance to the next byte and return it.
 *
 * Note: the pointer is always advanced, even when that moves it to or
 * past the end of the data.
 *
 * @return string|false the byte at the new position, or false if the new
 *                      position is at or beyond the end of the data
 */
public function next()
{
    $this->char = $this->char + 1;

    return $this->char < $this->EOF ? $this->data[$this->char] : false;
}
||
131 | |||
/**
 * Return the byte at the current position.
 *
 * Note: this does not advance the pointer.
 *
 * @return string|false the current byte, or false when the position is at
 *                      or beyond the end of the data
 */
public function current()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
||
147 | |||
/**
 * Silently skip over N bytes.
 *
 * The pointer is moved forward by $count without any bounds check, so it
 * may end up beyond the end of the data.
 *
 * @param int $count number of bytes to skip
 */
public function consume($count = 1)
{
    $this->char = $this->char + $count;
}
|
157 | |||
/**
 * Unconsume some of the data by moving the pointer backwards.
 *
 * If stepping back by $howMany would move the pointer before the start of
 * the data, the request is ignored and the pointer stays where it is (it
 * is not clamped to zero).
 *
 * @param int $howMany The number of bytes to move the pointer back
 */
public function unconsume($howMany = 1)
{
    $target = $this->char - $howMany;

    if ($target < 0) {
        return;
    }

    $this->char = $target;
}
|
170 | |||
/**
 * Read the run of hexadecimal characters at the current position.
 *
 * Note: besides returning the characters, this moves the pointer in the
 * data past them.
 *
 * @return string|false the run of hex characters; false if the scanner is
 *                      already at the end of the data
 */
public function getHex()
{
    $hexMask = static::CHARS_HEX;

    return $this->doCharsWhile($hexMask);
}
||
183 | |||
/**
 * Read the run of ASCII alphabetic characters at the current position.
 *
 * Note: besides returning the characters, this moves the pointer in the
 * data past them.
 *
 * @return string|false the run of ASCII alpha characters; false if the
 *                      scanner is already at the end of the data
 */
public function getAsciiAlpha()
{
    $alphaMask = static::CHARS_ALPHA;

    return $this->doCharsWhile($alphaMask);
}
||
196 | |||
/**
 * Read the run of ASCII letters and digits at the current position.
 *
 * Note: besides returning the characters, this moves the pointer in the
 * data past them.
 *
 * @return string|false the run of ASCII alphanumeric characters; false if
 *                      the scanner is already at the end of the data
 */
public function getAsciiAlphaNum()
{
    $alnumMask = static::CHARS_ALNUM;

    return $this->doCharsWhile($alnumMask);
}
||
209 | |||
/**
 * Read the run of decimal digits at the current position.
 *
 * Note: besides returning the characters, this moves the pointer in the
 * data past them.
 *
 * @return string|false the run of digits; false if the scanner is already
 *                      at the end of the data
 */
public function getNumeric()
{
    $digitMask = '0123456789';

    return $this->doCharsWhile($digitMask);
}
||
222 | |||
/**
 * Consume whitespace.
 *
 * Whitespace in HTML5 is: formfeed, tab, newline, space. The pointer is
 * moved past every such byte found at the current position.
 *
 * @return int|false the number of whitespace bytes skipped (possibly 0),
 *                   or false when the scanner is already at the end of
 *                   the data
 */
public function whitespace()
{
    if ($this->char < $this->EOF) {
        $skipped = strspn($this->data, "\n\t\f ", $this->char);
        $this->char += $skipped;

        return $skipped;
    }

    return false;
}
||
242 | |||
/**
 * Returns the line that is currently being consumed.
 *
 * Lines are numbered from 1. The result is one more than the number of
 * newline bytes found before the current position (the position is capped
 * at the end of the data).
 *
 * @return int the current line number
 */
public function currentLine()
{
    // Nothing read yet, or nothing to read: still on the first line.
    if (0 === $this->char || empty($this->EOF)) {
        return 1;
    }

    $consumed = min($this->char, $this->EOF);

    return 1 + substr_count($this->data, "\n", 0, $consumed);
}
||
258 | |||
/**
 * Read chars until something in the mask is encountered.
 *
 * Public entry point that forwards to doCharsUntil() with no maximum
 * length; the result is whatever that method returns.
 *
 * @param string $mask the bytes that stop the read
 *
 * @return mixed
 */
public function charsUntil($mask)
{
    $result = $this->doCharsUntil($mask);

    return $result;
}
||
270 | |||
/**
 * Read chars as long as they are part of the mask.
 *
 * Public entry point that forwards to doCharsWhile() with no maximum
 * length.
 *
 * @param string $mask the bytes that are allowed to be read
 *
 * @return string|false the matched run of characters; false if the
 *                      scanner is already at the end of the data
 */
public function charsWhile($mask)
{
    $result = $this->doCharsWhile($mask);

    return $result;
}
||
282 | |||
/**
 * Returns the current column of the current line that the tokenizer is at.
 *
 * The column is the number of characters (as counted by
 * UTF8Utils::countChars()) between the last newline found before the
 * current byte position and that position; on the first line it is
 * counted from the start of the data. Position 0 is always column 0.
 *
 * @return int the column number
 */
public function columnOffset()
{
    // Short circuit for the first char.
    if (0 === $this->char) {
        return 0;
    }

    // strrpos() needs an offset relative to the end of the string to search
    // backwards. While the pointer is inside the data this value is negative
    // and limits the search to newlines at or before byte ($this->char - 1).
    // No extra +1 is added because strrpos() already includes the byte at
    // the final offset.
    $offsetFromEnd = $this->char - 1 - strlen($this->data);
    $newlineAt = strrpos($this->data, "\n", $offsetFromEnd);

    if (false === $newlineAt) {
        // No newline found before the pointer: measure from the start of
        // the data up to the current byte.
        $segment = substr($this->data, 0, $this->char);
    } else {
        // Measure the bytes that follow that newline, up to the byte just
        // before the pointer.
        $segment = substr($this->data, $newlineAt + 1, $this->char - 1 - $newlineAt);
    }

    return UTF8Utils::countChars($segment);
}
||
316 | |||
/**
 * Get all characters until EOF.
 *
 * This consumes everything from the current position to the end of the
 * data and leaves the pointer at EOF.
 *
 * @return string the remaining data, or an empty string when the scanner
 *                is already at (or past) the end
 */
public function remainingChars()
{
    if ($this->char >= $this->EOF) {
        return '';
    }

    $rest = substr($this->data, $this->char);
    $this->char = $this->EOF;

    return $rest;
}
||
335 | |||
/**
 * Normalise line endings (and NUL bytes) according to the spec.
 *
 * U+000D CARRIAGE RETURN (CR) and U+000A LINE FEED (LF) are treated
 * specially: a CR followed by LF is removed, and a CR not followed by LF
 * is converted to LF. Newlines are therefore always represented by LF and
 * no CR ever reaches the tokenization stage. In the same pass every NUL
 * byte is replaced by the UTF-8 encoding of U+FFFD.
 *
 * @param string $data the input to normalise
 *
 * @return string the normalised input
 */
private function replaceLinefeeds($data)
{
    // strtr() with an array always tries the longest key first, so "\r\n"
    // is collapsed as a pair before a lone "\r" can match.
    return strtr($data, array(
        "\0" => "\xEF\xBF\xBD",
        "\r\n" => "\n",
        "\r" => "\n",
    ));
}
||
360 | |||
361 | /** |
||
362 | * Read to a particular match (or until $max bytes are consumed). |
||
363 | * |
||
364 | * This operates on byte sequences, not characters. |
||
365 | * |
||
366 | * Matches as far as possible until we reach a certain set of bytes |
||
367 | * and returns the matched substring. |
||
368 | * |
||
369 | * @param string $bytes Bytes to match |
||
370 | * @param int $max Maximum number of bytes to scan |
||
371 | * |
||
372 | * @return mixed Index or false if no match is found. You should use strong |
||
373 | * equality when checking the result, since index could be 0. |
||
374 | */ |
||
375 | 116 | private function doCharsUntil($bytes, $max = null) |
|
392 | |||
393 | /** |
||
394 | * Returns the string so long as $bytes matches. |
||
395 | * |
||
396 | * Matches as far as possible with a certain set of bytes |
||
397 | * and returns the matched substring. |
||
398 | * |
||
399 | * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the |
||
400 | * current char, the pointer advances and the char is part of the |
||
401 | * substring. |
||
402 | * @param int $max The max number of chars to read |
||
403 | * |
||
404 | * @return string |
||
405 | */ |
||
406 | 129 | private function doCharsWhile($bytes, $max = null) |
|
407 | { |
||
408 | 129 | if ($this->char >= $this->EOF) { |
|
409 | return false; |
||
410 | } |
||
411 | |||
423 | } |
||
424 |