1 | <?php |
||
2 | /** |
||
3 | * @author Niels A.D. |
||
4 | * @author Todd Burry <[email protected]> |
||
5 | * @copyright 2010 Niels A.D., 2014 Todd Burry |
||
6 | * @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1 |
||
7 | * @package pQuery |
||
8 | */ |
||
9 | |||
10 | namespace pQuery; |
||
11 | |||
/**
 * Parses a HTML document.
 *
 * The parser walks the document tag by tag and records what it found in
 * {@link $status}; subclasses turn that information into an actual tree by
 * overriding the parse_* callbacks. Functionality can also be extended by
 * adjusting {@link $tag_map}.
 *
 * The document may contain small errors; the parser will try to recover and
 * resume parsing.
 */
class HtmlParserBase extends TokenizerBase {

    /**
     * Tag open token, used for "<"
     */
    const TOK_TAG_OPEN = 100;
    /**
     * Tag close token, used for ">"
     */
    const TOK_TAG_CLOSE = 101;
    /**
     * Backslash token, used for "\"
     *
     * This used to be 104, the same value as {@link TOK_STRING}, which made a
     * backslash indistinguishable from a quoted string wherever tokens are
     * compared (e.g. after "=" in {@link parse_attributes()}).
     * NOTE(review): 102 is unused within this class; confirm it does not clash
     * with a token declared by TokenizerBase.
     */
    const TOK_SLASH_BACKWARD = 102;
    /**
     * Forward slash token, used for "/"
     */
    const TOK_SLASH_FORWARD = 103;
    /**
     * String token, used for attribute values (" and ')
     */
    const TOK_STRING = 104;
    /**
     * Equals token, used for "="
     */
    const TOK_EQUALS = 105;

    /**
     * Sets HTML identifiers, tags/attributes are considered identifiers
     * @see TokenizerBase::$identifiers
     * @access private
     */
    public $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890:-_!?%';

    /**
     * Status of the parser.
     *
     * Keys written by this class: last_pos, text, comment, dtd, cdata,
     * tag_condition, attributes, self_close, closing_tag and tag_name.
     * @var array
     */
    public $status = array();

    /**
     * Map characters to match their tokens.
     *
     * Integer values are token ids, string values are names of methods that
     * produce the token (see {@link parse_string()}).
     * @see TokenizerBase::$custom_char_map
     * @access private
     */
    public $custom_char_map = array(
        '<' => self::TOK_TAG_OPEN,
        '>' => self::TOK_TAG_CLOSE,
        "'" => 'parse_string',
        '"' => 'parse_string',
        '/' => self::TOK_SLASH_FORWARD,
        '\\' => self::TOK_SLASH_BACKWARD,
        '=' => self::TOK_EQUALS
    );

    /**
     * Class constructor; parses the document immediately.
     * @param string $doc Document to be parsed
     * @param int $pos Position to start parsing
     */
    public function __construct($doc = '', $pos = 0) {
        parent::__construct($doc, $pos);
        $this->parse_all();
    }

    #php4 PHP4 class constructor compatibility
    #function HtmlParserBase($doc = '', $pos = 0) {return $this->__construct($doc, $pos);}
    #php4e

    /**
     * Callback functions for certain tags
     * @var array (TAG_NAME => FUNCTION_NAME)
     * @internal Function should be a method in the class
     * @internal Tagname should be lowercase and is everything after <, e.g. "?php" or "!doctype"
     * @access private
     */
    public $tag_map = array(
        '!doctype' => 'parse_doctype',
        '?' => 'parse_php',
        '?php' => 'parse_php',
        '%' => 'parse_asp',
        'style' => 'parse_style',
        'script' => 'parse_script'
    );

    /**
     * Returns the part of the document that lies strictly between two positions.
     * @param int $after Position just before the first wanted character
     * @param int $before Position just after the last wanted character
     * @return string Empty string if there is nothing in between
     */
    private function text_between($after, $before) {
        $len = $before - 1 - $after;
        return (($len > 0) ? substr($this->doc, $after + 1, $len) : '');
    }

    /**
     * Parse a HTML string (attributes)
     * @internal Gets called with ' and "
     * @return int Always {@link TOK_STRING}
     */
    protected function parse_string() {
        // The character at the current position is the opening quote; look for
        // its counterpart. Anything but TOK_UNKNOWN means it was not found.
        if ($this->next_pos($this->doc[$this->pos], false) !== self::TOK_UNKNOWN) {
            --$this->pos;
        }
        return self::TOK_STRING;
    }

    /**
     * Parse text between tags
     * @internal Gets called between tags, uses {@link $status}[last_pos]
     * @internal Stores text in {@link $status}[text]
     */
    public function parse_text() {
        $this->status['text'] = $this->text_between($this->status['last_pos'], $this->pos);
    }

    /**
     * Parse comment tags
     * @internal Gets called with HTML comments ("<!--")
     * @internal Stores text in {@link $status}[comment]
     * @return bool Always true; an unterminated comment is accepted as is
     */
    public function parse_comment() {
        $this->pos += 3;
        $terminated = ($this->next_pos('-->', false) === self::TOK_UNKNOWN);

        // The token text has to be read before the position is adjusted below.
        $this->status['comment'] = $this->getTokenString(1, -1);
        if ($terminated) {
            $this->pos += 2;
        } else {
            --$this->pos;
        }
        $this->status['last_pos'] = $this->pos;

        return true;
    }

    /**
     * Parse doctype tag
     * @internal Gets called with doctype ("<!doctype")
     * @internal Stores text in {@link $status}[dtd]
     * @return bool False (with an error added) if the doctype is not terminated
     */
    public function parse_doctype() {
        $start = $this->pos;
        if ($this->next_search('[>', false) !== self::TOK_UNKNOWN) {
            $this->addError('Invalid doctype');
            return false;
        }

        // A "[" starts a bracketed section, which must be closed by "]" and
        // then followed by the ">" that ends the doctype.
        if ($this->doc[$this->pos] === '[') {
            if (($this->next_pos(']', false) !== self::TOK_UNKNOWN) || ($this->next_pos('>', false) !== self::TOK_UNKNOWN)) {
                $this->addError('Invalid doctype');
                return false;
            }
        }

        $this->token_start = $start;
        $this->status['dtd'] = $this->getTokenString(2, -1);
        $this->status['last_pos'] = $this->pos;
        return true;
    }

    /**
     * Parse cdata tag
     * @internal Gets called with cdata ("<![cdata")
     * @internal Stores text in {@link $status}[cdata]
     * @return bool False (with an error added) if "]]>" is missing
     */
    public function parse_cdata() {
        if ($this->next_pos(']]>', false) !== self::TOK_UNKNOWN) {
            $this->addError('Invalid cdata tag');
            return false;
        }

        $this->status['cdata'] = $this->getTokenString(9, -1);
        $this->status['last_pos'] = $this->pos + 2;
        return true;
    }

    /**
     * Shared implementation of {@link parse_php()} and {@link parse_asp()}.
     * @param string $terminator Two character end marker ("?>" or "%>")
     * @internal Stores the code in {@link $status}[text]
     * @return bool Always true; a missing terminator means "until end of file"
     */
    private function parse_code_block($terminator) {
        $start = $this->pos;
        if ($this->next_pos($terminator, false) !== self::TOK_UNKNOWN) {
            $this->pos -= 2; //End of file
        }

        $this->status['text'] = $this->text_between($start, $this->pos);
        $this->status['last_pos'] = ++$this->pos;
        return true;
    }

    /**
     * Parse php tags
     * @internal Gets called with php tags ("<?php")
     * @internal Stores the code in {@link $status}[text]
     * @return bool
     */
    public function parse_php() {
        return $this->parse_code_block('?>');
    }

    /**
     * Parse asp tags
     * @internal Gets called with asp tags ("<%")
     * @internal Stores the code in {@link $status}[text]
     * @return bool
     */
    public function parse_asp() {
        return $this->parse_code_block('%>');
    }

    /**
     * Shared implementation of {@link parse_style()} and {@link parse_script()}:
     * parses the attributes and takes everything up to the end tag as text.
     * @param string $end_tag Literal end tag to search for, e.g. "</style>"
     * @param string $label Tag name used in the error message
     * @internal Stores the content in {@link $status}[text]
     * @return bool False (with an error added) if the element is malformed
     */
    private function parse_raw_text_element($end_tag, $label) {
        $start = null;
        $complete = $this->parse_attributes()
            && ($this->token === self::TOK_TAG_CLOSE)
            && ($start = $this->pos)
            && ($this->next_pos($end_tag, false) === self::TOK_UNKNOWN);

        if (!$complete) {
            $this->addError('No end for '.$label.' tag found');
            return false;
        }

        $this->status['text'] = $this->text_between($start, $this->pos);

        // Move to the last character of the end tag.
        $this->pos += strlen($end_tag) - 1;
        $this->status['last_pos'] = $this->pos;
        return true;
    }

    /**
     * Parse style tags
     * @internal Gets called with style tags ("<style>")
     * @return bool
     */
    public function parse_style() {
        return $this->parse_raw_text_element('</style>', 'style');
    }

    /**
     * Parse script tags
     * @internal Gets called with script tags ("<script>")
     * @return bool
     */
    public function parse_script() {
        return $this->parse_raw_text_element('</script>', 'script');
    }

    /**
     * Parse conditional tags (+ all conditional tags inside)
     * @internal Gets called with IE conditionals ("<![if]" and "<!--[if]")
     * @internal Stores condition in {@link $status}[tag_condition]
     * @internal For the comment form the content is stored in {@link $status}[text]
     * @return bool
     */
    public function parse_conditional() {
        if ($this->status['closing_tag']) {
            // "<![endif]": move onto the "]"
            $this->pos += 8;
        } else {
            // Move to the first character after "<!--[" or "<!["
            $this->pos += (($this->status['comment']) ? 5 : 3);
            if ($this->next_pos(']', false) !== self::TOK_UNKNOWN) {
                $this->addError('"]" not found in conditional tag');
                return false;
            }
            $this->status['tag_condition'] = $this->getTokenString(0, -1);
        }

        if ($this->next_no_whitespace() !== self::TOK_TAG_CLOSE) {
            $this->addError('No ">" tag found 2 for conditional tag');
            return false;
        }

        if ($this->status['comment']) {
            $this->status['last_pos'] = $this->pos;
            if ($this->next_pos('-->', false) !== self::TOK_UNKNOWN) {
                $this->addError('No ending tag found for conditional tag');
                $this->pos = $this->size - 1;

                $this->status['text'] = $this->text_between($this->status['last_pos'], $this->pos);
            } else {
                // Leave out the 9 characters in front of "-->"
                // (presumably the "<![endif]" marker).
                $this->status['text'] = $this->text_between($this->status['last_pos'], $this->pos - 9);
                $this->pos += 2;
            }
        }

        $this->status['last_pos'] = $this->pos;
        return true;
    }

    /**
     * Parse attributes (names + value)
     *
     * An attribute without a value gets its own name as value.
     * @internal Stores attributes in {@link $status}[attributes] (array(ATTR => VAL))
     * @return bool False (with an error added) on an empty unquoted value
     */
    public function parse_attributes() {
        $this->status['attributes'] = array();
        $stop = null;

        while ($this->next_no_whitespace() === self::TOK_IDENTIFIER) {
            $attr = $this->getTokenString();
            if (($attr === '?') || ($attr === '%')) {
                //Probably closing tags
                break;
            }

            if ($this->next_no_whitespace() === self::TOK_EQUALS) {
                if ($this->next_no_whitespace() === self::TOK_STRING) {
                    // Quoted value, strip the quotes
                    $val = $this->getTokenString(1, -1);
                } else {
                    // Unquoted value: runs until whitespace, "<" or ">"
                    $this->token_start = $this->pos;
                    if ($stop === null) {
                        $stop = $this->whitespace;
                        $stop['<'] = true;
                        $stop['>'] = true;
                    }

                    while ((++$this->pos < $this->size) && (!isset($stop[$this->doc[$this->pos]]))) {
                        // Do nothing.
                    }
                    --$this->pos;

                    $val = $this->getTokenString();

                    if (trim($val) === '') {
                        $this->addError('Invalid attribute value');
                        return false;
                    }
                }
            } else {
                $val = $attr;
                // Step back so the token that was just read is seen again by
                // the next call to next_no_whitespace().
                $this->pos = (($this->token_start) ? $this->token_start : $this->pos) - 1;
            }

            $this->status['attributes'][$attr] = $val;
        }

        return true;
    }

    /**
     * Default callback for tags
     * @internal Gets called after the tagname (<html*ENTERS_HERE* attribute="value">)
     * @internal Sets {@link $status}[self_close] for "/>", "?>" and "%>" endings
     * @return bool
     */
    public function parse_tag_default() {
        if ($this->status['closing_tag']) {
            $this->status['attributes'] = array();
            $this->next_no_whitespace();
        } else {
            if (!$this->parse_attributes()) {
                return false;
            }
        }

        if ($this->token !== self::TOK_TAG_CLOSE) {
            if ($this->token === self::TOK_SLASH_FORWARD) {
                $this->status['self_close'] = true;
                $this->next();
            } else {
                // "<?xml ... ?>" and "<% ... %>" end with the character they
                // started with. The position may be at the end of the
                // document, so guard the reads.
                $first = $this->status['tag_name'][0];
                $current = (($this->pos < $this->size) ? $this->doc[$this->pos] : '');

                if ((($first === '?') || ($first === '%')) && ($current === $first)) {
                    $this->status['self_close'] = true;
                    $this->pos++;

                    $char = (($this->pos < $this->size) ? $this->doc[$this->pos] : null);
                    if (($char !== null) && isset($this->char_map[$char]) && (!is_string($this->char_map[$char]))) {
                        $this->token = $this->char_map[$char];
                    } else {
                        $this->token = self::TOK_UNKNOWN;
                    }
                }
            }
        }

        if ($this->token !== self::TOK_TAG_CLOSE) {
            $this->addError('Expected ">", but found "'.$this->getTokenString().'"');
            // Try to recover by skipping to the next ">"
            if ($this->next_pos('>', false) !== self::TOK_UNKNOWN) {
                $this->addError('No ">" tag found for "'.$this->status['tag_name'].'" tag');
                return false;
            }
        }

        return true;
    }

    /**
     * Parse tag
     * @internal Gets called after opening tag (<*ENTERS_HERE*html attribute="value">)
     * @internal Stores information about the tag in {@link $status} (comment, closing_tag, tag_name)
     * @return bool
     */
    public function parse_tag() {
        $start = $this->pos;
        $this->status['self_close'] = false;
        $this->parse_text();

        $next = (($this->pos + 1) < $this->size) ? $this->doc[$this->pos + 1] : '';
        if ($next === '!') {
            $this->status['closing_tag'] = false;

            // substr() is used for the look-aheads below so that a document
            // which ends right after "<!" or "<!--" does not trigger an
            // out-of-range string offset.
            if (substr($this->doc, $this->pos + 2, 2) === '--') {
                $this->status['comment'] = true;

                if ((substr($this->doc, $this->pos + 4, 1) === '[') && (strcasecmp(substr($this->doc, $this->pos + 5, 2), 'if') === 0)) {
                    return $this->parse_conditional();
                } else {
                    return $this->parse_comment();
                }
            } else {
                $this->status['comment'] = false;

                if (substr($this->doc, $this->pos + 2, 1) === '[') {
                    if (strcasecmp(substr($this->doc, $this->pos + 3, 2), 'if') === 0) {
                        return $this->parse_conditional();
                    } elseif (strcasecmp(substr($this->doc, $this->pos + 3, 5), 'endif') === 0) {
                        $this->status['closing_tag'] = true;
                        return $this->parse_conditional();
                    } elseif (strcasecmp(substr($this->doc, $this->pos + 3, 5), 'cdata') === 0) {
                        return $this->parse_cdata();
                    }
                }
                // Anything else (e.g. "<!doctype") is handled as a normal tag
            }
        } elseif ($next === '/') {
            $this->status['closing_tag'] = true;
            ++$this->pos;
        } else {
            $this->status['closing_tag'] = false;
        }

        if ($this->next() !== self::TOK_IDENTIFIER) {
            $this->addError('Tagname expected');
            // Rewind last_pos so the "<" ends up in the next text segment
            $this->status['last_pos'] = $start - 1;
            return true;
        }

        $tag = $this->getTokenString();
        $this->status['tag_name'] = $tag;
        $tag = strtolower($tag);

        if (isset($this->tag_map[$tag])) {
            $res = $this->{$this->tag_map[$tag]}();
        } else {
            $res = $this->parse_tag_default();
        }

        $this->status['last_pos'] = $this->pos;
        return $res;
    }

    /**
     * Parse full document
     * @internal Resets the error list and {@link $status}[last_pos]
     * @return bool False as soon as one tag could not be parsed
     */
    public function parse_all() {
        $this->errors = array();
        $this->status['last_pos'] = -1;

        if (($this->token === self::TOK_TAG_OPEN) || ($this->next_pos('<', false) === self::TOK_UNKNOWN)) {
            do {
                if (!$this->parse_tag()) {
                    return false;
                }
            } while ($this->next_pos('<') !== self::TOK_NULL);
        }

        // Whatever follows the last tag is text
        $this->pos = $this->size;
        $this->parse_text();

        return true;
    }
}
||
480 | |||
/**
 * Parses a HTML document into a HTML DOM.
 *
 * Every parse_* callback first lets {@link HtmlParserBase} do the actual
 * parsing and then adds the result as a node to the element that is currently
 * open (the last entry of {@link $hierarchy}).
 */
class HtmlParser extends HtmlParserBase {

    /**
     * Root object
     * @internal If string, then it will create a new instance as root
     * @var DomNode
     */
    public $root = 'pQuery\\DomNode';

    /**
     * Current parsing hierarchy
     * @internal Root is always at index 0, current tag is at the end of the array
     * @var array
     * @access private
     */
    public $hierarchy = array();

    /**
     * Tags that don't need closing tags
     * @var array
     * @access private
     */
    public $tags_selfclose = array(
        'area' => true,
        'base' => true,
        'basefont' => true,
        'br' => true,
        'col' => true,
        'command' => true,
        'embed' => true,
        'frame' => true,
        'hr' => true,
        'img' => true,
        'input' => true,
        'ins' => true,
        'keygen' => true,
        'link' => true,
        'meta' => true,
        'param' => true,
        'source' => true,
        'track' => true,
        'wbr' => true
    );

    /**
     * Class constructor
     *
     * The root has to be in place before the parent constructor runs, because
     * that constructor already parses the document.
     * @param string $doc Document to be tokenized
     * @param int $pos Position to start parsing
     * @param DomNode $root Root node, null to auto create
     */
    public function __construct($doc = '', $pos = 0, $root = null) {
        if ($root === null) {
            // $this->root still holds the class name at this point
            $root = new $this->root('~root~', null);
        }
        $this->root =& $root;

        parent::__construct($doc, $pos);
    }

    #php4 PHP4 class constructor compatibility
    #function HtmlParser($doc = '', $pos = 0, $root = null) {return $this->__construct($doc, $pos, $root);}
    #php4e

    /**
     * Class magic invoke method, performs {@link select()}
     * @return array
     * @access private
     */
    public function __invoke($query = '*') {
        return $this->select($query);
    }

    /**
     * Class magic toString method, returns the inner text of the root node
     * @return string
     * @access private
     */
    public function __toString() {
        return $this->root->getInnerText();
    }

    /**
     * Performs a css select query on the root node
     * @see DomNode::select()
     * @return array
     */
    public function select($query = '*', $index = false, $recursive = true, $check_self = false) {
        return $this->root->select($query, $index, $recursive, $check_self);
    }

    /**
     * Returns the node new content has to be added to, i.e. the last entry
     * of {@link $hierarchy}.
     * @return DomNode
     */
    private function current_node() {
        return $this->hierarchy[count($this->hierarchy) - 1];
    }

    /**
     * Updates the current hierarchy status and checks for
     * correct opening/closing of tags
     * @param bool $self_close Is current tag self closing? Null to use {@link tags_selfclose}
     * @internal This is where most of the nodes get added
     * @access private
     */
    protected function parse_hierarchy($self_close = null) {
        if ($self_close === null) {
            $this->status['self_close'] = ($self_close = isset($this->tags_selfclose[strtolower($this->status['tag_name'])]));
        }

        if ($self_close) {
            if ($this->status['closing_tag']) {
                // Closing tag for a self closing element (e.g. "</br>"): look
                // for the last sibling with that tag and move everything that
                // came after it into it.
                $c = $this->current_node()->children;
                $found = false;
                for ($count = count($c), $i = $count - 1; $i >= 0; $i--) {
                    if (strcasecmp($c[$i]->tag, $this->status['tag_name']) === 0) {
                        // $c is a snapshot of the children, so every following
                        // sibling is addressed by its own index. (This used to
                        // re-parent $c[$i + 1] on every iteration.)
                        // NOTE(review): assumes DomNode::$children is a plain
                        // array, i.e. copied on assignment - confirm.
                        for ($ii = $i + 1; $ii < $count; $ii++) {
                            $index = null; //Needs to be passed by ref
                            $c[$ii]->changeParent($c[$i], $index);
                        }
                        $c[$i]->self_close = false;

                        $found = true;
                        break;
                    }
                }

                if (!$found) {
                    $this->addError('Closing tag "'.$this->status['tag_name'].'" which is not open');
                }

            } elseif ($this->status['tag_name'][0] === '?') {
                $index = null; //Needs to be passed by ref
                $this->current_node()->addXML($this->status['tag_name'], '', $this->status['attributes'], $index);
            } elseif ($this->status['tag_name'][0] === '%') {
                $index = null; //Needs to be passed by ref
                $this->current_node()->addASP($this->status['tag_name'], '', $this->status['attributes'], $index);
            } else {
                $index = null; //Needs to be passed by ref
                $this->current_node()->addChild($this->status, $index);
            }
        } elseif ($this->status['closing_tag']) {
            // Walk up the hierarchy to the matching open tag and close
            // everything on the way there.
            $found = false;
            for ($count = count($this->hierarchy), $i = $count - 1; $i >= 0; $i--) {
                if (strcasecmp($this->hierarchy[$i]->tag, $this->status['tag_name']) === 0) {

                    for ($ii = ($count - $i - 1); $ii >= 0; $ii--) {
                        $e = array_pop($this->hierarchy);
                        if ($ii > 0) {
                            $this->addError('Closing tag "'.$this->status['tag_name'].'" while "'.$e->tag.'" is not closed yet');
                        }
                    }

                    $found = true;
                    break;
                }
            }

            if (!$found) {
                $this->addError('Closing tag "'.$this->status['tag_name'].'" which is not open');
            }

        } else {
            // Opening tag: the new node becomes the current node
            $index = null; //Needs to be passed by ref
            $this->hierarchy[] = $this->current_node()->addChild($this->status, $index);
        }
    }

    /**
     * Parse cdata tag and add it to the current node
     * @return bool
     */
    public function parse_cdata() {
        if (!parent::parse_cdata()) {return false;}

        $index = null; //Needs to be passed by ref
        $this->current_node()->addCDATA($this->status['cdata'], $index);
        return true;
    }

    /**
     * Parse comment tag and add it to the current node
     * @return bool
     */
    public function parse_comment() {
        if (!parent::parse_comment()) {return false;}

        $index = null; //Needs to be passed by ref
        $this->current_node()->addComment($this->status['comment'], $index);
        return true;
    }

    /**
     * Parse conditional tag and add it to the current node.
     *
     * The comment form ("<!--[if]") is added as one node with its content as
     * text; the plain form ("<![if]") is opened and closed like a normal tag.
     * @return bool
     */
    public function parse_conditional() {
        if (!parent::parse_conditional()) {return false;}

        if ($this->status['comment']) {
            $index = null; //Needs to be passed by ref
            $e = $this->current_node()->addConditional($this->status['tag_condition'], true, $index);
            if ($this->status['text'] !== '') {
                $index = null; //Needs to be passed by ref
                $e->addText($this->status['text'], $index);
            }
        } else {
            if ($this->status['closing_tag']) {
                // NOTE(review): parse_hierarchy() matches on
                // $this->status['tag_name'], which is not set for conditional
                // tags and therefore still holds the name of the previously
                // parsed tag (if any) - verify this is intended.
                $this->parse_hierarchy(false);
            } else {
                $index = null; //Needs to be passed by ref
                $this->hierarchy[] = $this->current_node()->addConditional($this->status['tag_condition'], false, $index);
            }
        }

        return true;
    }

    /**
     * Parse doctype tag and add it to the current node
     * @return bool
     */
    public function parse_doctype() {
        if (!parent::parse_doctype()) {return false;}

        $index = null; //Needs to be passed by ref
        $this->current_node()->addDoctype($this->status['dtd'], $index);
        return true;
    }

    /**
     * Parse php tag and add it to the current node
     * @return bool
     */
    public function parse_php() {
        if (!parent::parse_php()) {return false;}

        // Third argument is the attribute list, the by-ref index comes last
        // (same argument order as the addXML() call in parse_hierarchy()).
        $index = null; //Needs to be passed by ref
        $this->current_node()->addXML('php', $this->status['text'], array(), $index);
        return true;
    }

    /**
     * Parse asp tag and add it to the current node
     * @return bool
     */
    public function parse_asp() {
        if (!parent::parse_asp()) {return false;}

        // Third argument is the attribute list, the by-ref index comes last
        // (same argument order as the addASP() call in parse_hierarchy()).
        $index = null; //Needs to be passed by ref
        $this->current_node()->addASP('', $this->status['text'], array(), $index);
        return true;
    }

    /**
     * Parse script tag and add it, including its content, to the current node
     * @return bool
     */
    public function parse_script() {
        if (!parent::parse_script()) {return false;}

        $index = null; //Needs to be passed by ref
        $e = $this->current_node()->addChild($this->status, $index);
        if ($this->status['text'] !== '') {
            $index = null; //Needs to be passed by ref
            $e->addText($this->status['text'], $index);
        }
        return true;
    }

    /**
     * Parse style tag and add it, including its content, to the current node
     * @return bool
     */
    public function parse_style() {
        if (!parent::parse_style()) {return false;}

        $index = null; //Needs to be passed by ref
        $e = $this->current_node()->addChild($this->status, $index);
        if ($this->status['text'] !== '') {
            $index = null; //Needs to be passed by ref
            $e->addText($this->status['text'], $index);
        }
        return true;
    }

    /**
     * Default callback for tags, updates the hierarchy with the parsed tag
     * @return bool
     */
    public function parse_tag_default() {
        if (!parent::parse_tag_default()) {return false;}

        // Explicitly self closed ("/>") wins, otherwise let parse_hierarchy()
        // decide based on the tag name.
        $this->parse_hierarchy(($this->status['self_close']) ? true : null);
        return true;
    }

    /**
     * Parse text between tags and add it to the current node (if not empty)
     */
    public function parse_text() {
        parent::parse_text();
        if ($this->status['text'] !== '') {
            $index = null; //Needs to be passed by ref
            $this->current_node()->addText($this->status['text'], $index);
        }
    }

    /**
     * Parse full document
     * @return DomNode|bool Root node, or false if parsing failed
     */
    public function parse_all() {
        $this->hierarchy = array(&$this->root);
        return ((parent::parse_all()) ? $this->root : false);
    }
}
766 | |||
/**
 * HTML5 specific parser (adds support for omittable closing tags)
 */
class Html5Parser extends HtmlParser {

    /**
     * Tags with omittable closing tags
     *
     * Maps a tag to the set of tags it implicitly closes: with
     * array('tag2' => array('tag1' => true)), an opening tag2 closes tag1 if
     * tag1 is the element that is currently open. The inner arrays are looked
     * up with isset(), so the tag names have to be the keys.
     * @var array
     * @access private
     */
    public $tags_optional_close = array(
        //Current tag => Previous tag
        'li' => array('li' => true),
        'dt' => array('dt' => true, 'dd' => true),
        'dd' => array('dt' => true, 'dd' => true),
        'address' => array('p' => true),
        'article' => array('p' => true),
        'aside' => array('p' => true),
        'blockquote' => array('p' => true),
        'dir' => array('p' => true),
        'div' => array('p' => true),
        'dl' => array('p' => true),
        'fieldset' => array('p' => true),
        'footer' => array('p' => true),
        'form' => array('p' => true),
        'h1' => array('p' => true),
        'h2' => array('p' => true),
        'h3' => array('p' => true),
        'h4' => array('p' => true),
        'h5' => array('p' => true),
        'h6' => array('p' => true),
        'header' => array('p' => true),
        'hgroup' => array('p' => true),
        'hr' => array('p' => true),
        'menu' => array('p' => true),
        'nav' => array('p' => true),
        'ol' => array('p' => true),
        'p' => array('p' => true),
        'pre' => array('p' => true),
        'section' => array('p' => true),
        'table' => array('p' => true),
        'ul' => array('p' => true),
        'rt' => array('rt' => true, 'rp' => true),
        'rp' => array('rt' => true, 'rp' => true),
        'optgroup' => array('optgroup' => true, 'option' => true),
        // Was array('option'), i.e. array(0 => 'option'), which never matched
        'option' => array('option' => true),
        // "thead" was misspelled as "thread" in the next two entries
        'tbody' => array('thead' => true, 'tbody' => true, 'tfoot' => true),
        'tfoot' => array('thead' => true, 'tbody' => true),
        'tr' => array('tr' => true),
        'td' => array('td' => true, 'th' => true),
        'th' => array('td' => true, 'th' => true),
        'body' => array('head' => true)
    );

    /**
     * Updates the current hierarchy status; before doing so, implicitly
     * closes the currently open element if the new opening tag allows that
     * according to {@link $tags_optional_close}.
     * @param bool $self_close Is current tag self closing? Null to use {@link tags_selfclose}
     * @access private
     */
    protected function parse_hierarchy($self_close = null) {
        $tag_curr = strtolower($this->status['tag_name']);
        if ($self_close === null) {
            $this->status['self_close'] = ($self_close = isset($this->tags_selfclose[$tag_curr]));
        }

        // Only a regular opening tag can implicitly close its predecessor
        if (! ($self_close || $this->status['closing_tag'])) {
            $tag_prev = strtolower($this->hierarchy[count($this->hierarchy) - 1]->tag);
            if (isset($this->tags_optional_close[$tag_curr][$tag_prev])) {
                array_pop($this->hierarchy);
            }
        }

        return parent::parse_hierarchy($self_close);
    }
}
||
838 | |||
839 | ?> |