<?php
/**
 * @author Niels A.D.
 * @author Todd Burry <[email protected]>
 * @copyright 2010 Niels A.D., 2014 Todd Burry
 * @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1
 * @package pQuery
 */

namespace pQuery;

/**
 * Parses a HTML document
 *
 * Functionality can be extended by overriding functions or adjusting the tag map.
 * Document may contain small errors, the parser will try to recover and resume parsing.
 *
 * Parsing state is kept in {@link $status}; the parse_* callbacks communicate
 * through its keys (last_pos, text, comment, dtd, cdata, tag_condition,
 * attributes, tag_name, closing_tag, self_close).
 *
 * Throughout this class a result of TOK_UNKNOWN from next_pos()/next_search()
 * is treated as "needle found" and any other result as "not found / end of
 * document" (see the error messages in the individual callbacks).
 */
class HtmlParserBase extends TokenizerBase {

	/**
	 * Tag open token, used for "<"
	 */
	const TOK_TAG_OPEN = 100;
	/**
	 * Tag close token, used for ">"
	 */
	const TOK_TAG_CLOSE = 101;
	/**
	 * Forward slash token, used for "/"
	 */
	const TOK_SLASH_FORWARD = 103;
	/**
	 * Backslash token, used for "\"
	 *
	 * Uses the otherwise unused value 102. It previously shared the value 104
	 * with TOK_STRING, which made a backslash indistinguishable from a quoted
	 * string in parse_attributes().
	 */
	const TOK_SLASH_BACKWARD = 102;
	/**
	 * String token, used for attribute values (" and ')
	 */
	const TOK_STRING = 104;
	/**
	 * Equals token, used for "="
	 */
	const TOK_EQUALS = 105;

	/**
	 * Sets HTML identifiers, tags/attributes are considered identifiers
	 * @see TokenizerBase::$identifiers
	 * @access private
	 */
	var $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890:-_!?%';

	/**
	 * Status of the parser (tagname, closing tag, etc)
	 * @var array
	 */
	var $status = array();

	/**
	 * Map characters to match their tokens
	 *
	 * Values are either a token constant or the name of a method to call
	 * (the quote characters map to {@link parse_string()}).
	 * @see TokenizerBase::$custom_char_map
	 * @access private
	 */
	var $custom_char_map = array(
		'<' => self::TOK_TAG_OPEN,
		'>' => self::TOK_TAG_CLOSE,
		"'" => 'parse_string',
		'"' => 'parse_string',
		'/' => self::TOK_SLASH_FORWARD,
		'\\' => self::TOK_SLASH_BACKWARD,
		'=' => self::TOK_EQUALS
	);

	/**
	 * Class constructor, parses the whole document immediately
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 */
	function __construct($doc = '', $pos = 0) {
		parent::__construct($doc, $pos);
		$this->parse_all();
	}

	#php4 PHP4 class constructor compatibility
	#function HtmlParserBase($doc = '', $pos = 0) {return $this->__construct($doc, $pos);}
	#php4e

	/**
	 * Callback functions for certain tags
	 * @var array (TAG_NAME => FUNCTION_NAME)
	 * @internal Function should be a method in the class
	 * @internal Tagname should be lowercase and is everything after <, e.g. "?php" or "!doctype"
	 * @access private
	 */
	var $tag_map = array(
		'!doctype' => 'parse_doctype',
		'?' => 'parse_php',
		'?php' => 'parse_php',
		'%' => 'parse_asp',
		'style' => 'parse_style',
		'script' => 'parse_script'
	);

	/**
	 * Parse a HTML string (attributes)
	 *
	 * Searches for the next occurrence of the quote character at the current
	 * position; when that search does not report a match the position is
	 * moved back by one.
	 * @internal Gets called with ' and "
	 * @return int Always {@link TOK_STRING}
	 */
	protected function parse_string() {
		if ($this->next_pos($this->doc[$this->pos], false) !== self::TOK_UNKNOWN) {
			--$this->pos;
		}
		return self::TOK_STRING;
	}

	/**
	 * Parse text between tags
	 * @internal Gets called between tags, uses {@link $status}[last_pos]
	 * @internal Stores text in {@link $status}[text] (empty string if there is none)
	 */
	function parse_text() {
		// Text runs from the character after last_pos up to (not including) pos
		$len = $this->pos - 1 - $this->status['last_pos'];
		$this->status['text'] = (($len > 0) ? substr($this->doc, $this->status['last_pos'] + 1, $len) : '');
	}

	/**
	 * Parse comment tags
	 * @internal Gets called with HTML comments ("<!--")
	 * @internal Stores text in {@link $status}[comment]
	 * @return bool Always true; an unterminated comment is accepted as-is
	 */
	function parse_comment() {
		// Skip past "!--"
		$this->pos += 3;
		if ($this->next_pos('-->', false) !== self::TOK_UNKNOWN) {
			// No terminator found
			$this->status['comment'] = $this->getTokenString(1, -1);
			--$this->pos;
		} else {
			$this->status['comment'] = $this->getTokenString(1, -1);
			// Move to the closing ">" of "-->"
			$this->pos += 2;
		}
		$this->status['last_pos'] = $this->pos;

		return true;
	}

	/**
	 * Parse doctype tag
	 * @internal Gets called with doctype ("<!doctype")
	 * @internal Stores text in {@link $status}[dtd]
	 * @return bool False (with an error added) when the doctype is not terminated
	 */
	function parse_doctype() {
		$start = $this->pos;
		if ($this->next_search('[>', false) === self::TOK_UNKNOWN)  {
			if ($this->doc[$this->pos] === '[') {
				// Internal subset: needs a closing "]" followed by ">"
				if (($this->next_pos(']', false) !== self::TOK_UNKNOWN) || ($this->next_pos('>', false) !== self::TOK_UNKNOWN)) {
					$this->addError('Invalid doctype');
					return false;
				}
			}

			$this->token_start = $start;
			$this->status['dtd'] = $this->getTokenString(2, -1);
			$this->status['last_pos'] = $this->pos;
			return true;
		} else {
			$this->addError('Invalid doctype');
			return false;
		}
	}

	/**
	 * Parse cdata tag
	 * @internal Gets called with cdata ("<![cdata")
	 * @internal Stores text in {@link $status}[cdata]
	 * @return bool False (with an error added) when "]]>" is not found
	 */
	function parse_cdata() {
		if ($this->next_pos(']]>', false) === self::TOK_UNKNOWN) {
			$this->status['cdata'] = $this->getTokenString(9, -1);
			$this->status['last_pos'] = $this->pos + 2;
			return true;
		} else {
			$this->addError('Invalid cdata tag');
			return false;
		}
	}

	/**
	 * Parse php tags
	 * @internal Gets called with php tags ("<?php")
	 * @internal Stores the code between the tags in {@link $status}[text]
	 * @return bool Always true
	 */
	function parse_php() {
		$start = $this->pos;
		if ($this->next_pos('?>', false) !== self::TOK_UNKNOWN) {
			$this->pos -= 2; //End of file
		}

		$len = $this->pos - 1 - $start;
		$this->status['text'] = (($len > 0) ? substr($this->doc, $start + 1, $len) : '');
		$this->status['last_pos'] = ++$this->pos;
		return true;
	}

	/**
	 * Parse asp tags
	 * @internal Gets called with asp tags ("<%")
	 * @internal Stores the code between the tags in {@link $status}[text]
	 * @return bool Always true
	 */
	function parse_asp() {
		$start = $this->pos;
		if ($this->next_pos('%>', false) !== self::TOK_UNKNOWN) {
			$this->pos -= 2; //End of file
		}

		$len = $this->pos - 1 - $start;
		$this->status['text'] = (($len > 0) ? substr($this->doc, $start + 1, $len) : '');
		$this->status['last_pos'] = ++$this->pos;
		return true;
	}

	/**
	 * Parse style tags
	 * @internal Gets called with style tags ("<style>")
	 * @internal Stores the raw stylesheet text in {@link $status}[text]
	 * @return bool False (with an error added) when no "</style>" is found
	 */
	function parse_style() {
		if ($this->parse_attributes() && ($this->token === self::TOK_TAG_CLOSE) && ($start = $this->pos) && ($this->next_pos('</style>', false) === self::TOK_UNKNOWN)) {
			$len = $this->pos - 1 - $start;
			$this->status['text'] = (($len > 0) ? substr($this->doc, $start + 1, $len) : '');

			// Advance over the remainder of "</style>" (8 characters)
			$this->pos += 7;
			$this->status['last_pos'] = $this->pos;
			return true;
		} else {
			$this->addError('No end for style tag found');
			return false;
		}
	}

	/**
	 * Parse script tags
	 * @internal Gets called with script tags ("<script>")
	 * @internal Stores the raw script text in {@link $status}[text]
	 * @return bool False (with an error added) when no "</script>" is found
	 */
	function parse_script() {
		if ($this->parse_attributes() && ($this->token === self::TOK_TAG_CLOSE) && ($start = $this->pos) && ($this->next_pos('</script>', false) === self::TOK_UNKNOWN)) {
			$len = $this->pos - 1 - $start;
			$this->status['text'] = (($len > 0) ? substr($this->doc, $start + 1, $len) : '');

			// Advance over the remainder of "</script>" (9 characters)
			$this->pos += 8;
			$this->status['last_pos'] = $this->pos;
			return true;
		} else {
			$this->addError('No end for script tag found');
			return false;
		}
	}

	/**
	 * Parse conditional tags (+ all conditional tags inside)
	 * @internal Gets called with IE conditionals ("<![if]" and "<!--[if]")
	 * @internal Stores condition in {@link $status}[tag_condition] (opening tags only)
	 * @internal For comment-style conditionals the enclosed markup is stored in {@link $status}[text]
	 * @return bool False (with an error added) when "]" or ">" is missing
	 */
	function parse_conditional() {
		if ($this->status['closing_tag']) {
			// Skip "![endif]"
			$this->pos += 8;
		} else {
			// Skip "!--[" or "!["
			$this->pos += (($this->status['comment']) ? 5 : 3);
			if ($this->next_pos(']', false) !== self::TOK_UNKNOWN) {
				$this->addError('"]" not found in conditional tag');
				return false;
			}
			$this->status['tag_condition'] = $this->getTokenString(0, -1);
		}

		if ($this->next_no_whitespace() !== self::TOK_TAG_CLOSE) {
			$this->addError('No ">" tag found 2 for conditional tag');
			return false;
		}

		if ($this->status['comment']) {
			$this->status['last_pos'] = $this->pos;
			if ($this->next_pos('-->', false) !== self::TOK_UNKNOWN) {
				// Unterminated: take everything up to the end of the document
				$this->addError('No ending tag found for conditional tag');
				$this->pos = $this->size - 1;

				$len = $this->pos - 1 - $this->status['last_pos'];
				$this->status['text'] = (($len > 0) ? substr($this->doc, $this->status['last_pos'] + 1, $len) : '');
			} else {
				// Exclude the trailing "<![endif]" that precedes "-->"
				$len = $this->pos - 10 - $this->status['last_pos'];
				$this->status['text'] = (($len > 0) ? substr($this->doc, $this->status['last_pos'] + 1, $len) : '');
				$this->pos += 2;
			}
		}

		$this->status['last_pos'] = $this->pos;
		return true;
	}

	/**
	 * Parse attributes (names + value)
	 *
	 * Handles quoted values, unquoted values (terminated by whitespace, "<"
	 * or ">") and value-less attributes (stored with their own name as value).
	 * @internal Stores attributes in {@link $status}[attributes] (array(ATTR => VAL))
	 * @return bool False (with an error added) on an empty unquoted value
	 */
	function parse_attributes() {
		$this->status['attributes'] = array();

		while ($this->next_no_whitespace() === self::TOK_IDENTIFIER) {
			$attr = $this->getTokenString();
			if (($attr === '?') || ($attr === '%')) {
				//Probably closing tags
				break;
			}

			if ($this->next_no_whitespace() === self::TOK_EQUALS) {
				if ($this->next_no_whitespace() === self::TOK_STRING) {
					// Quoted value: strip the surrounding quotes
					$val = $this->getTokenString(1, -1);
				} else {
					// Unquoted value: scan forward to the first stop character
					$this->token_start = $this->pos;
					if (!isset($stop)) {
						$stop = $this->whitespace;
						$stop['<'] = true;
						$stop['>'] = true;
					}

					while ((++$this->pos < $this->size) && (!isset($stop[$this->doc[$this->pos]]))) {
						// Do nothing.
					}
					--$this->pos;

					$val = $this->getTokenString();

					if (trim($val) === '') {
						$this->addError('Invalid attribute value');
						return false;
					}
				}
			} else {
				// Attribute without value; rewind so the token just read is parsed again
				$val = $attr;
				$this->pos = (($this->token_start) ? $this->token_start : $this->pos) - 1;
			}

			$this->status['attributes'][$attr] = $val;
		}

		return true;
	}

	/**
	 * Default callback for tags
	 * @internal Gets called after the tagname (<html*ENTERS_HERE* attribute="value">)
	 * @internal Sets {@link $status}[self_close] for "/>", "?>" and "%>" endings
	 * @return bool False when attributes are invalid or no ">" can be found
	 */
	function parse_tag_default() {
		if ($this->status['closing_tag']) {
			$this->status['attributes'] = array();
			$this->next_no_whitespace();
		} else {
			if (!$this->parse_attributes()) {
				return false;
			}
		}

		if ($this->token !== self::TOK_TAG_CLOSE) {
			// Bounds-checked read; a truncated document must not trigger an
			// "uninitialized string offset" warning
			$char = ($this->pos < $this->size) ? $this->doc[$this->pos] : '';

			if ($this->token === self::TOK_SLASH_FORWARD) {
				$this->status['self_close'] = true;
				$this->next();
			} elseif ((($this->status['tag_name'][0] === '?') && ($char === '?')) || (($this->status['tag_name'][0] === '%') && ($char === '%'))) {
				$this->status['self_close'] = true;
				$this->pos++;

				// Tokenize the character after "?" / "%" by hand; method-name
				// entries (strings) in the map are not tokens
				$char = ($this->pos < $this->size) ? $this->doc[$this->pos] : '';
				if (isset($this->char_map[$char]) && (!is_string($this->char_map[$char]))) {
					$this->token = $this->char_map[$char];
				} else {
					$this->token = self::TOK_UNKNOWN;
				}
			}
		}

		if ($this->token !== self::TOK_TAG_CLOSE) {
			// Recover by skipping ahead to the next ">"
			$this->addError('Expected ">", but found "'.$this->getTokenString().'"');
			if ($this->next_pos('>', false) !== self::TOK_UNKNOWN) {
				$this->addError('No ">" tag found for "'.$this->status['tag_name'].'" tag');
				return false;
			}
		}

		return true;
	}

	/**
	 * Parse tag
	 *
	 * Flushes the text preceding the tag, then dispatches to the comment,
	 * conditional or cdata parser, to a callback from {@link $tag_map}, or to
	 * {@link parse_tag_default()}.
	 * @internal Gets called after opening tag (<*ENTERS_HERE*html attribute="value">)
	 * @internal Stores information about the tag in {@link $status} (comment, closing_tag, tag_name)
	 * @return bool
	 */
	function parse_tag() {
		$start = $this->pos;
		$this->status['self_close'] = false;
		$this->parse_text();

		$next = (($this->pos + 1) < $this->size) ? $this->doc[$this->pos + 1] : '';
		if ($next === '!') {
			$this->status['closing_tag'] = false;

			if (substr($this->doc, $this->pos + 2, 2) === '--') {
				$this->status['comment'] = true;

				// substr() instead of a direct offset: the document may end right after "<!--"
				if ((substr($this->doc, $this->pos + 4, 1) === '[') && (strcasecmp(substr($this->doc, $this->pos + 5, 2), 'if') === 0)) {
					return $this->parse_conditional();
				} else {
					return $this->parse_comment();
				}
			} else {
				$this->status['comment'] = false;

				// substr() instead of a direct offset: the document may end right after "<!"
				if (substr($this->doc, $this->pos + 2, 1) === '[') {
					if (strcasecmp(substr($this->doc, $this->pos + 3, 2), 'if') === 0) {
						return $this->parse_conditional();
					} elseif (strcasecmp(substr($this->doc, $this->pos + 3, 5), 'endif') === 0) {
						$this->status['closing_tag'] = true;
						return $this->parse_conditional();
					} elseif (strcasecmp(substr($this->doc, $this->pos + 3, 5), 'cdata') === 0) {
						return $this->parse_cdata();
					}
				}
				// Anything else starting with "<!" (e.g. doctype) is handled as a regular tag below
			}
		} elseif ($next === '/') {
			$this->status['closing_tag'] = true;
			++$this->pos;
		} else {
			$this->status['closing_tag'] = false;
		}

		if ($this->next() !== self::TOK_IDENTIFIER) {
			// Not a tag after all: rewind last_pos so the "<" becomes part of the text
			$this->addError('Tagname expected');
			$this->status['last_pos'] = $start - 1;
			return true;
		}

		$tag = $this->getTokenString();
		$this->status['tag_name'] = $tag;
		$tag = strtolower($tag);

		if (isset($this->tag_map[$tag])) {
			$res = $this->{$this->tag_map[$tag]}();
		} else {
			$res = $this->parse_tag_default();
		}

		$this->status['last_pos'] = $this->pos;
		return $res;
	}

	/**
	 * Parse full document
	 *
	 * Resets the error list, parses every tag and finally flushes the text
	 * after the last tag.
	 * @return bool False as soon as a tag callback fails
	 */
	function parse_all() {
		$this->errors = array();
		$this->status['last_pos'] = -1;

		if (($this->token === self::TOK_TAG_OPEN) || ($this->next_pos('<', false) === self::TOK_UNKNOWN)) {
			do {
				if (!$this->parse_tag()) {
					return false;
				}
			} while ($this->next_pos('<') !== self::TOK_NULL);
		}

		$this->pos = $this->size;
		$this->parse_text();

		return true;
	}
}

/**
 * Parses a HTML document into a HTML DOM
 *
 * Overrides the parse_* callbacks of {@link HtmlParserBase} so that every
 * parsed construct is added as a node below {@link $root}.
 */
class HtmlParser extends HtmlParserBase {

	/**
	 * Root object
	 * @internal If string, then it will create a new instance as root
	 * @var DomNode
	 */
	var $root = 'pQuery\\DomNode';

	/**
	 * Current parsing hierarchy
	 * @internal Root is always at index 0, current tag is at the end of the array
	 * @var array
	 * @access private
	 */
	var $hierarchy = array();

	/**
	 * Tags that don't need closing tags
	 * @var array
	 * @access private
	 */
	var	$tags_selfclose = array(
		'area'		=> true,
		'base'		=> true,
		'basefont'	=> true,
		'br'		=> true,
		'col'		=> true,
		'command'	=> true,
		'embed'		=> true,
		'frame'		=> true,
		'hr'		=> true,
		'img'		=> true,
		'input'		=> true,
		'ins'		=> true,
		'keygen'	=> true,
		'link'		=> true,
		'meta'		=> true,
		'param'		=> true,
		'source'	=> true,
		'track'		=> true,
		'wbr'		=> true
	);

	/**
	 * Class constructor
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 * @param DomNode $root Root node, null to auto create
	 */
	function __construct($doc = '', $pos = 0, $root = null) {
		if ($root === null) {
			// $this->root still holds the class name at this point
			$root = new $this->root('~root~', null);
		}
		$this->root =& $root;

		parent::__construct($doc, $pos);
	}

	#php4 PHP4 class constructor compatibility
	#function HtmlParser($doc = '', $pos = 0, $root = null) {return $this->__construct($doc, $pos, $root);}
	#php4e

	/**
	 * Class magic invoke method, performs {@link select()}
	 * @return array
	 * @access private
	 */
	function __invoke($query = '*') {
		return $this->select($query);
	}

	/**
	 * Class magic toString method, returns the inner text of the root node
	 * @return string
	 * @access private
	 */
	function __toString() {
		return $this->root->getInnerText();
	}

	/**
	 * Performs a css select query on the root node
	 * @see DomNode::select()
	 * @return array
	 */
	function select($query = '*', $index = false, $recursive = true, $check_self = false) {
		return $this->root->select($query, $index, $recursive, $check_self);
	}

	/**
	 * Updates the current hierarchy status and checks for
	 * correct opening/closing of tags
	 * @param bool $self_close Is current tag self closing? Null to use {@link tags_selfclose}
	 * @internal This is were most of the nodes get added
	 * @access private
	 */
	protected function parse_hierarchy($self_close = null) {
		if ($self_close === null) {
			$this->status['self_close'] = ($self_close = isset($this->tags_selfclose[strtolower($this->status['tag_name'])]));
		}

		if ($self_close) {
			if ($this->status['closing_tag']) {
				// Closing tag for a self closing element (e.g. "</br>"): find the
				// nearest preceding sibling with that tag and move every sibling
				// after it inside it.
				// NOTE(review): assumes DomNode::$children is a plain PHP array, so $c is
				// a snapshot that does not change while nodes are re-parented — confirm.
				$c = $this->hierarchy[count($this->hierarchy) - 1]->children;
				$found = false;
				for ($count = count($c), $i = $count - 1; $i >= 0; $i--) {
					if (strcasecmp($c[$i]->tag, $this->status['tag_name']) === 0) {
						for($ii = $i + 1; $ii < $count; $ii++) {
							$index = null; //Needs to be passed by ref
							// Move each following sibling in turn (was always $c[$i + 1],
							// which re-parented the same node on every iteration)
							$c[$ii]->changeParent($c[$i], $index);
						}
						$c[$i]->self_close = false;

						$found = true;
						break;
					}
				}

				if (!$found) {
					$this->addError('Closing tag "'.$this->status['tag_name'].'" which is not open');
				}

			} elseif ($this->status['tag_name'][0] === '?') {
				$index = null; //Needs to be passed by ref
				$this->hierarchy[count($this->hierarchy) - 1]->addXML($this->status['tag_name'], '', $this->status['attributes'], $index);
			} elseif ($this->status['tag_name'][0] === '%') {
				$index = null; //Needs to be passed by ref
				$this->hierarchy[count($this->hierarchy) - 1]->addASP($this->status['tag_name'], '', $this->status['attributes'], $index);
			} else {
				$index = null; //Needs to be passed by ref
				$this->hierarchy[count($this->hierarchy) - 1]->addChild($this->status, $index);
			}
		} elseif ($this->status['closing_tag']) {
			// Walk up the hierarchy to the matching open tag and pop everything
			// down to and including it; anything popped on the way was left unclosed
			$found = false;
			for ($count = count($this->hierarchy), $i = $count - 1; $i >= 0; $i--) {
				if (strcasecmp($this->hierarchy[$i]->tag, $this->status['tag_name']) === 0) {

					for($ii = ($count - $i - 1); $ii >= 0; $ii--) {
						$e = array_pop($this->hierarchy);
						if ($ii > 0) {
							$this->addError('Closing tag "'.$this->status['tag_name'].'" while "'.$e->tag.'" is not closed yet');
						}
					}

					$found = true;
					break;
				}
			}

			if (!$found) {
				$this->addError('Closing tag "'.$this->status['tag_name'].'" which is not open');
			}

		} else {
			// Regular opening tag: add it and make it the current node
			$index = null; //Needs to be passed by ref
			$this->hierarchy[] = $this->hierarchy[count($this->hierarchy) - 1]->addChild($this->status, $index);
		}
	}

	/**
	 * Parse cdata tag and add it to the current node
	 * @return bool
	 */
	function parse_cdata() {
		if (!parent::parse_cdata()) {return false;}

		$index = null; //Needs to be passed by ref
		$this->hierarchy[count($this->hierarchy) - 1]->addCDATA($this->status['cdata'], $index);
		return true;
	}

	/**
	 * Parse comment tag and add it to the current node
	 * @return bool
	 */
	function parse_comment() {
		if (!parent::parse_comment()) {return false;}

		$index = null; //Needs to be passed by ref
		$this->hierarchy[count($this->hierarchy) - 1]->addComment($this->status['comment'], $index);
		return true;
	}

	/**
	 * Parse conditional tag and add it to the current node
	 *
	 * Comment-style conditionals become a node holding their content as text;
	 * "<![if]>" style conditionals are pushed on the hierarchy until the
	 * matching "<![endif]>".
	 * @return bool
	 */
	function parse_conditional() {
		if (!parent::parse_conditional()) {return false;}

		if ($this->status['comment']) {
			$index = null; //Needs to be passed by ref
			$e = $this->hierarchy[count($this->hierarchy) - 1]->addConditional($this->status['tag_condition'], true, $index);
			if ($this->status['text'] !== '') {
				$index = null; //Needs to be passed by ref
				$e->addText($this->status['text'], $index);
			}
		} else {
			if ($this->status['closing_tag']) {
				// NOTE(review): parse_hierarchy() matches on $status['tag_name'], which
				// parse_tag() does not set for conditional tags — verify which name the
				// conditional node carries.
				$this->parse_hierarchy(false);
			} else {
				$index = null; //Needs to be passed by ref
				$this->hierarchy[] = $this->hierarchy[count($this->hierarchy) - 1]->addConditional($this->status['tag_condition'], false, $index);
			}
		}

		return true;
	}

	/**
	 * Parse doctype tag and add it to the current node
	 * @return bool
	 */
	function parse_doctype() {
		if (!parent::parse_doctype()) {return false;}

		$index = null; //Needs to be passed by ref
		$this->hierarchy[count($this->hierarchy) - 1]->addDoctype($this->status['dtd'], $index);
		return true;
	}

	/**
	 * Parse php tag and add it to the current node as XML node "php"
	 * @return bool
	 */
	function parse_php() {
		if (!parent::parse_php()) {return false;}

		$index = null; //Needs to be passed by ref
		// Same argument order as the addXML() call in parse_hierarchy():
		// (tag, text, attributes, index). $index was previously passed in the
		// attributes position.
		$this->hierarchy[count($this->hierarchy) - 1]->addXML('php', $this->status['text'], array(), $index);
		return true;
	}

	/**
	 * Parse asp tag and add it to the current node
	 * @return bool
	 */
	function parse_asp() {
		if (!parent::parse_asp()) {return false;}

		$index = null; //Needs to be passed by ref
		// Same argument order as the addASP() call in parse_hierarchy():
		// (tag, text, attributes, index). $index was previously passed in the
		// attributes position.
		$this->hierarchy[count($this->hierarchy) - 1]->addASP('', $this->status['text'], array(), $index);
		return true;
	}

	/**
	 * Parse script tag and add it (with its raw content as text) to the current node
	 * @return bool
	 */
	function parse_script() {
		if (!parent::parse_script()) {return false;}

		$index = null; //Needs to be passed by ref
		$e = $this->hierarchy[count($this->hierarchy) - 1]->addChild($this->status, $index);
		if ($this->status['text'] !== '') {
			$index = null; //Needs to be passed by ref
			$e->addText($this->status['text'], $index);
		}
		return true;
	}

	/**
	 * Parse style tag and add it (with its raw content as text) to the current node
	 * @return bool
	 */
	function parse_style() {
		if (!parent::parse_style()) {return false;}

		$index = null; //Needs to be passed by ref
		$e = $this->hierarchy[count($this->hierarchy) - 1]->addChild($this->status, $index);
		if ($this->status['text'] !== '') {
			$index = null; //Needs to be passed by ref
			$e->addText($this->status['text'], $index);
		}
		return true;
	}

	/**
	 * Default callback for tags, updates the hierarchy after parsing the tag
	 * @return bool
	 */
	function parse_tag_default() {
		if (!parent::parse_tag_default()) {return false;}

		// An explicit "/>" forces self closing; otherwise let tags_selfclose decide
		$this->parse_hierarchy(($this->status['self_close']) ? true : null);
		return true;
	}

	/**
	 * Parse text between tags and add non-empty text to the current node
	 */
	function parse_text() {
		parent::parse_text();
		if ($this->status['text'] !== '') {
			$index = null; //Needs to be passed by ref
			$this->hierarchy[count($this->hierarchy) - 1]->addText($this->status['text'], $index);
		}
	}

	/**
	 * Parse full document
	 * @return DomNode|bool Root node, or false when parsing failed
	 */
	function parse_all() {
		$this->hierarchy = array(&$this->root);
		return ((parent::parse_all()) ? $this->root : false);
	}
}

/**
 * HTML5 specific parser (adds support for omittable closing tags)
 */
class Html5Parser extends HtmlParser {

	/**
	 * Tags with omittable closing tags
	 *
	 * Looked up as $tags_optional_close[CURRENT_TAG][PREVIOUS_TAG] with
	 * isset(), so the inner arrays must be keyed by tag name.
	 * @var array array('tag2' => array('tag1' => true)) will close tag1 if following (not child) tag is tag2
	 * @access private
	 */
	var $tags_optional_close = array(
		//Current tag	=> Previous tag
		'li' 			=> array('li' => true),
		'dt' 			=> array('dt' => true, 'dd' => true),
		'dd' 			=> array('dt' => true, 'dd' => true),
		'address' 		=> array('p' => true),
		'article' 		=> array('p' => true),
		'aside' 		=> array('p' => true),
		'blockquote' 	=> array('p' => true),
		'dir' 			=> array('p' => true),
		'div' 			=> array('p' => true),
		'dl' 			=> array('p' => true),
		'fieldset' 		=> array('p' => true),
		'footer' 		=> array('p' => true),
		'form' 			=> array('p' => true),
		'h1' 			=> array('p' => true),
		'h2' 			=> array('p' => true),
		'h3' 			=> array('p' => true),
		'h4' 			=> array('p' => true),
		'h5' 			=> array('p' => true),
		'h6' 			=> array('p' => true),
		'header' 		=> array('p' => true),
		'hgroup' 		=> array('p' => true),
		'hr' 			=> array('p' => true),
		'menu' 			=> array('p' => true),
		'nav' 			=> array('p' => true),
		'ol' 			=> array('p' => true),
		'p' 			=> array('p' => true),
		'pre' 			=> array('p' => true),
		'section' 		=> array('p' => true),
		'table' 		=> array('p' => true),
		'ul' 			=> array('p' => true),
		'rt'			=> array('rt' => true, 'rp' => true),
		'rp'			=> array('rt' => true, 'rp' => true),
		'optgroup'		=> array('optgroup' => true, 'option' => true),
		// Was array('option'), i.e. array(0 => 'option'), which never matched the isset() lookup
		'option'		=> array('option' => true),
		// "thead" was misspelled as "thread" in the two entries below
		'tbody'			=> array('thead' => true, 'tbody' => true, 'tfoot' => true),
		'tfoot'			=> array('thead' => true, 'tbody' => true),
		'tr'			=> array('tr' => true),
		'td'			=> array('td' => true, 'th' => true),
		'th'			=> array('td' => true, 'th' => true),
		'body'			=> array('head' => true)
	);

	/**
	 * Updates the current hierarchy status, implicitly closing the current
	 * element first when the new opening tag allows its end tag to be omitted
	 * @param bool $self_close Is current tag self closing? Null to use {@link tags_selfclose}
	 * @access private
	 */
	protected function parse_hierarchy($self_close = null) {
		$tag_curr = strtolower($this->status['tag_name']);
		if ($self_close === null) {
			$this->status['self_close'] = ($self_close = isset($this->tags_selfclose[$tag_curr]));
		}

		// Only regular opening tags can implicitly close the previous element
		if (! ($self_close || $this->status['closing_tag'])) {
			$tag_prev = strtolower($this->hierarchy[count($this->hierarchy) - 1]->tag);
			if (isset($this->tags_optional_close[$tag_curr]) && isset($this->tags_optional_close[$tag_curr][$tag_prev])) {
				array_pop($this->hierarchy);
			}
		}

		return parent::parse_hierarchy($self_close);
	}
}

?>