Completed
Push — master ( 80f547...a9257d )
by Gilles
02:48
created
src/PHPHtmlParser/Dom.php 1 patch
Indentation   +632 added lines, -632 removed lines patch added patch discarded remove patch
@@ -16,636 +16,636 @@
 block discarded – undo
16 16
 class Dom
17 17
 {
18 18
 
19
-    /**
20
-     * The charset we would like the output to be in.
21
-     *
22
-     * @var string
23
-     */
24
-    protected $defaultCharset = 'UTF-8';
25
-
26
-    /**
27
-     * Contains the root node of this dom tree.
28
-     *
29
-     * @var HtmlNode
30
-     */
31
-    public $root;
32
-
33
-    /**
34
-     * The raw version of the document string.
35
-     *
36
-     * @var string
37
-     */
38
-    protected $raw;
39
-
40
-    /**
41
-     * The document string.
42
-     *
43
-     * @var Content
44
-     */
45
-    protected $content = null;
46
-
47
-    /**
48
-     * The original file size of the document.
49
-     *
50
-     * @var int
51
-     */
52
-    protected $rawSize;
53
-
54
-    /**
55
-     * The size of the document after it is cleaned.
56
-     *
57
-     * @var int
58
-     */
59
-    protected $size;
60
-
61
-    /**
62
-     * A global options array to be used by all load calls.
63
-     *
64
-     * @var array
65
-     */
66
-    protected $globalOptions = [];
67
-
68
-    /**
69
-     * A persistent option object to be used for all options in the
70
-     * parsing of the file.
71
-     *
72
-     * @var Options
73
-     */
74
-    protected $options;
75
-
76
-    /**
77
-     * A list of tags which will always be self closing
78
-     *
79
-     * @var array
80
-     */
81
-    protected $selfClosing = [
82
-        'img',
83
-        'br',
84
-        'input',
85
-        'meta',
86
-        'link',
87
-        'hr',
88
-        'base',
89
-        'embed',
90
-        'spacer',
91
-    ];
92
-
93
-    /**
94
-     * Returns the inner html of the root node.
95
-     *
96
-     * @return string
97
-     */
98
-    public function __toString()
99
-    {
100
-        return $this->root->innerHtml();
101
-    }
102
-
103
-    /**
104
-     * A simple wrapper around the root node.
105
-     *
106
-     * @param string $name
107
-     * @return mixed
108
-     */
109
-    public function __get($name)
110
-    {
111
-        return $this->root->$name;
112
-    }
113
-
114
-    /**
115
-     * Attempts to load the dom from any resource, string, file, or URL.
116
-     *
117
-     * @param string $str
118
-     * @param array $options
119
-     * @return $this
120
-     */
121
-    public function load($str, $options = [])
122
-    {
123
-        // check if it's a file
124
-        if (strpos($str, "\n") === false && is_file($str)) {
125
-            return $this->loadFromFile($str, $options);
126
-        }
127
-        // check if it's a url
128
-        if (preg_match("/^https?:\/\//i", $str)) {
129
-            return $this->loadFromUrl($str, $options);
130
-        }
131
-
132
-        return $this->loadStr($str, $options);
133
-    }
134
-
135
-    /**
136
-     * Loads the dom from a document file/url
137
-     *
138
-     * @param string $file
139
-     * @param array $options
140
-     * @return $this
141
-     */
142
-    public function loadFromFile($file, $options = [])
143
-    {
144
-        return $this->loadStr(file_get_contents($file), $options);
145
-    }
146
-
147
-    /**
148
-     * Use a curl interface implementation to attempt to load
149
-     * the content from a url.
150
-     *
151
-     * @param string $url
152
-     * @param array $options
153
-     * @param CurlInterface $curl
154
-     * @return $this
155
-     */
156
-    public function loadFromUrl($url, $options = [], CurlInterface $curl = null)
157
-    {
158
-        if (is_null($curl)) {
159
-            // use the default curl interface
160
-            $curl = new Curl;
161
-        }
162
-        $content = $curl->get($url);
163
-
164
-        return $this->loadStr($content, $options);
165
-    }
166
-
167
-    /**
168
-     * Parsers the html of the given string. Used for load(), loadFromFile(),
169
-     * and loadFromUrl().
170
-     *
171
-     * @param string $str
172
-     * @param array $option
173
-     * @return $this
174
-     */
175
-    public function loadStr($str, $option)
176
-    {
177
-        $this->options = new Options;
178
-        $this->options->setOptions($this->globalOptions)
179
-                      ->setOptions($option);
180
-
181
-        $this->rawSize = strlen($str);
182
-        $this->raw     = $str;
183
-
184
-        $html = $this->clean($str);
185
-
186
-        $this->size    = strlen($str);
187
-        $this->content = new Content($html);
188
-
189
-        $this->parse();
190
-        $this->detectCharset();
191
-
192
-        return $this;
193
-    }
194
-
195
-    /**
196
-     * Sets a global options array to be used by all load calls.
197
-     *
198
-     * @param array $options
199
-     * @return $this
200
-     */
201
-    public function setOptions(array $options)
202
-    {
203
-        $this->globalOptions = $options;
204
-
205
-        return $this;
206
-    }
207
-
208
-    /**
209
-     * Find elements by css selector on the root node.
210
-     *
211
-     * @param string $selector
212
-     * @param int $nth
213
-     * @return array
214
-     */
215
-    public function find($selector, $nth = null)
216
-    {
217
-        $this->isLoaded();
218
-
219
-        return $this->root->find($selector, $nth);
220
-    }
221
-
222
-    /**
223
-     * Adds the tag (or tags in an array) to the list of tags that will always
224
-     * be self closing.
225
-     *
226
-     * @param string|array $tag
227
-     * @return $this
228
-     */
229
-    public function addSelfClosingTag($tag)
230
-    {
231
-        if ( ! is_array($tag)) {
232
-            $tag = [$tag];
233
-        }
234
-        foreach ($tag as $value) {
235
-            $this->selfClosing[] = $value;
236
-        }
237
-
238
-        return $this;
239
-    }
240
-
241
-    /**
242
-     * Removes the tag (or tags in an array) from the list of tags that will
243
-     * always be self closing.
244
-     *
245
-     * @param string|array $tag
246
-     * @return $this
247
-     */
248
-    public function removeSelfClosingTag($tag)
249
-    {
250
-        if ( ! is_array($tag)) {
251
-            $tag = [$tag];
252
-        }
253
-        $this->selfClosing = array_diff($this->selfClosing, $tag);
254
-
255
-        return $this;
256
-    }
257
-
258
-    /**
259
-     * Sets the list of self closing tags to empty.
260
-     *
261
-     * @return $this
262
-     */
263
-    public function clearSelfClosingTags()
264
-    {
265
-        $this->selfClosing = [];
266
-
267
-        return $this;
268
-    }
269
-
270
-    /**
271
-     * Simple wrapper function that returns the first child.
272
-     *
273
-     * @return \PHPHtmlParser\Dom\AbstractNode
274
-     */
275
-    public function firstChild()
276
-    {
277
-        $this->isLoaded();
278
-
279
-        return $this->root->firstChild();
280
-    }
281
-
282
-    /**
283
-     * Simple wrapper function that returns the last child.
284
-     *
285
-     * @return \PHPHtmlParser\Dom\AbstractNode
286
-     */
287
-    public function lastChild()
288
-    {
289
-        $this->isLoaded();
290
-
291
-        return $this->root->lastChild();
292
-    }
293
-
294
-    /**
295
-     * Simple wrapper function that returns an element by the
296
-     * id.
297
-     *
298
-     * @param string $id
299
-     * @return \PHPHtmlParser\Dom\AbstractNode
300
-     */
301
-    public function getElementById($id)
302
-    {
303
-        $this->isLoaded();
304
-
305
-        return $this->find('#'.$id, 0);
306
-    }
307
-
308
-    /**
309
-     * Simple wrapper function that returns all elements by
310
-     * tag name.
311
-     *
312
-     * @param string $name
313
-     * @return array
314
-     */
315
-    public function getElementsByTag($name)
316
-    {
317
-        $this->isLoaded();
318
-
319
-        return $this->find($name);
320
-    }
321
-
322
-    /**
323
-     * Simple wrapper function that returns all elements by
324
-     * class name.
325
-     *
326
-     * @param string $class
327
-     * @return array
328
-     */
329
-    public function getElementsByClass($class)
330
-    {
331
-        $this->isLoaded();
332
-
333
-        return $this->find('.'.$class);
334
-    }
335
-
336
-    /**
337
-     * Checks if the load methods have been called.
338
-     *
339
-     * @throws NotLoadedException
340
-     */
341
-    protected function isLoaded()
342
-    {
343
-        if (is_null($this->content)) {
344
-            throw new NotLoadedException('Content is not loaded!');
345
-        }
346
-    }
347
-
348
-    /**
349
-     * Cleans the html of any none-html information.
350
-     *
351
-     * @param string $str
352
-     * @return string
353
-     */
354
-    protected function clean($str)
355
-    {
356
-        if ($this->options->get('cleanupInput') != true) {
357
-            // skip entire cleanup step
358
-            return $str;
359
-        }
360
-
361
-        // remove white space before closing tags
362
-        $str = mb_eregi_replace("'\s+>", "'>", $str);
363
-        $str = mb_eregi_replace('"\s+>', '">', $str);
364
-
365
-        // clean out the \n\r
366
-        $replace = ' ';
367
-        if ($this->options->get('preserveLineBreaks')) {
368
-            $replace = '&#10';
369
-        }
370
-        $str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
371
-
372
-        // strip the doctype
373
-        $str = mb_eregi_replace("<!doctype(.*?)>", '', $str);
374
-
375
-        // strip out comments
376
-        $str = mb_eregi_replace("<!--(.*?)-->", '', $str);
377
-
378
-        // strip out cdata
379
-        $str = mb_eregi_replace("<!\[CDATA\[(.*?)\]\]>", '', $str);
380
-
381
-        // strip out <script> tags
382
-        if ($this->options->get('removeScripts') == true) {
383
-            $str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
384
-            $str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
385
-        }
386
-
387
-        // strip out <style> tags
388
-        if ($this->options->get('removeStyles') == true) {
389
-            $str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
390
-            $str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
391
-        }
392
-
393
-        // strip out preformatted tags
394
-        $str = mb_eregi_replace("<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>", '', $str);
395
-
396
-        // strip out server side scripts
397
-        $str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
398
-
399
-        // strip smarty scripts
400
-        $str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);
401
-
402
-        return $str;
403
-    }
404
-
405
-    /**
406
-     * Attempts to parse the html in content.
407
-     */
408
-    protected function parse()
409
-    {
410
-        // add the root node
411
-        $this->root = new HtmlNode('root');
412
-        $activeNode = $this->root;
413
-        while ( ! is_null($activeNode)) {
414
-            $str = $this->content->copyUntil('<');
415
-            if ($str == '') {
416
-                $info = $this->parseTag();
417
-                if ( ! $info['status']) {
418
-                    // we are done here
419
-                    $activeNode = null;
420
-                    continue;
421
-                }
422
-
423
-                // check if it was a closing tag
424
-                if ($info['closing']) {
425
-                    $originalNode = $activeNode;
426
-                    while ($activeNode->getTag()->name() != $info['tag']) {
427
-                        $activeNode = $activeNode->getParent();
428
-                        if (is_null($activeNode)) {
429
-                            // we could not find opening tag
430
-                            $activeNode = $originalNode;
431
-                            break;
432
-                        }
433
-                    }
434
-                    if ( ! is_null($activeNode)) {
435
-                        $activeNode = $activeNode->getParent();
436
-                    }
437
-                    continue;
438
-                }
439
-
440
-                if ( ! isset($info['node'])) {
441
-                    continue;
442
-                }
443
-
444
-                /** @var AbstractNode $node */
445
-                $node = $info['node'];
446
-                $activeNode->addChild($node);
447
-
448
-                // check if node is self closing
449
-                if ( ! $node->getTag()->isSelfClosing()) {
450
-                    $activeNode = $node;
451
-                }
452
-            } else if ($this->options->whitespaceTextNode ||
453
-                trim($str) != ''
454
-            ) {
455
-                // we found text we care about
456
-                $textNode = new TextNode($str);
457
-                $activeNode->addChild($textNode);
458
-            }
459
-        }
460
-    }
461
-
462
-    /**
463
-     * Attempt to parse a tag out of the content.
464
-     *
465
-     * @return array
466
-     * @throws StrictException
467
-     */
468
-    protected function parseTag()
469
-    {
470
-        $return = [
471
-            'status'  => false,
472
-            'closing' => false,
473
-            'node'    => null,
474
-        ];
475
-        if ($this->content->char() != '<') {
476
-            // we are not at the beginning of a tag
477
-            return $return;
478
-        }
479
-
480
-        // check if this is a closing tag
481
-        if ($this->content->fastForward(1)->char() == '/') {
482
-            // end tag
483
-            $tag = $this->content->fastForward(1)
484
-                                 ->copyByToken('slash', true);
485
-            // move to end of tag
486
-            $this->content->copyUntil('>');
487
-            $this->content->fastForward(1);
488
-
489
-            // check if this closing tag counts
490
-            $tag = strtolower($tag);
491
-            if (in_array($tag, $this->selfClosing)) {
492
-                $return['status'] = true;
493
-
494
-                return $return;
495
-            } else {
496
-                $return['status']  = true;
497
-                $return['closing'] = true;
498
-                $return['tag']     = strtolower($tag);
499
-            }
500
-
501
-            return $return;
502
-        }
503
-
504
-        $tag  = strtolower($this->content->copyByToken('slash', true));
505
-        $node = new HtmlNode($tag);
506
-
507
-        // attributes
508
-        while ($this->content->char() != '>' &&
509
-            $this->content->char() != '/') {
510
-            $space = $this->content->skipByToken('blank', true);
511
-            if (empty($space)) {
512
-                $this->content->fastForward(1);
513
-                continue;
514
-            }
515
-
516
-            $name = $this->content->copyByToken('equal', true);
517
-            if ($name == '/') {
518
-                break;
519
-            }
520
-
521
-            if (empty($name)) {
522
-                $this->content->fastForward(1);
523
-                continue;
524
-            }
525
-
526
-            $this->content->skipByToken('blank');
527
-            if ($this->content->char() == '=') {
528
-                $attr = [];
529
-                $this->content->fastForward(1)
530
-                              ->skipByToken('blank');
531
-                switch ($this->content->char()) {
532
-                    case '"':
533
-                        $attr['doubleQuote'] = true;
534
-                        $this->content->fastForward(1);
535
-                        $string = $this->content->copyUntil('"', true, true);
536
-                        do {
537
-                            $moreString = $this->content->copyUntilUnless('"', '=>');
538
-                            $string .= $moreString;
539
-                        } while ( ! empty($moreString));
540
-                        $attr['value'] = $string;
541
-                        $this->content->fastForward(1);
542
-                        $node->getTag()->$name = $attr;
543
-                        break;
544
-                    case "'":
545
-                        $attr['doubleQuote'] = false;
546
-                        $this->content->fastForward(1);
547
-                        $string = $this->content->copyUntil("'", true, true);
548
-                        do {
549
-                            $moreString = $this->content->copyUntilUnless("'", '=>');
550
-                            $string .= $moreString;
551
-                        } while ( ! empty($moreString));
552
-                        $attr['value'] = $string;
553
-                        $this->content->fastForward(1);
554
-                        $node->getTag()->$name = $attr;
555
-                        break;
556
-                    default:
557
-                        $attr['doubleQuote']   = true;
558
-                        $attr['value']         = $this->content->copyByToken('attr', true);
559
-                        $node->getTag()->$name = $attr;
560
-                        break;
561
-                }
562
-            } else {
563
-                // no value attribute
564
-                if ($this->options->strict) {
565
-                    // can't have this in strict html
566
-                    $character = $this->content->getPosition();
567
-                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
568
-                }
569
-                $node->getTag()->$name = [
570
-                    'value'       => null,
571
-                    'doubleQuote' => true,
572
-                ];
573
-                if ($this->content->char() != '>') {
574
-                    $this->content->rewind(1);
575
-                }
576
-            }
577
-        }
578
-
579
-        $this->content->skipByToken('blank');
580
-        if ($this->content->char() == '/') {
581
-            // self closing tag
582
-            $node->getTag()->selfClosing();
583
-            $this->content->fastForward(1);
584
-        } elseif (in_array($tag, $this->selfClosing)) {
585
-
586
-            // Should be a self closing tag, check if we are strict
587
-            if ($this->options->strict) {
588
-                $character = $this->content->getPosition();
589
-                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
590
-            }
591
-
592
-            // We force self closing on this tag.
593
-            $node->getTag()->selfClosing();
594
-        }
595
-
596
-        $this->content->fastForward(1);
597
-
598
-        $return['status'] = true;
599
-        $return['node']   = $node;
600
-
601
-        return $return;
602
-    }
603
-
604
-    /**
605
-     * Attempts to detect the charset that the html was sent in.
606
-     *
607
-     * @return bool
608
-     */
609
-    protected function detectCharset()
610
-    {
611
-        // set the default
612
-        $encode = new Encode;
613
-        $encode->from($this->defaultCharset);
614
-        $encode->to($this->defaultCharset);
615
-
616
-        if ( ! is_null($this->options->enforceEncoding)) {
617
-            //  they want to enforce the given encoding
618
-            $encode->from($this->options->enforceEncoding);
619
-            $encode->to($this->options->enforceEncoding);
620
-
621
-            return false;
622
-        }
623
-
624
-        $meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
625
-        if (is_null($meta)) {
626
-            // could not find meta tag
627
-            $this->root->propagateEncoding($encode);
628
-
629
-            return false;
630
-        }
631
-        $content = $meta->content;
632
-        if (empty($content)) {
633
-            // could not find content
634
-            $this->root->propagateEncoding($encode);
635
-
636
-            return false;
637
-        }
638
-        $matches = [];
639
-        if (preg_match('/charset=(.+)/', $content, $matches)) {
640
-            $encode->from(trim($matches[1]));
641
-            $this->root->propagateEncoding($encode);
642
-
643
-            return true;
644
-        }
645
-
646
-        // no charset found
647
-        $this->root->propagateEncoding($encode);
648
-
649
-        return false;
650
-    }
19
+	/**
20
+	 * The charset we would like the output to be in.
21
+	 *
22
+	 * @var string
23
+	 */
24
+	protected $defaultCharset = 'UTF-8';
25
+
26
+	/**
27
+	 * Contains the root node of this dom tree.
28
+	 *
29
+	 * @var HtmlNode
30
+	 */
31
+	public $root;
32
+
33
+	/**
34
+	 * The raw version of the document string.
35
+	 *
36
+	 * @var string
37
+	 */
38
+	protected $raw;
39
+
40
+	/**
41
+	 * The document string.
42
+	 *
43
+	 * @var Content
44
+	 */
45
+	protected $content = null;
46
+
47
+	/**
48
+	 * The original file size of the document.
49
+	 *
50
+	 * @var int
51
+	 */
52
+	protected $rawSize;
53
+
54
+	/**
55
+	 * The size of the document after it is cleaned.
56
+	 *
57
+	 * @var int
58
+	 */
59
+	protected $size;
60
+
61
+	/**
62
+	 * A global options array to be used by all load calls.
63
+	 *
64
+	 * @var array
65
+	 */
66
+	protected $globalOptions = [];
67
+
68
+	/**
69
+	 * A persistent option object to be used for all options in the
70
+	 * parsing of the file.
71
+	 *
72
+	 * @var Options
73
+	 */
74
+	protected $options;
75
+
76
+	/**
77
+	 * A list of tags which will always be self closing
78
+	 *
79
+	 * @var array
80
+	 */
81
+	protected $selfClosing = [
82
+		'img',
83
+		'br',
84
+		'input',
85
+		'meta',
86
+		'link',
87
+		'hr',
88
+		'base',
89
+		'embed',
90
+		'spacer',
91
+	];
92
+
93
+	/**
94
+	 * Returns the inner html of the root node.
95
+	 *
96
+	 * @return string
97
+	 */
98
+	public function __toString()
99
+	{
100
+		return $this->root->innerHtml();
101
+	}
102
+
103
+	/**
104
+	 * A simple wrapper around the root node.
105
+	 *
106
+	 * @param string $name
107
+	 * @return mixed
108
+	 */
109
+	public function __get($name)
110
+	{
111
+		return $this->root->$name;
112
+	}
113
+
114
+	/**
115
+	 * Attempts to load the dom from any resource, string, file, or URL.
116
+	 *
117
+	 * @param string $str
118
+	 * @param array $options
119
+	 * @return $this
120
+	 */
121
+	public function load($str, $options = [])
122
+	{
123
+		// check if it's a file
124
+		if (strpos($str, "\n") === false && is_file($str)) {
125
+			return $this->loadFromFile($str, $options);
126
+		}
127
+		// check if it's a url
128
+		if (preg_match("/^https?:\/\//i", $str)) {
129
+			return $this->loadFromUrl($str, $options);
130
+		}
131
+
132
+		return $this->loadStr($str, $options);
133
+	}
134
+
135
+	/**
136
+	 * Loads the dom from a document file/url
137
+	 *
138
+	 * @param string $file
139
+	 * @param array $options
140
+	 * @return $this
141
+	 */
142
+	public function loadFromFile($file, $options = [])
143
+	{
144
+		return $this->loadStr(file_get_contents($file), $options);
145
+	}
146
+
147
+	/**
148
+	 * Use a curl interface implementation to attempt to load
149
+	 * the content from a url.
150
+	 *
151
+	 * @param string $url
152
+	 * @param array $options
153
+	 * @param CurlInterface $curl
154
+	 * @return $this
155
+	 */
156
+	public function loadFromUrl($url, $options = [], CurlInterface $curl = null)
157
+	{
158
+		if (is_null($curl)) {
159
+			// use the default curl interface
160
+			$curl = new Curl;
161
+		}
162
+		$content = $curl->get($url);
163
+
164
+		return $this->loadStr($content, $options);
165
+	}
166
+
167
+	/**
168
+	 * Parsers the html of the given string. Used for load(), loadFromFile(),
169
+	 * and loadFromUrl().
170
+	 *
171
+	 * @param string $str
172
+	 * @param array $option
173
+	 * @return $this
174
+	 */
175
+	public function loadStr($str, $option)
176
+	{
177
+		$this->options = new Options;
178
+		$this->options->setOptions($this->globalOptions)
179
+					  ->setOptions($option);
180
+
181
+		$this->rawSize = strlen($str);
182
+		$this->raw     = $str;
183
+
184
+		$html = $this->clean($str);
185
+
186
+		$this->size    = strlen($str);
187
+		$this->content = new Content($html);
188
+
189
+		$this->parse();
190
+		$this->detectCharset();
191
+
192
+		return $this;
193
+	}
194
+
195
+	/**
196
+	 * Sets a global options array to be used by all load calls.
197
+	 *
198
+	 * @param array $options
199
+	 * @return $this
200
+	 */
201
+	public function setOptions(array $options)
202
+	{
203
+		$this->globalOptions = $options;
204
+
205
+		return $this;
206
+	}
207
+
208
+	/**
209
+	 * Find elements by css selector on the root node.
210
+	 *
211
+	 * @param string $selector
212
+	 * @param int $nth
213
+	 * @return array
214
+	 */
215
+	public function find($selector, $nth = null)
216
+	{
217
+		$this->isLoaded();
218
+
219
+		return $this->root->find($selector, $nth);
220
+	}
221
+
222
+	/**
223
+	 * Adds the tag (or tags in an array) to the list of tags that will always
224
+	 * be self closing.
225
+	 *
226
+	 * @param string|array $tag
227
+	 * @return $this
228
+	 */
229
+	public function addSelfClosingTag($tag)
230
+	{
231
+		if ( ! is_array($tag)) {
232
+			$tag = [$tag];
233
+		}
234
+		foreach ($tag as $value) {
235
+			$this->selfClosing[] = $value;
236
+		}
237
+
238
+		return $this;
239
+	}
240
+
241
+	/**
242
+	 * Removes the tag (or tags in an array) from the list of tags that will
243
+	 * always be self closing.
244
+	 *
245
+	 * @param string|array $tag
246
+	 * @return $this
247
+	 */
248
+	public function removeSelfClosingTag($tag)
249
+	{
250
+		if ( ! is_array($tag)) {
251
+			$tag = [$tag];
252
+		}
253
+		$this->selfClosing = array_diff($this->selfClosing, $tag);
254
+
255
+		return $this;
256
+	}
257
+
258
+	/**
259
+	 * Sets the list of self closing tags to empty.
260
+	 *
261
+	 * @return $this
262
+	 */
263
+	public function clearSelfClosingTags()
264
+	{
265
+		$this->selfClosing = [];
266
+
267
+		return $this;
268
+	}
269
+
270
+	/**
271
+	 * Simple wrapper function that returns the first child.
272
+	 *
273
+	 * @return \PHPHtmlParser\Dom\AbstractNode
274
+	 */
275
+	public function firstChild()
276
+	{
277
+		$this->isLoaded();
278
+
279
+		return $this->root->firstChild();
280
+	}
281
+
282
+	/**
283
+	 * Simple wrapper function that returns the last child.
284
+	 *
285
+	 * @return \PHPHtmlParser\Dom\AbstractNode
286
+	 */
287
+	public function lastChild()
288
+	{
289
+		$this->isLoaded();
290
+
291
+		return $this->root->lastChild();
292
+	}
293
+
294
+	/**
295
+	 * Simple wrapper function that returns an element by the
296
+	 * id.
297
+	 *
298
+	 * @param string $id
299
+	 * @return \PHPHtmlParser\Dom\AbstractNode
300
+	 */
301
+	public function getElementById($id)
302
+	{
303
+		$this->isLoaded();
304
+
305
+		return $this->find('#'.$id, 0);
306
+	}
307
+
308
+	/**
309
+	 * Simple wrapper function that returns all elements by
310
+	 * tag name.
311
+	 *
312
+	 * @param string $name
313
+	 * @return array
314
+	 */
315
+	public function getElementsByTag($name)
316
+	{
317
+		$this->isLoaded();
318
+
319
+		return $this->find($name);
320
+	}
321
+
322
+	/**
323
+	 * Simple wrapper function that returns all elements by
324
+	 * class name.
325
+	 *
326
+	 * @param string $class
327
+	 * @return array
328
+	 */
329
+	public function getElementsByClass($class)
330
+	{
331
+		$this->isLoaded();
332
+
333
+		return $this->find('.'.$class);
334
+	}
335
+
336
+	/**
337
+	 * Checks if the load methods have been called.
338
+	 *
339
+	 * @throws NotLoadedException
340
+	 */
341
+	protected function isLoaded()
342
+	{
343
+		if (is_null($this->content)) {
344
+			throw new NotLoadedException('Content is not loaded!');
345
+		}
346
+	}
347
+
348
+	/**
349
+	 * Cleans the html of any none-html information.
350
+	 *
351
+	 * @param string $str
352
+	 * @return string
353
+	 */
354
+	protected function clean($str)
355
+	{
356
+		if ($this->options->get('cleanupInput') != true) {
357
+			// skip entire cleanup step
358
+			return $str;
359
+		}
360
+
361
+		// remove white space before closing tags
362
+		$str = mb_eregi_replace("'\s+>", "'>", $str);
363
+		$str = mb_eregi_replace('"\s+>', '">', $str);
364
+
365
+		// clean out the \n\r
366
+		$replace = ' ';
367
+		if ($this->options->get('preserveLineBreaks')) {
368
+			$replace = '&#10';
369
+		}
370
+		$str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
371
+
372
+		// strip the doctype
373
+		$str = mb_eregi_replace("<!doctype(.*?)>", '', $str);
374
+
375
+		// strip out comments
376
+		$str = mb_eregi_replace("<!--(.*?)-->", '', $str);
377
+
378
+		// strip out cdata
379
+		$str = mb_eregi_replace("<!\[CDATA\[(.*?)\]\]>", '', $str);
380
+
381
+		// strip out <script> tags
382
+		if ($this->options->get('removeScripts') == true) {
383
+			$str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
384
+			$str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
385
+		}
386
+
387
+		// strip out <style> tags
388
+		if ($this->options->get('removeStyles') == true) {
389
+			$str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
390
+			$str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
391
+		}
392
+
393
+		// strip out preformatted tags
394
+		$str = mb_eregi_replace("<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>", '', $str);
395
+
396
+		// strip out server side scripts
397
+		$str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
398
+
399
+		// strip smarty scripts
400
+		$str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);
401
+
402
+		return $str;
403
+	}
404
+
405
+	/**
406
+	 * Attempts to parse the html in content.
407
+	 */
408
+	protected function parse()
409
+	{
410
+		// add the root node
411
+		$this->root = new HtmlNode('root');
412
+		$activeNode = $this->root;
413
+		while ( ! is_null($activeNode)) {
414
+			$str = $this->content->copyUntil('<');
415
+			if ($str == '') {
416
+				$info = $this->parseTag();
417
+				if ( ! $info['status']) {
418
+					// we are done here
419
+					$activeNode = null;
420
+					continue;
421
+				}
422
+
423
+				// check if it was a closing tag
424
+				if ($info['closing']) {
425
+					$originalNode = $activeNode;
426
+					while ($activeNode->getTag()->name() != $info['tag']) {
427
+						$activeNode = $activeNode->getParent();
428
+						if (is_null($activeNode)) {
429
+							// we could not find opening tag
430
+							$activeNode = $originalNode;
431
+							break;
432
+						}
433
+					}
434
+					if ( ! is_null($activeNode)) {
435
+						$activeNode = $activeNode->getParent();
436
+					}
437
+					continue;
438
+				}
439
+
440
+				if ( ! isset($info['node'])) {
441
+					continue;
442
+				}
443
+
444
+				/** @var AbstractNode $node */
445
+				$node = $info['node'];
446
+				$activeNode->addChild($node);
447
+
448
+				// check if node is self closing
449
+				if ( ! $node->getTag()->isSelfClosing()) {
450
+					$activeNode = $node;
451
+				}
452
+			} else if ($this->options->whitespaceTextNode ||
453
+				trim($str) != ''
454
+			) {
455
+				// we found text we care about
456
+				$textNode = new TextNode($str);
457
+				$activeNode->addChild($textNode);
458
+			}
459
+		}
460
+	}
461
+
462
/**
 * Attempt to parse a single tag out of the content, starting at the
 * current cursor position.
 *
 * The returned array always carries the keys 'status', 'closing' and
 * 'node':
 *  - 'status' is false when the cursor is not sitting on a '<',
 *    and true once a tag has been consumed.
 *  - 'closing' is true for an end tag whose name is not in the
 *    self closing list; in that case the lower-cased tag name is
 *    also returned under the 'tag' key.
 *  - 'node' holds the newly created HtmlNode for an opening tag and
 *    stays null otherwise.
 *
 * @return array
 * @throws StrictException When strict mode is enabled and an attribute
 *                         has no value, or a tag from the self closing
 *                         list is not written as self closing.
 */
protected function parseTag()
{
	$return = [
		'status'  => false,
		'closing' => false,
		'node'    => null,
	];
	if ($this->content->char() != '<') {
		// we are not at the beginning of a tag
		return $return;
	}

	// check if this is a closing tag
	if ($this->content->fastForward(1)->char() == '/') {
		// end tag
		$tag = $this->content->fastForward(1)
							 ->copyByToken('slash', true);
		// move to end of tag
		$this->content->copyUntil('>');
		$this->content->fastForward(1);

		$tag              = strtolower($tag);
		$return['status'] = true;

		// An end tag for an element in the self closing list (e.g. </br>)
		// is consumed but must not close anything, so it is only
		// reported as a closing tag when it is not in that list.
		if ( ! in_array($tag, $this->selfClosing, true)) {
			$return['closing'] = true;
			$return['tag']     = $tag;
		}

		return $return;
	}

	$tag  = strtolower($this->content->copyByToken('slash', true));
	$node = new HtmlNode($tag);

	// attributes
	while ($this->content->char() != '>' &&
		$this->content->char() != '/') {
		$space = $this->content->skipByToken('blank', true);
		if (empty($space)) {
			// nothing was skipped, step forward so the loop makes progress
			$this->content->fastForward(1);
			continue;
		}

		$name = $this->content->copyByToken('equal', true);
		if ($name == '/') {
			// reached the slash of a self closing tag
			break;
		}

		if (empty($name)) {
			$this->content->fastForward(1);
			continue;
		}

		$this->content->skipByToken('blank');
		if ($this->content->char() == '=') {
			$attr = [];
			$this->content->fastForward(1)
						  ->skipByToken('blank');
			switch ($this->content->char()) {
				case '"':
				case "'":
					// Quoted value: both quote styles share one code
					// path, only the delimiter differs.
					$quote               = $this->content->char();
					$attr['doubleQuote'] = ($quote === '"');
					$this->content->fastForward(1);
					$string = $this->content->copyUntil($quote, true, true);
					// NOTE(review): presumably this pulls in further chunks
					// when the quote that was found is not the real end of
					// the value - confirm against Content::copyUntilUnless.
					do {
						$moreString = $this->content->copyUntilUnless($quote, '=>');
						$string .= $moreString;
					} while ( ! empty($moreString));
					$attr['value'] = $string;
					// step over the closing quote
					$this->content->fastForward(1);
					$node->getTag()->$name = $attr;
					break;
				default:
					// unquoted value
					$attr['doubleQuote']   = true;
					$attr['value']         = $this->content->copyByToken('attr', true);
					$node->getTag()->$name = $attr;
					break;
			}
		} else {
			// no value attribute
			if ($this->options->strict) {
				// can't have this in strict html
				$character = $this->content->getPosition();
				throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
			}
			$node->getTag()->$name = [
				'value'       => null,
				'doubleQuote' => true,
			];
			if ($this->content->char() != '>') {
				$this->content->rewind(1);
			}
		}
	}

	$this->content->skipByToken('blank');
	if ($this->content->char() == '/') {
		// self closing tag
		$node->getTag()->selfClosing();
		$this->content->fastForward(1);
	} elseif (in_array($tag, $this->selfClosing, true)) {

		// Should be a self closing tag, check if we are strict
		if ($this->options->strict) {
			$character = $this->content->getPosition();
			throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
		}

		// We force self closing on this tag.
		$node->getTag()->selfClosing();
	}

	// step over the closing '>'
	$this->content->fastForward(1);

	$return['status'] = true;
	$return['node']   = $node;

	return $return;
}
603
+
604
/**
 * Attempts to detect the charset that the html was sent in.
 *
 * The charset is looked up in the content attribute of the first
 * meta[http-equiv=Content-Type] element found under the root node.
 * Unless an encoding is enforced through the options, the resulting
 * encoder (defaulting to the default charset in both directions) is
 * propagated to the root node.
 *
 * @return bool True when a charset was found in the meta tag, false
 *              otherwise (including when an encoding is enforced).
 */
protected function detectCharset()
{
	// set the default
	$encode = new Encode;
	$encode->from($this->defaultCharset);
	$encode->to($this->defaultCharset);

	if ( ! is_null($this->options->enforceEncoding)) {
		//  they want to enforce the given encoding
		// NOTE(review): the encoder configured here is never propagated
		// to the root node before returning - confirm this is intended.
		$encode->from($this->options->enforceEncoding);
		$encode->to($this->options->enforceEncoding);

		return false;
	}

	$detected = false;
	$meta     = $this->root->find('meta[http-equiv=Content-Type]', 0);
	if ( ! is_null($meta)) {
		$content = $meta->content;
		$matches = [];
		// The charset parameter name is matched case-insensitively and
		// only the charset token itself is captured, so optional quotes,
		// surrounding whitespace and any trailing "; param=value" parts
		// do not end up in the charset name.
		if ( ! empty($content) &&
			preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $content, $matches)
		) {
			$encode->from($matches[1]);
			$detected = true;
		}
	}

	// propagate either the detected charset or the default one
	$this->root->propagateEncoding($encode);

	return $detected;
}
651 651
 }
Please login to merge, or discard this patch.