Completed
Push — master ( 8b759a...268bdc )
by Gilles
02:53
created

Dom::loadStr()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 18
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 11
nc 1
nop 2
dl 0
loc 18
ccs 12
cts 12
cp 1
crap 1
rs 9.9
c 0
b 0
f 0
1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\HtmlNode;
6
use PHPHtmlParser\Dom\TextNode;
7
use PHPHtmlParser\Exceptions\NotLoadedException;
8
use PHPHtmlParser\Exceptions\StrictException;
9
use stringEncode\Encode;
10
11
/**
12
 * Class Dom
13
 *
14
 * @package PHPHtmlParser
15
 */
16
class Dom
17
{
18
19
    /**
20
     * The charset we would like the output to be in.
21
     *
22
     * @var string
23
     */
24
    protected $defaultCharset = 'UTF-8';
25
26
    /**
27
     * Contains the root node of this dom tree.
28
     *
29
     * @var HtmlNode
30
     */
31
    public $root;
32
33
    /**
34
     * The raw version of the document string.
35
     *
36
     * @var string
37
     */
38
    protected $raw;
39
40
    /**
41
     * The document string.
42
     *
43
     * @var Content
44
     */
45
    protected $content = null;
46
47
    /**
48
     * The original file size of the document.
49
     *
50
     * @var int
51
     */
52
    protected $rawSize;
53
54
    /**
55
     * The size of the document after it is cleaned.
56
     *
57
     * @var int
58
     */
59
    protected $size;
60
61
    /**
62
     * A global options array to be used by all load calls.
63
     *
64
     * @var array
65
     */
66
    protected $globalOptions = [];
67
68
    /**
69
     * A persistent option object to be used for all options in the
70
     * parsing of the file.
71
     *
72
     * @var Options
73
     */
74
    protected $options;
75
76
    /**
77
     * A list of tags which will always be self closing
78
     *
79
     * @var array
80
     */
81
    protected $selfClosing = [
82
        'area',
83
        'base',
84
        'basefont',
85
        'br',
86
        'col',
87
        'embed',
88
        'hr',
89
        'img',
90
        'input',
91
        'keygen',
92
        'link',
93
        'meta',
94
        'param',
95
        'source',
96
        'spacer',
97
        'track',
98
        'wbr'
99
    ];
100
101
    /**
102
     * A list of tags where there should be no /> at the end (html5 style)
103
     *
104
     * @var array
105
     */
106
    protected $noSlash = [];
107
108
    /**
     * String representation of the dom: the inner html of the root node.
     *
     * @return string
     */
    public function __toString(): string
    {
        $html = $this->root->innerHtml();

        return $html;
    }
117
118
    /**
     * Forwards reads of undefined properties to the root node.
     *
     * @param string $name Name of the property to read from the root node.
     * @return mixed
     */
    public function __get($name)
    {
        $root = $this->root;

        return $root->$name;
    }
128
129
    /**
     * Attempts to load the dom from any resource, string, file, or URL.
     *
     * The node counter is reset first. The input is then dispatched to
     * loadFromFile(), loadFromUrl() or loadStr(), in that order of
     * precedence.
     *
     * @param string $str
     * @param array $options
     * @return Dom
     * @chainable
     */
    public function load(string $str, array $options = []): Dom
    {
        AbstractNode::resetCount();

        // a string without line breaks that names an existing file
        $hasNoLineBreak = strpos($str, "\n") === false;
        if ($hasNoLineBreak && is_file($str)) {
            return $this->loadFromFile($str, $options);
        }

        // a string that starts with http:// or https://
        $looksLikeUrl = preg_match("/^https?:\/\//i", $str);
        if ($looksLikeUrl) {
            return $this->loadFromUrl($str, $options);
        }

        // everything else is treated as raw html
        return $this->loadStr($str, $options);
    }
151
152
    /**
     * Loads the dom from a document file/url.
     *
     * The contents are read with file_get_contents() and handed to
     * loadStr().
     *
     * @param string $file
     * @param array $options
     * @return Dom
     * @throws \RuntimeException If the contents could not be read.
     * @chainable
     */
    public function loadFromFile(string $file, array $options = []): Dom
    {
        $content = file_get_contents($file);
        if ($content === false) {
            // file_get_contents() signals failure with false; passing that
            // on to loadStr() would hide the read error from the caller.
            throw new \RuntimeException("Unable to read the contents of '$file'.");
        }

        return $this->loadStr($content, $options);
    }
164
165
    /**
     * Use a curl interface implementation to attempt to load
     * the content from a url.
     *
     * @param string $url
     * @param array $options
     * @param CurlInterface|null $curl Implementation used to fetch the url;
     *                                 a new Curl instance is used when null.
     * @return Dom
     * @chainable
     */
    public function loadFromUrl(string $url, array $options = [], ?CurlInterface $curl = null): Dom
    {
        if (is_null($curl)) {
            // use the default curl interface
            $curl = new Curl;
        }
        $content = $curl->get($url);

        return $this->loadStr($content, $options);
    }
185
186
    /**
     * Parses the html of the given string. Used for load(), loadFromFile(),
     * and loadFromUrl().
     *
     * A fresh Options object is built from the global options overlaid with
     * the options given here. The string is cleaned, parsed into the node
     * tree under $this->root, and the charset detection is run.
     *
     * @param string $str
     * @param array $option Options for this call; applied on top of the
     *                      global options.
     * @return Dom
     * @chainable
     */
    public function loadStr(string $str, array $option = []): Dom
    {
        $this->options = new Options;
        $this->options->setOptions($this->globalOptions)
                      ->setOptions($option);

        $this->rawSize = strlen($str);
        $this->raw     = $str;

        $html = $this->clean($str);

        // size describes the document after cleaning, so it has to be
        // measured on the cleaned string and not on the raw input
        $this->size    = strlen($html);
        $this->content = new Content($html);

        $this->parse();
        $this->detectCharset();

        return $this;
    }
214
215
    /**
     * Replaces the global options array that loadStr() applies before the
     * per-call options.
     *
     * @param array $options
     * @return Dom
     * @chainable
     */
    public function setOptions(array $options): Dom
    {
        $dom = $this;
        $dom->globalOptions = $options;

        return $dom;
    }
228
229
    /**
     * Find elements by css selector on the root node.
     *
     * @param string $selector
     * @param int|null $nth Passed through to the root node's find().
     * @return mixed
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function find(string $selector, ?int $nth = null)
    {
        $this->isLoaded();

        return $this->root->find($selector, $nth);
    }
242
243
    /**
     * Looks up an element by id, delegating to the root node's findById().
     *
     * @param int $id
     * @return mixed
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function findById(int $id)
    {
        $this->isLoaded();
        $root = $this->root;

        return $root->findById($id);
    }
255
256
    /**
     * Appends one tag, or every tag of an array, to the list of tags that
     * are always treated as self closing.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function addSelfClosingTag($tag): Dom
    {
        // normalise a single tag to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $name) {
            $this->selfClosing[] = $name;
        }

        return $this;
    }
275
276
    /**
     * Drops one tag, or every tag of an array, from the list of tags that
     * are always treated as self closing.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function removeSelfClosingTag($tag): Dom
    {
        // normalise a single tag to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        $this->selfClosing = array_diff($this->selfClosing, $tags);

        return $this;
    }
293
294
    /**
     * Empties the list of tags that are always treated as self closing.
     *
     * @return Dom
     * @chainable
     */
    public function clearSelfClosingTags(): Dom
    {
        $dom = $this;
        $dom->selfClosing = [];

        return $dom;
    }
306
307
308
    /**
     * Appends one tag, or every tag of an array, to the list of self
     * closing tags that should not have a trailing slash.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function addNoSlashTag($tag): Dom
    {
        // normalise a single tag to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $name) {
            $this->noSlash[] = $name;
        }

        return $this;
    }
326
327
    /**
     * Drops one tag, or every tag of an array, from the list of no-slash
     * tags.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function removeNoSlashTag($tag): Dom
    {
        // normalise a single tag to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        $this->noSlash = array_diff($this->noSlash, $tags);

        return $this;
    }
343
344
    /**
     * Resets the list of no-slash tags to an empty list.
     *
     * @return Dom
     * @chainable
     */
    public function clearNoSlashTags(): Dom
    {
        $dom = $this;
        $dom->noSlash = [];

        return $dom;
    }
356
357
    /**
     * Returns the first child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function firstChild(): \PHPHtmlParser\Dom\AbstractNode
    {
        $this->isLoaded();
        $child = $this->root->firstChild();

        return $child;
    }
368
369
    /**
     * Returns the last child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function lastChild(): \PHPHtmlParser\Dom\AbstractNode
    {
        $this->isLoaded();
        $child = $this->root->lastChild();

        return $child;
    }
380
381
    /**
     * Returns the child count reported by the root node.
     *
     * @return int
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function countChildren(): int
    {
        $this->isLoaded();
        $count = $this->root->countChildren();

        return $count;
    }
392
393
    /**
     * Returns the children of the root node as an array.
     *
     * @return array
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function getChildren(): array
    {
        $this->isLoaded();
        $children = $this->root->getChildren();

        return $children;
    }
404
405
    /**
     * Tells whether the root node has any child nodes.
     *
     * @return bool
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function hasChildren(): bool
    {
        $this->isLoaded();
        $hasChildren = $this->root->hasChildren();

        return $hasChildren;
    }
416
417
    /**
     * Simple wrapper function that returns an element by the
     * id.
     *
     * The lookup is done with find() using the selector '#' . $id and
     * index 0. find() can yield null, so the return type is nullable
     * rather than failing the return type check.
     *
     * @param string $id
     * @return \PHPHtmlParser\Dom\AbstractNode|null
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function getElementById($id): ?\PHPHtmlParser\Dom\AbstractNode
    {
        $this->isLoaded();

        return $this->find('#'.$id, 0);
    }
430
431
    /**
     * Returns all elements matched by using the tag name as the selector
     * for find().
     *
     * @param string $name
     * @return mixed
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function getElementsByTag(string $name)
    {
        $this->isLoaded();
        $selector = $name;

        return $this->find($selector);
    }
444
445
    /**
     * Returns all elements matched by using '.' followed by the class name
     * as the selector for find().
     *
     * @param string $class
     * @return mixed
     * @throws NotLoadedException If no content has been loaded yet.
     */
    public function getElementsByClass(string $class)
    {
        $this->isLoaded();
        $selector = '.'.$class;

        return $this->find($selector);
    }
458
459
    /**
     * Guard used by the public accessors: passes silently when content has
     * been set and throws otherwise.
     *
     * @throws NotLoadedException
     */
    protected function isLoaded(): void
    {
        if ($this->content !== null) {
            return;
        }

        throw new NotLoadedException('Content is not loaded!');
    }
470
471
    /**
     * Cleans the html of any none-html information.
     *
     * When the 'cleanupInput' option is not truthy the string is returned
     * untouched. Otherwise white space before closing brackets, line
     * breaks, the doctype, comments, cdata sections, smarty style blocks
     * and (depending on options) script, style and server side script
     * blocks are removed or replaced.
     *
     * @param string $str
     * @return string
     */
    protected function clean(string $str): string
    {
        if ( ! $this->options->get('cleanupInput')) {
            // skip entire cleanup step
            return $str;
        }

        // mb_eregi_replace() reports failure with false/null instead of a
        // string. Keep the text as it was before the failed step rather
        // than letting the failure value wipe out the whole document.
        $replace = static function (string $pattern, string $replacement, string $subject): string {
            $result = mb_eregi_replace($pattern, $replacement, $subject);

            return is_string($result) ? $result : $subject;
        };

        // remove white space before closing tags
        $str = $replace("'\s+>", "'>", $str);
        $str = $replace('"\s+>', '">', $str);

        // clean out the \n\r
        $lineBreak = ' ';
        if ($this->options->get('preserveLineBreaks')) {
            $lineBreak = '&#10;';
        }
        $str = str_replace(["\r\n", "\r", "\n"], $lineBreak, $str);

        // strip the doctype
        $str = $replace("<!doctype(.*?)>", '', $str);

        // strip out comments
        $str = $replace("<!--(.*?)-->", '', $str);

        // strip out cdata
        $str = $replace("<!\[CDATA\[(.*?)\]\]>", '', $str);

        // strip out <script> tags
        if ($this->options->get('removeScripts') == true) {
            $str = $replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
            $str = $replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
        }

        // strip out <style> tags
        if ($this->options->get('removeStyles') == true) {
            $str = $replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
            $str = $replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
        }

        // strip out server side scripts
        // NOTE(review): 'serverSideScriptis' looks like a misspelling of
        // 'serverSideScripts'. Both keys are honoured so existing callers
        // keep working - confirm the intended key against Options.
        if ($this->options->get('serverSideScriptis') == true ||
            $this->options->get('serverSideScripts') == true
        ) {
            $str = $replace("(<\?)(.*?)(\?>)", '', $str);
        }

        // strip smarty scripts
        $str = $replace("(\{\w)(.*?)(\})", '', $str);

        return $str;
    }
526
527
    /**
     * Attempts to parse the html in content.
     *
     * Builds the node tree under a fresh 'root' HtmlNode. The content is
     * consumed alternately as text (everything up to the next '<') and as
     * tags (via parseTag()). $activeNode is the node that currently
     * receives children; parsing stops once it becomes null.
     */
    protected function parse(): void
    {
        // add the root node
        $this->root = new HtmlNode('root');
        $activeNode = $this->root;
        while ( ! is_null($activeNode)) {
            $str = $this->content->copyUntil('<');
            if ($str == '') {
                $info = $this->parseTag();
                if ( ! $info['status']) {
                    // we are done here
                    $activeNode = null;
                    continue;
                }

                // check if it was a closing tag
                if ($info['closing']) {
                    $foundOpeningTag = true;
                    $originalNode    = $activeNode;
                    while ($activeNode->getTag()->name() != $info['tag']) {
                        $activeNode = $activeNode->getParent();
                        if (is_null($activeNode)) {
                            // we could not find opening tag
                            $activeNode      = $originalNode;
                            $foundOpeningTag = false;
                            break;
                        }
                    }
                    // Only step out of a node that this tag actually
                    // closes. A closing tag without a matching opening tag
                    // is ignored, so it neither closes the current node nor
                    // ends parsing when the current node is the root.
                    if ($foundOpeningTag) {
                        $activeNode = $activeNode->getParent();
                    }
                    continue;
                }

                if ( ! isset($info['node'])) {
                    continue;
                }

                /** @var AbstractNode $node */
                $node = $info['node'];
                $activeNode->addChild($node);

                // check if node is self closing
                if ( ! $node->getTag()->isSelfClosing()) {
                    // following content becomes a child of this node
                    $activeNode = $node;
                }
            } else if ($this->options->whitespaceTextNode ||
                trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str, $this->options->removeDoubleSpace);
                $activeNode->addChild($textNode);
            }
        }
    }
583
584
    /**
     * Attempt to parse a tag out of the content.
     *
     * The returned array always carries the keys 'status', 'closing' and
     * 'node'. 'status' is false when the content is not positioned on a
     * '<'. For a closing tag that is not in the self closing list,
     * 'closing' is true and the lower cased name is stored under 'tag'.
     * For an opening tag, 'node' holds the new HtmlNode with its
     * attributes set.
     *
     * @return array
     * @throws StrictException
     */
    protected function parseTag(): array
    {
        $result = [
            'status'  => false,
            'closing' => false,
            'node'    => null,
        ];

        // we are not at the beginning of a tag
        if ($this->content->char() != '<') {
            return $result;
        }

        // check if this is a closing tag
        if ($this->content->fastForward(1)->char() == '/') {
            // end tag
            $tag = $this->content->fastForward(1)
                                 ->copyByToken('slash', true);
            // move to end of tag
            $this->content->copyUntil('>');
            $this->content->fastForward(1);

            $tag = strtolower($tag);
            $result['status'] = true;
            // a closing tag for a self closing element does not count
            if ( ! in_array($tag, $this->selfClosing)) {
                $result['closing'] = true;
                $result['tag']     = $tag;
            }

            return $result;
        }

        $tag  = strtolower($this->content->copyByToken('slash', true));
        $node = new HtmlNode($tag);

        // attributes
        while ( ! ($this->content->char() == '>' ||
            $this->content->char() == '/')) {
            $blank = $this->content->skipByToken('blank', true);
            if (empty($blank)) {
                $this->content->fastForward(1);
                continue;
            }

            $name = $this->content->copyByToken('equal', true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $this->content->skipByToken('blank');
                continue;
            }

            $this->content->skipByToken('blank');
            if ($this->content->char() != '=') {
                // no value attribute
                if ($this->options->strict) {
                    // can't have this in strict html
                    $character = $this->content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->$name = [
                    'value'       => null,
                    'doubleQuote' => true,
                ];
                if ($this->content->char() != '>') {
                    $this->content->rewind(1);
                }
                continue;
            }

            // attribute with a value
            $attr = [];
            $this->content->fastForward(1)
                          ->skipByToken('blank');
            $quote = $this->content->char();
            switch ($quote) {
                case '"':
                case "'":
                    // quoted value: both quote styles are read the same
                    // way, only the recorded quote type differs
                    $attr['doubleQuote'] = ($quote == '"');
                    $this->content->fastForward(1);
                    $string = $this->content->copyUntil($quote, true, true);
                    do {
                        $moreString = $this->content->copyUntilUnless($quote, '=>');
                        $string .= $moreString;
                    } while ( ! empty($moreString));
                    $attr['value'] = $string;
                    $this->content->fastForward(1);
                    $node->getTag()->$name = $attr;
                    break;
                default:
                    // unquoted value
                    $attr['doubleQuote']   = true;
                    $attr['value']         = $this->content->copyByToken('attr', true);
                    $node->getTag()->$name = $attr;
                    break;
            }
        }

        $this->content->skipByToken('blank');
        if ($this->content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $this->content->fastForward(1);
        } elseif (in_array($tag, $this->selfClosing)) {
            // Should be a self closing tag, check if we are strict
            if ($this->options->strict) {
                $character = $this->content->getPosition();
                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (in_array($tag, $this->noSlash)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        $this->content->fastForward(1);

        $result['status'] = true;
        $result['node']   = $node;

        return $result;
    }
732
733
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * The charset is read from the content attribute of the first
     * meta[http-equiv=Content-Type] element. The encoder, converting from
     * the detected charset (or the default charset when none is found) to
     * the default charset, is propagated through the root node.
     *
     * @return bool True only when a charset was found in the meta tag.
     */
    protected function detectCharset(): bool
    {
        // set the default
        $encode = new Encode;
        $encode->from($this->defaultCharset);
        $encode->to($this->defaultCharset);

        if ( ! is_null($this->options->enforceEncoding)) {
            //  they want to enforce the given encoding
            // NOTE(review): the encoder is configured but not propagated to
            // the root node on this path - confirm that this is intended.
            $encode->from($this->options->enforceEncoding);
            $encode->to($this->options->enforceEncoding);

            return false;
        }

        $meta    = $this->root->find('meta[http-equiv=Content-Type]', 0);
        $content = is_null($meta) ? null : $meta->content;

        // Match the charset parameter case-insensitively and stop at the
        // first white space, quote or ';' so that quotes and trailing
        // parameters do not end up in the charset name.
        $matches  = [];
        $detected = ! empty($content) &&
            preg_match('/charset\s*=\s*["\']?([^\s"\';]+)/i', $content, $matches) === 1;
        if ($detected) {
            $encode->from(trim($matches[1]));
        }

        // without a detected charset the defaults set above are propagated
        $this->root->propagateEncoding($encode);

        return $detected;
    }
780
}
781