Passed
Push — master ( 268bdc...77e4a4 )
by Gilles
02:56
created

Dom   F

Complexity

Total Complexity 78

Size/Duplication

Total Lines 765
Duplicated Lines 0 %

Test Coverage

Coverage 92.61%

Importance

Changes 0
Metric Value
eloc 265
dl 0
loc 765
ccs 238
cts 257
cp 0.9261
rs 2.16
c 0
b 0
f 0
wmc 78

28 Methods

Rating   Name   Duplication   Size   Complexity  
A addSelfClosingTag() 0 10 3
A find() 0 5 1
A getChildren() 0 5 1
A removeNoSlashTag() 0 8 2
B clean() 0 48 6
A loadFromFile() 0 3 1
A removeSelfClosingTag() 0 8 2
A __toString() 0 3 1
A getElementsByClass() 0 5 1
A getElementById() 0 5 1
A isLoaded() 0 4 2
A addNoSlashTag() 0 10 3
A clearNoSlashTags() 0 5 1
A lastChild() 0 5 1
A __get() 0 3 1
A clearSelfClosingTags() 0 5 1
A countChildren() 0 5 1
A loadStr() 0 18 1
A load() 0 13 4
A setOptions() 0 5 1
A loadFromUrl() 0 9 2
A firstChild() 0 5 1
A getElementsByTag() 0 5 1
A findById() 0 5 1
A hasChildren() 0 5 1
D parseTag() 0 141 20
A detectCharset() 0 41 5
C parse() 0 52 12

How to fix   Complexity   

Complex Class

Complex classes like Dom often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Dom, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\HtmlNode;
6
use PHPHtmlParser\Dom\TextNode;
7
use PHPHtmlParser\Exceptions\NotLoadedException;
8
use PHPHtmlParser\Exceptions\StrictException;
9
use stringEncode\Encode;
10
11
/**
12
 * Class Dom
13
 *
14
 * @package PHPHtmlParser
15
 */
16
class Dom
17
{
18
19
    /**
20
     * The charset we would like the output to be in.
21
     *
22
     * @var string
23
     */
24
    protected $defaultCharset = 'UTF-8';
25
26
    /**
27
     * Contains the root node of this dom tree.
28
     *
29
     * @var HtmlNode
30
     */
31
    public $root;
32
33
    /**
34
     * The raw version of the document string.
35
     *
36
     * @var string
37
     */
38
    protected $raw;
39
40
    /**
41
     * The cleaned document wrapped in a Content object. Stays null until
     * loadStr() assigns it; isLoaded() relies on that to detect an unloaded Dom.
42
     *
43
     * @var Content|null
44
     */
45
    protected $content = null;
46
47
    /**
48
     * The original file size of the document.
49
     *
50
     * @var int
51
     */
52
    protected $rawSize;
53
54
    /**
55
     * The size of the document after it is cleaned.
56
     *
57
     * @var int
58
     */
59
    protected $size;
60
61
    /**
62
     * A global options array to be used by all load calls.
63
     *
64
     * @var array
65
     */
66
    protected $globalOptions = [];
67
68
    /**
69
     * A persistent option object to be used for all options in the
70
     * parsing of the file.
71
     *
72
     * @var Options
73
     */
74
    protected $options;
75
76
    /**
77
     * A list of tags which will always be self closing
78
     *
79
     * @var array
80
     */
81
    protected $selfClosing = [
82
        'area',
83
        'base',
84
        'basefont',
85
        'br',
86
        'col',
87
        'embed',
88
        'hr',
89
        'img',
90
        'input',
91
        'keygen',
92
        'link',
93
        'meta',
94
        'param',
95
        'source',
96
        'spacer',
97
        'track',
98
        'wbr'
99
    ];
100
101
    /**
102
     * A list of tags where there should be no /> at the end (html5 style)
103
     *
104
     * @var array
105
     */
106
    protected $noSlash = [];
107
108
    /**
     * Renders the dom as a string by delegating to the root node's
     * innerHtml().
     *
     * @return string
     */
    public function __toString(): string
    {
        $html = $this->root->innerHtml();

        return $html;
    }
117
118
    /**
     * Forwards reads of unknown properties to the root node.
     *
     * @param string $name name of the property to read from the root node
     * @return mixed
     */
    public function __get($name)
    {
        $value = $this->root->$name;

        return $value;
    }
128
129
    /**
     * Loads the dom from a file path, a URL, or a raw html string, tried in
     * that order. The node counter is reset before anything is loaded.
     *
     * @param string $str
     * @param array $options
     * @return Dom
     * @chainable
     */
    public function load(string $str, array $options = []): Dom
    {
        AbstractNode::resetCount();

        // input containing a line break is never probed as a file path
        $isSingleLine = strpos($str, "\n") === false;
        if ($isSingleLine && is_file($str)) {
            return $this->loadFromFile($str, $options);
        }

        // http:// or https:// prefix (case-insensitive) means a url
        $looksLikeUrl = preg_match("/^https?:\/\//i", $str);
        if ($looksLikeUrl) {
            return $this->loadFromUrl($str, $options);
        }

        // anything else is treated as the document itself
        return $this->loadStr($str, $options);
    }
151
152
    /**
     * Reads the given file (or any path file_get_contents() accepts) and
     * parses what was read.
     *
     * @param string $file
     * @param array $options
     * @return Dom
     * @chainable
     */
    public function loadFromFile(string $file, array $options = []): Dom
    {
        // NOTE(review): file_get_contents() returns false on failure and the
        // result is handed to loadStr() unchecked.
        $contents = file_get_contents($file);

        return $this->loadStr($contents, $options);
    }
164
165
    /**
     * Use a curl interface implementation to attempt to load
     * the content from a url.
     *
     * @param string $url
     * @param array $options
     * @param CurlInterface|null $curl transport to use; a default Curl
     *                                 instance is created when omitted
     * @return Dom
     * @chainable
     */
    public function loadFromUrl(string $url, array $options = [], ?CurlInterface $curl = null): Dom
    {
        // fall back to the default curl interface when none was injected
        $curl    = $curl ?? new Curl;
        $content = $curl->get($url);

        return $this->loadStr($content, $options);
    }
185
186
    /**
     * Parses the html of the given string. Used for load(), loadFromFile(),
     * and loadFromUrl().
     *
     * A fresh Options object is built on every call: the global options set
     * through setOptions() are applied first, then the options passed here.
     *
     * @param string $str
     * @param array $option
     * @return Dom
     * @chainable
     */
    public function loadStr(string $str, array $option = []): Dom
    {
        $this->options = new Options;
        $this->options->setOptions($this->globalOptions)
                      ->setOptions($option);

        $this->rawSize = strlen($str);
        $this->raw     = $str;

        $html = $this->clean($str);

        // size is the length of the cleaned document, so measure $html
        // rather than the raw input (which is already kept in rawSize)
        $this->size    = strlen($html);
        $this->content = new Content($html);

        $this->parse();
        $this->detectCharset();

        return $this;
    }
214
215
    /**
     * Stores the options array that every later load call starts from.
     * The previously stored array is replaced, not merged.
     *
     * @param array $options
     * @return Dom
     * @chainable
     */
    public function setOptions(array $options): Dom
    {
        $this->globalOptions = $options;

        return $this;
    }
228
229
    /**
     * Find elements by css selector on the root node.
     *
     * @param string $selector
     * @param int|null $nth forwarded unchanged to the root node's find()
     * @return mixed
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function find(string $selector, ?int $nth = null)
    {
        $this->isLoaded();

        return $this->root->find($selector, $nth);
    }
242
243
    /**
     * Looks a node up by its id through the root node's findById().
     *
     * @param int $id
     * @return mixed
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function findById(int $id)
    {
        $this->isLoaded();
        $match = $this->root->findById($id);

        return $match;
    }
255
256
    /**
     * Appends one tag name, or every tag name of an array, to the list of
     * tags that are always treated as self closing.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function addSelfClosingTag($tag): Dom
    {
        // a single tag is handled as a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $tagName) {
            $this->selfClosing[] = $tagName;
        }

        return $this;
    }
275
276
    /**
     * Drops one tag name, or every tag name of an array, from the list of
     * tags that are always treated as self closing.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function removeSelfClosingTag($tag): Dom
    {
        $unwanted          = is_array($tag) ? $tag : [$tag];
        $this->selfClosing = array_diff($this->selfClosing, $unwanted);

        return $this;
    }
293
294
    /**
     * Empties the list of always self closing tags, including the defaults.
     *
     * @return Dom
     * @chainable
     */
    public function clearSelfClosingTags(): Dom
    {
        $this->selfClosing = [];

        return $this;
    }
306
307
308
    /**
     * Appends one tag name, or every tag name of an array, to the list of
     * self closing tags that should not have a trailing slash.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function addNoSlashTag($tag): Dom
    {
        // a single tag is handled as a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $tagName) {
            $this->noSlash[] = $tagName;
        }

        return $this;
    }
326
327
    /**
     * Drops one tag name, or every tag name of an array, from the list of
     * no-slash tags.
     *
     * @param string|array $tag
     * @return Dom
     * @chainable
     */
    public function removeNoSlashTag($tag): Dom
    {
        $unwanted      = is_array($tag) ? $tag : [$tag];
        $this->noSlash = array_diff($this->noSlash, $unwanted);

        return $this;
    }
343
344
    /**
     * Resets the list of no-slash tags to an empty array.
     *
     * @return Dom
     * @chainable
     */
    public function clearNoSlashTags(): Dom
    {
        $this->noSlash = [];

        return $this;
    }
356
357
    /**
     * Returns the first child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function firstChild(): \PHPHtmlParser\Dom\AbstractNode
    {
        $this->isLoaded();
        $first = $this->root->firstChild();

        return $first;
    }
368
369
    /**
     * Returns the last child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function lastChild(): \PHPHtmlParser\Dom\AbstractNode
    {
        $this->isLoaded();
        $last = $this->root->lastChild();

        return $last;
    }
380
381
    /**
     * Returns the child count reported by the root node.
     *
     * @return int
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function countChildren(): int
    {
        $this->isLoaded();
        $count = $this->root->countChildren();

        return $count;
    }
392
393
    /**
     * Returns the children of the root node as an array.
     *
     * @return array
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function getChildren(): array
    {
        $this->isLoaded();
        $children = $this->root->getChildren();

        return $children;
    }
404
405
    /**
     * Tells whether the root node has any child nodes.
     *
     * @return bool
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function hasChildren(): bool
    {
        $this->isLoaded();
        $hasAny = $this->root->hasChildren();

        return $hasAny;
    }
416
417
    /**
     * Returns the first element matching the css selector "#<id>".
     *
     * @param string $id
     * @return \PHPHtmlParser\Dom\AbstractNode|null
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function getElementById($id)
    {
        $this->isLoaded();

        // index 0 asks find() for the first match only
        $selector = '#'.$id;

        return $this->find($selector, 0);
    }
430
431
    /**
     * Returns whatever find() yields when the tag name is used as the
     * css selector.
     *
     * @param string $name
     * @return mixed
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function getElementsByTag(string $name)
    {
        $this->isLoaded();
        $matches = $this->find($name);

        return $matches;
    }
444
445
    /**
     * Returns whatever find() yields for the css selector ".<class>".
     *
     * @param string $class
     * @return mixed
     * @throws NotLoadedException when no document has been loaded yet
     */
    public function getElementsByClass(string $class)
    {
        $this->isLoaded();

        $selector = '.'.$class;

        return $this->find($selector);
    }
458
459
    /**
     * Guard used by the query methods: fails when no content has been
     * set, i.e. none of the load methods has populated the dom yet.
     *
     * @throws NotLoadedException
     */
    protected function isLoaded(): void
    {
        if ($this->content === null) {
            throw new NotLoadedException('Content is not loaded!');
        }
    }
470
471
    /**
     * Cleans the html of any non-html information.
     *
     * Returns the input untouched when the 'cleanupInput' option is falsy.
     * Otherwise the replacements below run in a fixed order; the script,
     * style and server side script removals are each gated by their own
     * option, everything else is unconditional.
     *
     * @param string $str
     * @return string
     */
    protected function clean(string $str): string
    {
        if ( ! $this->options->get('cleanupInput')) {
            // skip entire cleanup step
            return $str;
        }

        // drop white space between a closing quote and '>'
        $str = mb_eregi_replace("'\s+>", "'>", $str);
        $str = mb_eregi_replace('"\s+>', '">', $str);

        // line breaks become a space, or the &#10; entity when they must survive
        $lineBreak = ' ';
        if ($this->options->get('preserveLineBreaks')) {
            $lineBreak = '&#10;';
        }
        $str = str_replace(["\r\n", "\r", "\n"], $lineBreak, $str);

        // doctype declaration
        $str = mb_eregi_replace("<!doctype(.*?)>", '', $str);

        // comments
        $str = mb_eregi_replace("<!--(.*?)-->", '', $str);

        // cdata sections
        $str = mb_eregi_replace("<!\[CDATA\[(.*?)\]\]>", '', $str);

        // <script> blocks, with and without attributes
        if ($this->options->get('removeScripts')) {
            $str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
            $str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
        }

        // <style> blocks, with and without attributes
        if ($this->options->get('removeStyles')) {
            $str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
            $str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
        }

        // server side scripts (<? ... ?>)
        // NOTE(review): the option key is spelled 'serverSideScriptis';
        // confirm against Options and existing callers before renaming it.
        if ($this->options->get('serverSideScriptis')) {
            $str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
        }

        // smarty scripts: '{' directly followed by a word character, up to the next '}'
        $str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);

        return $str;
    }
526
527
    /**
     * Builds the node tree from the content.
     *
     * Starts with a fresh 'root' HtmlNode and keeps an "active" node that
     * new children are attached to. Text before the next '<' becomes a
     * TextNode; otherwise parseTag() is consulted and the active node moves
     * down into newly opened tags or back up on closing tags. The loop ends
     * when parseTag() reports no tag or the active node becomes null.
     */
    protected function parse(): void
    {
        // add the root node
        $this->root = new HtmlNode('root');
        $activeNode = $this->root;

        while ( ! is_null($activeNode)) {
            $str = $this->content->copyUntil('<');

            if ($str != '') {
                // text: keep it unless it is pure whitespace and
                // whitespace text nodes are switched off
                if ($this->options->whitespaceTextNode ||
                    trim($str) != ''
                ) {
                    $textNode = new TextNode($str, $this->options->removeDoubleSpace);
                    $activeNode->addChild($textNode);
                }
                continue;
            }

            $info = $this->parseTag();
            if ( ! $info['status']) {
                // we are done here
                break;
            }

            if ($info['closing']) {
                // walk up from the active node to the nearest node with the
                // same tag name; without a match the closing tag is ignored
                $candidate = $activeNode;
                while ( ! is_null($candidate) &&
                    $candidate->getTag()->name() != $info['tag']) {
                    $candidate = $candidate->getParent();
                }
                if ( ! is_null($candidate)) {
                    // the matched element is closed, continue in its parent
                    $activeNode = $candidate->getParent();
                }
                continue;
            }

            if ( ! isset($info['node'])) {
                // a tag was consumed but produced no node
                continue;
            }

            /** @var AbstractNode $node */
            $node = $info['node'];
            $activeNode->addChild($node);

            // only tags that are not self closing can receive children
            if ( ! $node->getTag()->isSelfClosing()) {
                $activeNode = $node;
            }
        }
    }
585
586
    /**
     * Attempt to parse a tag out of the content at the current position.
     *
     * The returned array always holds:
     *  - 'status'  bool, false when the current character is not '<'
     *  - 'closing' bool, true for a closing tag that is not in the
     *              self closing list
     *  - 'node'    the new HtmlNode for an opening tag, null otherwise
     * and additionally 'tag' (lower cased name) when 'closing' is true.
     *
     * @return array
     * @throws StrictException in strict mode, for an attribute without a
     *                         value or a self closing tag written without '/'
     */
    protected function parseTag(): array
    {
        $return = [
            'status'  => false,
            'closing' => false,
            'node'    => null,
        ];

        // we are not at the beginning of a tag
        if ($this->content->char() != '<') {
            return $return;
        }

        // a '/' right after the first character marks an end tag
        if ($this->content->fastForward(1)->char() == '/') {
            $tag = $this->content->fastForward(1)
                                 ->copyByToken('slash', true);
            // move to end of tag
            $this->content->copyUntil('>');
            $this->content->fastForward(1);

            $tag              = strtolower($tag);
            $return['status'] = true;
            // a closing tag for a self closing element is consumed but
            // is not reported as closing anything
            if ( ! in_array($tag, $this->selfClosing)) {
                $return['closing'] = true;
                $return['tag']     = $tag;
            }

            return $return;
        }

        $tag  = strtolower($this->content->copyByToken('slash', true));
        $node = new HtmlNode($tag);

        // attributes
        while ($this->content->char() != '>' &&
            $this->content->char() != '/') {
            $space = $this->content->skipByToken('blank', true);
            if (empty($space)) {
                $this->content->fastForward(1);
                continue;
            }

            $name = $this->content->copyByToken('equal', true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $this->content->skipByToken('blank');
                continue;
            }

            $this->content->skipByToken('blank');
            if ($this->content->char() != '=') {
                // no value attribute
                if ($this->options->strict) {
                    // can't have this in strict html
                    $character = $this->content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->$name = [
                    'value'       => null,
                    'doubleQuote' => true,
                ];
                if ($this->content->char() != '>') {
                    $this->content->rewind(1);
                }
                continue;
            }

            // attribute with a value: step over '=' and any blanks
            $this->content->fastForward(1)
                          ->skipByToken('blank');

            $current = $this->content->char();
            if ($current == '"') {
                $quote = '"';
            } elseif ($current == "'") {
                $quote = "'";
            } else {
                $quote = null;
            }

            if (is_null($quote)) {
                // unquoted value, recorded as if it were double quoted
                $attr = [
                    'doubleQuote' => true,
                    'value'       => $this->content->copyByToken('attr', true),
                ];
            } else {
                // quoted value: step over the opening quote, then collect
                // text until copyUntilUnless() yields nothing more
                $this->content->fastForward(1);
                $value = $this->content->copyUntil($quote, true, true);
                do {
                    $more   = $this->content->copyUntilUnless($quote, '=>');
                    $value .= $more;
                } while ( ! empty($more));
                // step over the closing quote
                $this->content->fastForward(1);
                $attr = [
                    'doubleQuote' => $quote == '"',
                    'value'       => $value,
                ];
            }
            $node->getTag()->$name = $attr;
        }

        $this->content->skipByToken('blank');
        if ($this->content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $this->content->fastForward(1);
        } elseif (in_array($tag, $this->selfClosing)) {
            // Should be a self closing tag, check if we are strict
            if ($this->options->strict) {
                $character = $this->content->getPosition();
                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (in_array($tag, $this->noSlash)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        $this->content->fastForward(1);

        $return['status'] = true;
        $return['node']   = $node;

        return $return;
    }
734
735
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * The encoder starts out converting from and to the default charset.
     * When the enforceEncoding option is set, the method returns false
     * right away without propagating an encoder to the tree. Otherwise the
     * charset is read from a meta[http-equiv=Content-Type] tag if possible
     * and the encoder is propagated from the root node in every case.
     *
     * @return bool true only when a charset was read from the meta tag
     */
    protected function detectCharset(): bool
    {
        // set the default
        $encode = new Encode;
        $encode->from($this->defaultCharset);
        $encode->to($this->defaultCharset);

        if ( ! is_null($this->options->enforceEncoding)) {
            //  they want to enforce the given encoding
            $encode->from($this->options->enforceEncoding);
            $encode->to($this->options->enforceEncoding);

            return false;
        }

        $detected = false;
        $meta     = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if ( ! is_null($meta)) {
            $content = $meta->content;
            $matches = [];
            // a missing/empty content attribute or one without "charset="
            // leaves the defaults in place
            if ( ! empty($content) &&
                preg_match('/charset=(.+)/', $content, $matches)) {
                $encode->from(trim($matches[1]));
                $detected = true;
            }
        }

        $this->root->propagateEncoding($encode);

        return $detected;
    }
782
}
783