Test Failed
Pull Request — master (#160)
by
unknown
11:10
created

Dom::__toString()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\HtmlNode;
6
use PHPHtmlParser\Dom\TextNode;
7
use PHPHtmlParser\Exceptions\NotLoadedException;
8
use PHPHtmlParser\Exceptions\StrictException;
9
use stringEncode\Encode;
10
11
/**
12
 * Class Dom
13
 *
14
 * @package PHPHtmlParser
15
 */
16
class Dom
17
{
18
19
    /**
     * Charset the output should be produced in. Used by detectCharset() as
     * both the source and the target charset until something more specific
     * is known.
     *
     * @var string
     */
    protected $defaultCharset = 'UTF-8';

    /**
     * Root node of this dom tree; assigned by parse().
     *
     * @var HtmlNode
     */
    public $root;

    /**
     * The document string exactly as it was handed to loadStr().
     *
     * @var string
     */
    protected $raw;

    /**
     * The document content being parsed. Stays null until a load method
     * has run, which is what isLoaded() checks.
     *
     * @var Content
     */
    protected $content = null;

    /**
     * Byte length of the raw document string.
     *
     * @var int
     */
    protected $rawSize;

    /**
     * The size of the document after it is cleaned.
     *
     * @var int
     */
    protected $size;

    /**
     * Options applied to every load call, set through setOptions().
     *
     * @var array
     */
    protected $globalOptions = [];

    /**
     * The option object built for the current load; it combines the global
     * options with the options passed to that load call.
     *
     * @var Options
     */
    protected $options;

    /**
     * Tag names that are always treated as self closing.
     *
     * @var array
     */
    protected $selfClosing = [
        'area', 'base', 'basefont', 'br',
        'col', 'embed', 'hr', 'img',
        'input', 'keygen', 'link', 'meta',
        'param', 'source', 'spacer', 'track',
        'wbr',
    ];

    /**
     * Self closing tag names that should be rendered without a trailing
     * "/>" (html5 style).
     *
     * @var array
     */
    protected $noSlash = [];
107
108
    /**
     * Renders the dom as a string: the inner html of the root node.
     *
     * NOTE(review): $this->root is only assigned in parse(); calling this
     * before a load method has run would dereference null - confirm that
     * callers never stringify an unloaded Dom.
     *
     * @return string
     */
    public function __toString()
    {
        $rootNode = $this->root;

        return $rootNode->innerHtml();
    }
117
118
    /**
     * Forwards reads of undefined properties to the root node, so that
     * $dom->foo yields the same value as $dom->root->foo.
     *
     * @param string $name Name of the property to read from the root node.
     * @return mixed
     */
    public function __get($name)
    {
        $rootNode = $this->root;

        return $rootNode->{$name};
    }
128
129
    /**
     * Loads the dom from whatever $str turns out to be: a path to an
     * existing file, an http(s) url, or a string of html.
     *
     * The node counter of AbstractNode is reset before anything is loaded.
     *
     * @param string $str     File path, url, or html string.
     * @param array  $options Options for this load call.
     * @return $this
     */
    public function load($str, $options = [])
    {
        AbstractNode::resetCount();

        // a string containing a line break is never tried as a file path
        $isSingleLine = (strpos($str, "\n") === false);
        if ($isSingleLine && is_file($str)) {
            return $this->loadFromFile($str, $options);
        }

        // anything starting with http:// or https:// is fetched as a url
        $isUrl = preg_match("/^https?:\/\//i", $str);
        if ($isUrl) {
            return $this->loadFromUrl($str, $options);
        }

        // everything else is parsed as html
        return $this->loadStr($str, $options);
    }
150
151
    /**
     * Reads a document with file_get_contents() and parses the result.
     *
     * NOTE(review): file_get_contents() returns false when the read fails;
     * that value is handed to loadStr() unchecked - confirm whether a failed
     * read should raise instead.
     *
     * @param string $file    Path (or url) understood by file_get_contents().
     * @param array  $options Options for this load call.
     * @return $this
     */
    public function loadFromFile($file, $options = [])
    {
        $contents = file_get_contents($file);

        return $this->loadStr($contents, $options);
    }
162
163
    /**
     * Fetches the content behind a url through a curl interface
     * implementation and parses it.
     *
     * @param string        $url     Url to fetch.
     * @param array         $options Options for this load call.
     * @param CurlInterface $curl    Implementation used for the request; a
     *                               new Curl instance is used when omitted.
     * @return $this
     */
    public function loadFromUrl($url, $options = [], CurlInterface $curl = null)
    {
        // fall back to the default curl interface when none was injected
        $client  = ($curl === null) ? new Curl : $curl;
        $content = $client->get($url);

        return $this->loadStr($content, $options);
    }
182
183
    /**
     * Parses the html of the given string. Used by load(), loadFromFile(),
     * and loadFromUrl().
     *
     * Steps, in order: build the Options object from the global options
     * overlaid with $option, record the raw string and its length, clean the
     * string, record the cleaned length, wrap the cleaned html in a Content
     * object, parse it into the node tree and finally detect the charset.
     *
     * @param string $str    The html to parse.
     * @param array  $option Options for this load call; they are applied
     *                       after the global options.
     * @return $this
     */
    public function loadStr($str, $option = [])
    {
        // options have to exist before clean() runs, since clean() reads them
        $this->options = new Options;
        $this->options->setOptions($this->globalOptions)
                      ->setOptions($option);

        $this->rawSize = strlen($str);
        $this->raw     = $str;

        $html = $this->clean($str);

        // $size is the size of the document after cleaning, so it has to be
        // measured on the cleaned html rather than on the raw input
        $this->size    = strlen($html);
        $this->content = new Content($html);

        $this->parse();
        $this->detectCharset();

        return $this;
    }
210
211
    /**
     * Replaces the global options array that every subsequent load call
     * applies before its own options.
     *
     * @param array $options The new global options.
     * @return $this
     */
    public function setOptions(array $options)
    {
        $this->globalOptions = $options;
        return $this;
    }
223
224
    /**
     * Runs a css selector against the root node.
     *
     * @param string $selector Css selector to search for.
     * @param int    $nth      Optional index; passed straight through to the
     *                         root node's find().
     * @return array|\PHPHtmlParser\Dom\AbstractNode Whatever the root node's
     *         find() returns. NOTE(review): presumably a collection without
     *         $nth and a single node (or null) with it - confirm against
     *         HtmlNode::find().
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function find($selector, $nth = null)
    {
        $this->isLoaded();
        $matches = $this->root->find($selector, $nth);

        return $matches;
    }
237
238
    /**
     * Looks up an element through the root node's findById().
     *
     * @param int $id Element Id
     * @return mixed Whatever the root node's findById() returns.
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function findById($id)
    {
        $this->isLoaded();
        $match = $this->root->findById($id);

        return $match;
    }
251
252
    /**
     * Appends a tag name (or every tag name in an array) to the list of
     * tags that are always treated as self closing.
     *
     * @param string|array $tag One tag name or an array of tag names.
     * @return $this
     */
    public function addSelfClosingTag($tag)
    {
        // normalise the single-tag form to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $tagName) {
            $this->selfClosing[] = $tagName;
        }

        return $this;
    }
270
271
    /**
     * Drops a tag name (or every tag name in an array) from the list of
     * tags that are always treated as self closing.
     *
     * @param string|array $tag One tag name or an array of tag names.
     * @return $this
     */
    public function removeSelfClosingTag($tag)
    {
        // normalise the single-tag form to a one element list
        $tags = is_array($tag) ? $tag : [$tag];

        $this->selfClosing = array_diff($this->selfClosing, $tags);

        return $this;
    }
287
288
    /**
     * Empties the list of always-self-closing tags, including the defaults.
     *
     * @return $this
     */
    public function clearSelfClosingTags()
    {
        $this->selfClosing = [];
        return $this;
    }
299
300
301
    /**
     * Appends a tag name (or every tag name in an array) to the list of
     * self closing tags that should not get a trailing slash.
     *
     * @param string|array $tag One tag name or an array of tag names.
     * @return $this
     */
    public function addNoSlashTag($tag)
    {
        // normalise the single-tag form to a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $tagName) {
            $this->noSlash[] = $tagName;
        }

        return $this;
    }
318
319
    /**
     * Drops a tag name (or every tag name in an array) from the list of
     * no-slash tags.
     *
     * @param string|array $tag One tag name or an array of tag names.
     * @return $this
     */
    public function removeNoSlashTag($tag)
    {
        // normalise the single-tag form to a one element list
        $tags = is_array($tag) ? $tag : [$tag];

        $this->noSlash = array_diff($this->noSlash, $tags);

        return $this;
    }
334
335
    /**
     * Resets the list of no-slash tags to an empty list.
     *
     * @return $this
     */
    public function clearNoSlashTags()
    {
        $this->noSlash = [];
        return $this;
    }
346
347
    /**
     * Returns the first child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function firstChild()
    {
        $this->isLoaded();
        $child = $this->root->firstChild();

        return $child;
    }
358
359
    /**
     * Returns the last child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function lastChild()
    {
        $this->isLoaded();
        $child = $this->root->lastChild();

        return $child;
    }
370
371
    /**
     * Returns the number of children as reported by the root node.
     *
     * @return int
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function countChildren()
    {
        $this->isLoaded();
        $total = $this->root->countChildren();

        return $total;
    }
382
383
    /**
     * Returns the children of the root node.
     *
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getChildren()
    {
        $this->isLoaded();
        $children = $this->root->getChildren();

        return $children;
    }
394
395
    /**
     * Tells whether the root node has any child nodes.
     *
     * @return bool
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function hasChildren()
    {
        $this->isLoaded();
        $hasAny = $this->root->hasChildren();

        return $hasAny;
    }
406
407
    /**
     * Returns an element by its id. The id is turned into the selector
     * "#<id>" and the match at index 0 is requested from find().
     *
     * @param string $id Id of the element to look for.
     * @return \PHPHtmlParser\Dom\AbstractNode|array Static analysis reports
     *         find() as array|AbstractNode, so the narrower documented type
     *         was inaccurate. NOTE(review): presumably a single node, or
     *         null when nothing matches - confirm against HtmlNode::find().
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementById($id)
    {
        $this->isLoaded();
        $selector = '#'.$id;

        return $this->find($selector, 0);
    }
420
421
    /**
     * Returns all elements with the given tag name, using the tag name
     * itself as the selector for find().
     *
     * @param string $name Tag name to search for.
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementsByTag($name)
    {
        $this->isLoaded();
        $elements = $this->find($name);

        return $elements;
    }
434
435
    /**
     * Returns all elements with the given class name, using ".<class>" as
     * the selector for find().
     *
     * @param string $class Class name to search for.
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementsByClass($class)
    {
        $this->isLoaded();
        $selector = '.'.$class;

        return $this->find($selector);
    }
448
449
    /**
     * Guard used by the public accessors: makes sure one of the load
     * methods has populated the content before the tree is touched.
     *
     * @throws NotLoadedException When the content is still null.
     */
    protected function isLoaded()
    {
        if ($this->content !== null) {
            return;
        }

        throw new NotLoadedException('Content is not loaded!');
    }
460
461
    /**
     * Strips non-html information from the document before it is parsed.
     *
     * When the 'cleanupInput' option is off the string is returned
     * untouched. Otherwise, in this order: white space between a quote and
     * a closing '>' is removed, line breaks are replaced, and the doctype,
     * comments and cdata sections are stripped. Script tags, style tags and
     * server side scripts are stripped only when their option is on.
     * Smarty-style "{word ...}" blocks are always stripped.
     *
     * @param string $str The raw document string.
     * @return string The cleaned document string.
     */
    protected function clean($str)
    {
        if ( ! $this->options->get('cleanupInput')) {
            // skip entire cleanup step
            return $str;
        }

        // remove white space before closing tags
        $str = mb_eregi_replace("'\s+>", "'>", $str);
        $str = mb_eregi_replace('"\s+>', '">', $str);

        // line breaks become a space, or an entity when they must survive
        $lineBreak = $this->options->get('preserveLineBreaks') ? '&#10;' : ' ';
        $str       = str_replace(["\r\n", "\r", "\n"], $lineBreak, $str);

        // strip the doctype, then comments, then cdata
        $alwaysStripped = [
            "<!doctype(.*?)>",
            "<!--(.*?)-->",
            "<!\[CDATA\[(.*?)\]\]>",
        ];
        foreach ($alwaysStripped as $pattern) {
            $str = mb_eregi_replace($pattern, '', $str);
        }

        // strip out <script> tags (with attributes first, then bare)
        if ($this->options->get('removeScripts')) {
            $scriptPatterns = [
                "<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>",
                "<\s*script\s*>(.*?)<\s*/\s*script\s*>",
            ];
            foreach ($scriptPatterns as $pattern) {
                $str = mb_eregi_replace($pattern, '', $str);
            }
        }

        // strip out <style> tags (with attributes first, then bare)
        if ($this->options->get('removeStyles')) {
            $stylePatterns = [
                "<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>",
                "<\s*style\s*>(.*?)<\s*/\s*style\s*>",
            ];
            foreach ($stylePatterns as $pattern) {
                $str = mb_eregi_replace($pattern, '', $str);
            }
        }

        // strip out server side scripts
        // NOTE(review): the option key is spelled 'serverSideScriptis' here;
        // verify against the Options class whether that is the intended name.
        if ($this->options->get('serverSideScriptis')) {
            $str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
        }

        // strip smarty scripts (not guarded by any option)
        $str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);

        return $str;
    }
516
517
    /**
     * Builds the node tree from the content.
     *
     * A fresh 'root' HtmlNode becomes the active node. The loop then
     * alternates between text (everything up to the next '<') and tags
     * (read by parseTag()) until parseTag() reports a false status or the
     * active node runs out.
     *
     * The 'whitespaceTextNode' and 'removeDoubleSpace' options are read as
     * properties of the Options object, i.e. through its magic getter.
     */
    protected function parse()
    {
        // add the root node
        $this->root = new HtmlNode('root');
        $current    = $this->root;

        while ($current !== null) {
            $text = $this->content->copyUntil('<');

            if ($text != '') {
                // text in front of the next tag; keep it unless it is pure
                // white space and white space text nodes are switched off
                if ($this->options->whitespaceTextNode || trim($text) != '') {
                    $current->addChild(new TextNode($text, $this->options->removeDoubleSpace));
                }
                continue;
            }

            $info = $this->parseTag();
            if ( ! $info['status']) {
                // we are done here
                break;
            }

            if ($info['closing']) {
                // walk up the ancestors until the matching opening tag shows up
                $match = $current;
                while ($match !== null && $match->getTag()->name() != $info['tag']) {
                    $match = $match->getParent();
                }
                if ($match === null) {
                    // we could not find opening tag, fall back to where we were
                    // NOTE(review): the parent is still taken below, so a stray
                    // closing tag also moves the active node up one level -
                    // confirm that this is intended.
                    $match = $current;
                }
                $current = $match->getParent();
                continue;
            }

            if ( ! isset($info['node'])) {
                // e.g. the closing form of an always-self-closing tag
                continue;
            }

            /** @var AbstractNode $node */
            $node = $info['node'];
            $current->addChild($node);

            // only descend into nodes that can hold children
            if ( ! $node->getTag()->isSelfClosing()) {
                $current = $node;
            }
        }
    }
573
574
    /**
     * Attempts to read one tag from the current position of the content.
     *
     * The returned array always holds 'status', 'closing' and 'node':
     *  - status  false when the content is not positioned on a '<';
     *  - closing true for a closing tag that is not in the self closing
     *            list, in which case 'tag' holds its lower-cased name;
     *  - node    the HtmlNode built for an opening tag, otherwise null.
     *
     * @return array
     * @throws StrictException In strict mode, for an attribute without a
     *                         value or for a listed self closing tag that is
     *                         not written as self closing.
     */
    protected function parseTag()
    {
        $result = [
            'status'  => false,
            'closing' => false,
            'node'    => null,
        ];

        if ($this->content->char() != '<') {
            // we are not at the beginning of a tag
            return $result;
        }

        // check if this is a closing tag
        if ($this->content->fastForward(1)->char() == '/') {
            $tag = $this->content->fastForward(1)
                                 ->copyByToken('slash', true);
            // move to end of tag
            $this->content->copyUntil('>');
            $this->content->fastForward(1);
            $tag = strtolower($tag);

            // the closing form of an always-self-closing tag does not count
            $result['status'] = true;
            if ( ! in_array($tag, $this->selfClosing)) {
                $result['closing'] = true;
                $result['tag']     = $tag;
            }

            return $result;
        }

        $tag  = strtolower($this->content->copyByToken('slash', true));
        $node = new HtmlNode($tag);

        // attributes
        while ($this->content->char() != '>' &&
            $this->content->char() != '/') {
            $space = $this->content->skipByToken('blank', true);
            if (empty($space)) {
                // nothing was skipped, step forward one character instead
                $this->content->fastForward(1);
                continue;
            }

            $name = $this->content->copyByToken('equal', true);
            if ($name == '/') {
                break;
            }

            $this->content->skipByToken('blank');
            if (empty($name)) {
                continue;
            }

            if ($this->content->char() != '=') {
                // no value attribute
                if ($this->options->strict) {
                    // can't have this in strict html
                    $character = $this->content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->$name = [
                    'value'       => null,
                    'doubleQuote' => true,
                ];
                if ($this->content->char() != '>') {
                    $this->content->rewind(1);
                }
                continue;
            }

            // step over the '=' and any blanks in front of the value
            $this->content->fastForward(1)
                          ->skipByToken('blank');
            switch ($this->content->char()) {
                case '"':
                    $attr = $this->readQuotedAttribute('"', true);
                    break;
                case "'":
                    $attr = $this->readQuotedAttribute("'", false);
                    break;
                default:
                    // unquoted value
                    $attr = [
                        'doubleQuote' => true,
                        'value'       => $this->content->copyByToken('attr', true),
                    ];
                    break;
            }
            $node->getTag()->$name = $attr;
        }

        $this->content->skipByToken('blank');
        if ($this->content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $this->content->fastForward(1);
        } elseif (in_array($tag, $this->selfClosing)) {
            // Should be a self closing tag, check if we are strict
            if ($this->options->strict) {
                $character = $this->content->getPosition();
                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (in_array($tag, $this->noSlash)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        // step over the closing '>'
        $this->content->fastForward(1);

        $result['status'] = true;
        $result['node']   = $node;

        return $result;
    }

    /**
     * Reads a quoted attribute value. The content has to be positioned on
     * the opening quote; afterwards it sits one character past the point
     * where reading stopped.
     *
     * @param string $quote       The quote character delimiting the value.
     * @param bool   $doubleQuote Value stored under the 'doubleQuote' key.
     * @return array Attribute array with the keys 'doubleQuote' and 'value'.
     */
    private function readQuotedAttribute($quote, $doubleQuote)
    {
        // step over the opening quote
        $this->content->fastForward(1);
        $value = $this->content->copyUntil($quote, true, true);

        // keep appending for as long as copyUntilUnless() yields more text
        // NOTE(review): presumably this covers a quote character inside the
        // value - confirm against Content::copyUntilUnless().
        do {
            $chunk  = $this->content->copyUntilUnless($quote, '=>');
            $value .= $chunk;
        } while ( ! empty($chunk));

        $this->content->fastForward(1);

        return [
            'doubleQuote' => $doubleQuote,
            'value'       => $value,
        ];
    }
722
723
    /**
     * Attempts to detect the charset the html was sent in and hands the
     * resulting Encode object to the node tree.
     *
     * The Encode object starts out converting from the default charset to
     * the default charset. When a <meta http-equiv=Content-Type> tag with a
     * "charset=..." value is found, that charset becomes the source charset.
     *
     * When the 'enforceEncoding' option is not null, the Encode object is
     * configured with it and the method returns without propagating.
     * NOTE(review): static analysis infers this option as boolean while
     * Encode::from()/to() expect a string; presumably it is meant to hold a
     * charset name - confirm, and confirm that skipping propagation in this
     * branch is intended.
     *
     * @return bool True only when a charset was read from the meta tag.
     */
    protected function detectCharset()
    {
        // set the default
        $encode = new Encode;
        $encode->from($this->defaultCharset);
        $encode->to($this->defaultCharset);

        if ( ! is_null($this->options->enforceEncoding)) {
            //  they want to enforce the given encoding
            $encode->from($this->options->enforceEncoding);
            $encode->to($this->options->enforceEncoding);

            return false;
        }

        $found = false;
        $meta  = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if ( ! is_null($meta)) {
            $content = $meta->content;
            $matches = [];
            // a missing content attribute or a missing charset leaves the default
            if ( ! empty($content) && preg_match('/charset=(.+)/', $content, $matches)) {
                $encode->from(trim($matches[1]));
                $found = true;
            }
        }

        // every non-enforced path propagates the encoding exactly once
        $this->root->propagateEncoding($encode);

        return $found;
    }
770
}
771