Completed
Pull Request — master (#90)
by
unknown
03:17
created

Dom::findById()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\HtmlNode;
6
use PHPHtmlParser\Dom\TextNode;
7
use PHPHtmlParser\Exceptions\NotLoadedException;
8
use PHPHtmlParser\Exceptions\StrictException;
9
use stringEncode\Encode;
10
11
/**
12
 * Class Dom
13
 *
14
 * @package PHPHtmlParser
15
 */
16
class Dom
17
{
18
19
    /**
     * Charset the output should end up in. detectCharset() also uses it as
     * the assumed source charset until another one is found.
     *
     * @var string
     */
    protected $defaultCharset = 'UTF-8';

    /**
     * Root node of the dom tree; (re)created by parse().
     *
     * @var HtmlNode
     */
    public $root;

    /**
     * The document string exactly as it was passed to loadStr().
     *
     * @var string
     */
    protected $raw;

    /**
     * Wrapper around the cleaned document string that the parser reads
     * from. Stays null until a document has been loaded, which is what
     * isLoaded() checks.
     *
     * @var Content
     */
    protected $content = null;

    /**
     * Length (strlen) of the raw document string.
     *
     * @var int
     */
    protected $rawSize;

    /**
     * The size of the document after it is cleaned (set by loadStr()).
     *
     * @var int
     */
    protected $size;

    /**
     * Options supplied through setOptions(); loadStr() applies them to
     * every load before the per-call options.
     *
     * @var array
     */
    protected $globalOptions = [];

    /**
     * The option object consulted while cleaning and parsing. A fresh
     * instance is built by each loadStr() call.
     *
     * @var Options
     */
    protected $options;

    /**
     * Tag names that are always treated as self closing by parseTag().
     *
     * @var array
     */
    protected $selfClosing = [
        'img',
        'br',
        'input',
        'meta',
        'link',
        'hr',
        'base',
        'embed',
        'spacer',
    ];
92
93
    /**
     * Renders the dom as a string by delegating to the root node's
     * innerHtml().
     *
     * @return string
     */
    public function __toString()
    {
        $root = $this->root;

        return $root->innerHtml();
    }
102
103
    /**
     * Forwards reads of properties that are not accessible on the dom
     * itself to the root node.
     *
     * @param string $name Property to read from the root node.
     * @return mixed
     */
    public function __get($name)
    {
        $root = $this->root;

        return $root->{$name};
    }
113
114
    /**
     * Loads the dom from whatever the argument turns out to be: a path to
     * an existing file, an http(s) URL, or a literal document string.
     *
     * @param string $str
     * @param array $options
     * @return $this
     */
    public function load($str, $options = [])
    {
        // Only a single-line input can be a path; the newline test also
        // keeps is_file() from being called on whole documents.
        $isFile = strpos($str, "\n") === false && is_file($str);
        if ($isFile) {
            return $this->loadFromFile($str, $options);
        }

        // anything starting with http:// or https:// is fetched
        $isUrl = preg_match("/^https?:\/\//i", $str);
        if ($isUrl) {
            return $this->loadFromUrl($str, $options);
        }

        // otherwise the argument is the document itself
        return $this->loadStr($str, $options);
    }
134
135
    /**
     * Reads the given file (anything file_get_contents() accepts) and
     * parses its contents.
     *
     * @param string $file
     * @param array $options
     * @return $this
     */
    public function loadFromFile($file, $options = [])
    {
        $contents = file_get_contents($file);

        return $this->loadStr($contents, $options);
    }
146
147
    /**
     * Fetches the url through a curl interface implementation and parses
     * the response body.
     *
     * @param string $url
     * @param array $options
     * @param CurlInterface $curl Optional transport; a default Curl
     *                            instance is created when omitted.
     * @return $this
     */
    public function loadFromUrl($url, $options = [], CurlInterface $curl = null)
    {
        if ($curl === null) {
            // fall back to the default curl interface
            $curl = new Curl;
        }

        return $this->loadStr($curl->get($url), $options);
    }
166
167
    /**
     * Parses the html of the given string. Used by load(), loadFromFile(),
     * and loadFromUrl().
     *
     * Builds a fresh Options object (global options first, then the
     * per-call ones), records the raw string and its length, cleans the
     * input, parses it into the node tree and finally runs charset
     * detection.
     *
     * @param string $str
     * @param array $option
     * @return $this
     */
    public function loadStr($str, $option)
    {
        $this->options = new Options;
        $this->options->setOptions($this->globalOptions)
                      ->setOptions($option);

        $this->rawSize = strlen($str);
        $this->raw     = $str;

        $html = $this->clean($str);

        // The size is that of the cleaned markup; measuring $str here would
        // simply repeat rawSize.
        $this->size    = strlen($html);
        $this->content = new Content($html);

        $this->parse();
        $this->detectCharset();

        return $this;
    }
194
195
    /**
     * Replaces the global options array. loadStr() feeds these options
     * into the Options object of every subsequent load, ahead of the
     * options passed to the load call itself.
     *
     * @param array $options
     * @return $this
     */
    public function setOptions(array $options)
    {
        $this->globalOptions = $options;

        return $this;
    }
207
208
    /**
     * Runs a css selector against the root node.
     *
     * @param string $selector
     * @param int $nth Optional index of the single match to return.
     * @return array
     * @throws NotLoadedException
     */
    public function find($selector, $nth = null)
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->find($selector, $nth);
    }
221
222
    /**
     * Looks an element up by Id, starting at the root node.
     *
     * @param int $id Element Id
     * @return mixed
     * @throws NotLoadedException
     */
    public function findById($id)
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->findById($id);
    }
235
236
    /**
     * Appends one tag name, or every tag name of an array, to the list of
     * tags that are always self closing. Names already on the list are
     * appended again; no de-duplication takes place.
     *
     * @param string|array $tag
     * @return $this
     */
    public function addSelfClosingTag($tag)
    {
        $tags = is_array($tag) ? $tag : [$tag];

        foreach ($tags as $name) {
            $this->selfClosing[] = $name;
        }

        return $this;
    }
254
255
    /**
     * Takes one tag name, or every tag name of an array, off the list of
     * tags that are always self closing.
     *
     * @param string|array $tag
     * @return $this
     */
    public function removeSelfClosingTag($tag)
    {
        $unwanted = is_array($tag) ? $tag : [$tag];

        // array_diff() keeps the keys of the surviving entries
        $this->selfClosing = array_diff($this->selfClosing, $unwanted);

        return $this;
    }
271
272
    /**
     * Empties the list of always-self-closing tags, so that no tag is
     * forced to be self closing by name any more.
     *
     * @return $this
     */
    public function clearSelfClosingTags()
    {
        $this->selfClosing = [];

        return $this;
    }
283
284
    /**
     * Returns the first child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException
     */
    public function firstChild()
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->firstChild();
    }
295
296
    /**
     * Returns the last child of the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException
     */
    public function lastChild()
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->lastChild();
    }
307
308
    /**
     * Returns the root node's count of child elements.
     *
     * @return int
     * @throws NotLoadedException
     */
    public function countChildren()
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->countChildren();
    }
319
320
    /**
     * Returns the children of the root node as an array.
     *
     * @return array
     * @throws NotLoadedException
     */
    public function getChildren()
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->getChildren();
    }
331
332
    /**
     * Tells whether the root node has any child nodes.
     *
     * @return bool
     * @throws NotLoadedException
     */
    public function hasChildren()
    {
        $this->isLoaded();

        $root = $this->root;

        return $root->hasChildren();
    }
343
344
    /**
     * Simple wrapper function that returns an element by the id.
     *
     * The lookup runs the selector '#<id>' through find() and asks for the
     * first match only (index 0).
     *
     * @param string $id
     * @return \PHPHtmlParser\Dom\AbstractNode|null The matching node; null
     *         when nothing matches (detectCharset() depends on the same
     *         find(..., 0) behaviour).
     * @throws NotLoadedException
     */
    public function getElementById($id)
    {
        $this->isLoaded();

        return $this->find('#'.$id, 0);
    }
357
358
    /**
     * Returns all elements with the given tag name by using the name
     * itself as the selector for find().
     *
     * @param string $name
     * @return array
     * @throws NotLoadedException
     */
    public function getElementsByTag($name)
    {
        $this->isLoaded();

        $selector = $name;

        return $this->find($selector);
    }
371
372
    /**
     * Returns all elements with the given class name by running the
     * selector '.<class>' through find().
     *
     * @param string $class
     * @return array
     * @throws NotLoadedException
     */
    public function getElementsByClass($class)
    {
        $this->isLoaded();

        $selector = '.'.$class;

        return $this->find($selector);
    }
385
386
    /**
     * Guard used by the public accessors: fails unless one of the load
     * methods has populated the content.
     *
     * @throws NotLoadedException When no content has been loaded yet.
     */
    protected function isLoaded()
    {
        if ($this->content === null) {
            throw new NotLoadedException('Content is not loaded!');
        }
    }
397
398
    /**
     * Strips everything the parser should not see from the document
     * string: line breaks, the doctype, comments, cdata sections, smarty
     * blocks and, depending on the options, script tags, style tags and
     * server side scripts.
     *
     * The whole step is skipped when the 'cleanupInput' option is off.
     *
     * @param string $str
     * @return string
     */
    protected function clean($str)
    {
        if ( ! $this->options->get('cleanupInput')) {
            // cleanup is disabled, hand the input back untouched
            return $str;
        }

        // remove white space between a closing quote and the end of a tag
        $str = mb_eregi_replace("'\s+>", "'>", $str);
        $str = mb_eregi_replace('"\s+>', '">', $str);

        // line breaks become a space, or the &#10; entity when they have
        // to be preserved
        $lineBreak = $this->options->get('preserveLineBreaks') ? '&#10;' : ' ';
        $str       = str_replace(["\r\n", "\r", "\n"], $lineBreak, $str);

        // Patterns whose matches are removed, listed in the order in which
        // they are applied.
        $strip = [
            "<!doctype(.*?)>",        // the doctype
            "<!--(.*?)-->",           // comments
            "<!\[CDATA\[(.*?)\]\]>",  // cdata
        ];

        // <script> tags
        if ($this->options->get('removeScripts')) {
            $strip[] = "<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>";
            $strip[] = "<\s*script\s*>(.*?)<\s*/\s*script\s*>";
        }

        // <style> tags
        if ($this->options->get('removeStyles')) {
            $strip[] = "<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>";
            $strip[] = "<\s*style\s*>(.*?)<\s*/\s*style\s*>";
        }

        // server side scripts
        // NOTE(review): the option key is spelled 'serverSideScriptis' -
        // confirm against the Options defaults before renaming it.
        if ($this->options->get('serverSideScriptis')) {
            $strip[] = "(<\?)(.*?)(\?>)";
        }

        // smarty scripts
        $strip[] = "(\{\w)(.*?)(\})";

        foreach ($strip as $pattern) {
            $str = mb_eregi_replace($pattern, '', $str);
        }

        return $str;
    }
453
454
    /**
     * Builds the node tree from the loaded content.
     *
     * Starting with a fresh 'root' node, the content is consumed as
     * alternating runs of text and tags. Text becomes a TextNode child of
     * the node that is currently open, an opening tag becomes a child and
     * (unless self closing) the new open node, and a closing tag moves the
     * open node back up the tree. Parsing ends when parseTag() reports no
     * tag or when the open node becomes null.
     */
    protected function parse()
    {
        // add the root node
        $this->root = new HtmlNode('root');
        $current    = $this->root;

        while ($current !== null) {
            $text = $this->content->copyUntil('<');

            if ($text != '') {
                // Text in front of the next tag; whitespace-only text is
                // kept only when the whitespaceTextNode option is on.
                if ($this->options->whitespaceTextNode || trim($text) != '') {
                    $current->addChild(new TextNode($text));
                }
                continue;
            }

            $info = $this->parseTag();
            if ( ! $info['status']) {
                // no tag could be read, we are done here
                break;
            }

            if ($info['closing']) {
                // climb towards the root until the element this tag
                // closes is found
                $closed = $current;
                while ($closed !== null && $closed->getTag()->name() != $info['tag']) {
                    $closed = $closed->getParent();
                }
                if ($closed === null) {
                    // no opening tag found, fall back to the current node
                    $closed = $current;
                }

                // carry on in the parent of the node that was closed
                $current = $closed->getParent();
                continue;
            }

            if ( ! isset($info['node'])) {
                // the tag produced no node (ignored closing tag)
                continue;
            }

            /** @var AbstractNode $node */
            $node = $info['node'];
            $current->addChild($node);

            // only a node that is not self closing can take children
            if ( ! $node->getTag()->isSelfClosing()) {
                $current = $node;
            }
        }
    }
510
511
    /**
     * Attempts to read one tag from the current position of the content.
     *
     * The result always has the keys 'status', 'closing' and 'node':
     *  - 'status' is false when the content is not positioned on a '<';
     *  - for a closing tag 'closing' is true and 'tag' holds the lower
     *    cased tag name, unless the name is on the self closing list, in
     *    which case only 'status' is set;
     *  - for an opening tag 'node' holds the new HtmlNode including its
     *    attributes.
     *
     * @return array
     * @throws StrictException In strict mode, for an attribute without a
     *                         value or a self closing tag written without
     *                         the trailing slash.
     */
    protected function parseTag()
    {
        $result = [
            'status'  => false,
            'closing' => false,
            'node'    => null,
        ];

        if ($this->content->char() != '<') {
            // we are not at the beginning of a tag
            return $result;
        }

        // check if this is a closing tag
        if ($this->content->fastForward(1)->char() == '/') {
            $tag = $this->content->fastForward(1)
                                 ->copyByToken('slash', true);

            // move past the end of the tag
            $this->content->copyUntil('>');
            $this->content->fastForward(1);

            $tag              = strtolower($tag);
            $result['status'] = true;

            // a closing tag for a self closing element does not count
            if ( ! in_array($tag, $this->selfClosing)) {
                $result['closing'] = true;
                $result['tag']     = $tag;
            }

            return $result;
        }

        $tag  = strtolower($this->content->copyByToken('slash', true));
        $node = new HtmlNode($tag);

        // attributes
        while ($this->content->char() != '>' &&
            $this->content->char() != '/') {
            $space = $this->content->skipByToken('blank', true);
            if (empty($space)) {
                $this->content->fastForward(1);
                continue;
            }

            $name = $this->content->copyByToken('equal', true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $this->content->fastForward(1);
                continue;
            }

            $this->content->skipByToken('blank');

            if ($this->content->char() != '=') {
                // no value attribute
                if ($this->options->strict) {
                    // can't have this in strict html
                    $character = $this->content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }

                $node->getTag()->$name = [
                    'value'       => null,
                    'doubleQuote' => true,
                ];
                if ($this->content->char() != '>') {
                    $this->content->rewind(1);
                }
                continue;
            }

            // step over the '=' and any blanks in front of the value
            $this->content->fastForward(1)
                          ->skipByToken('blank');

            $opener = $this->content->char();
            if ($opener == '"' || $opener == "'") {
                // quoted value: read up to the matching quote
                $quote = ($opener == '"') ? '"' : "'";

                $this->content->fastForward(1);
                $value = $this->content->copyUntil($quote, true, true);
                do {
                    $extra  = $this->content->copyUntilUnless($quote, '=>');
                    $value .= $extra;
                } while ( ! empty($extra));

                // step over the closing quote
                $this->content->fastForward(1);

                $attr = [
                    'doubleQuote' => ($quote == '"'),
                    'value'       => $value,
                ];
            } else {
                // unquoted value
                $attr = [
                    'doubleQuote' => true,
                    'value'       => $this->content->copyByToken('attr', true),
                ];
            }

            $node->getTag()->$name = $attr;
        }

        $this->content->skipByToken('blank');
        if ($this->content->char() == '/') {
            // explicitly self closing tag
            $node->getTag()->selfClosing();
            $this->content->fastForward(1);
        } elseif (in_array($tag, $this->selfClosing)) {
            // Should be a self closing tag, check if we are strict
            if ($this->options->strict) {
                $character = $this->content->getPosition();
                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();
        }

        // step over the '>'
        $this->content->fastForward(1);

        $result['status'] = true;
        $result['node']   = $node;

        return $result;
    }
652
653
    /**
     * Attempts to detect the charset that the html was sent in and hands
     * the resulting Encode object to the root node.
     *
     * The source charset is read from the content attribute of a
     * meta[http-equiv=Content-Type] tag. Without such a tag, or without a
     * charset in it, the default charset is used as the source. When the
     * enforceEncoding option is set no detection takes place and nothing
     * is propagated to the root node.
     *
     * @return bool True only when a charset was read from the meta tag.
     */
    protected function detectCharset()
    {
        // start out converting from the default to the default
        $encode = new Encode;
        $encode->from($this->defaultCharset);
        $encode->to($this->defaultCharset);

        $enforced = $this->options->enforceEncoding;
        if ($enforced !== null) {
            // they want to enforce the given encoding
            $encode->from($enforced);
            $encode->to($enforced);

            return false;
        }

        $detected = false;

        $meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta !== null) {
            $content = $meta->content;
            $matches = [];
            if ( ! empty($content) && preg_match('/charset=(.+)/', $content, $matches)) {
                $encode->from(trim($matches[1]));
                $detected = true;
            }
        }

        // detected or not, the tree is told which encoding to use
        $this->root->propagateEncoding($encode);

        return $detected;
    }
700
}
701