Completed
Pull Request — master (#90)
by
unknown
03:21
created

Dom::countChildren()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\HtmlNode;
6
use PHPHtmlParser\Dom\TextNode;
7
use PHPHtmlParser\Exceptions\NotLoadedException;
8
use PHPHtmlParser\Exceptions\StrictException;
9
use stringEncode\Encode;
10
11
/**
12
 * Class Dom
13
 *
14
 * @package PHPHtmlParser
15
 */
16
class Dom
17
{
18
19
    /**
20
     * The charset we would like the output to be in.
21
     *
22
     * @var string
23
     */
24
    protected $defaultCharset = 'UTF-8';
25
26
    /**
27
     * Contains the root node of this dom tree.
28
     *
29
     * @var HtmlNode
30
     */
31
    public $root;
32
33
    /**
34
     * The raw version of the document string.
35
     *
36
     * @var string
37
     */
38
    protected $raw;
39
40
    /**
41
     * The Content object wrapping the cleaned document string.
42
     *
43
     * @var Content
44
     */
45
    protected $content = null;
46
47
    /**
48
     * The original file size of the document.
49
     *
50
     * @var int
51
     */
52
    protected $rawSize;
53
54
    /**
55
     * The size of the document after it is cleaned.
56
     *
57
     * @var int
58
     */
59
    protected $size;
60
61
    /**
62
     * A global options array to be used by all load calls.
63
     *
64
     * @var array
65
     */
66
    protected $globalOptions = [];
67
68
    /**
69
     * A persistent option object to be used for all options in the
70
     * parsing of the file.
71
     *
72
     * @var Options
73
     */
74
    protected $options;
75
76
    /**
77
     * A list of tags which will always be self closing
78
     *
79
     * @var array
80
     */
81
    protected $selfClosing = [
82
        'img',
83
        'br',
84
        'input',
85
        'meta',
86
        'link',
87
        'hr',
88
        'base',
89
        'embed',
90
        'spacer',
91
    ];
92
93
    /**
     * Converts the dom to a string by delegating to innerHtml() on the
     * root node.
     *
     * @return string
     */
    public function __toString()
    {
        $rootNode = $this->root;

        return $rootNode->innerHtml();
    }
102
103
    /**
     * Magic getter that forwards the requested property to the root node.
     *
     * @param string $name Name of the property to read from the root node.
     * @return mixed Whatever the root node yields for that property.
     */
    public function __get($name)
    {
        $rootNode = $this->root;

        return $rootNode->{$name};
    }
113
114
    /**
     * Loads the dom from a file path, a URL, or a raw html string,
     * choosing the loader by inspecting the given string.
     *
     * The checks run in this order: existing file, http(s) URL, raw string.
     *
     * @param string $str     File path, URL, or html markup.
     * @param array  $options Options forwarded to the selected loader.
     * @return $this
     */
    public function load($str, $options = [])
    {
        AbstractNode::resetCount();

        // only a string without line feeds is tested against the filesystem
        $isSingleLine = strpos($str, "\n") === false;
        if ($isSingleLine && is_file($str)) {
            return $this->loadFromFile($str, $options);
        }

        // an http:// or https:// prefix (case-insensitive) is treated as a url
        $isUrl = preg_match("/^https?:\/\//i", $str);
        if ($isUrl) {
            return $this->loadFromUrl($str, $options);
        }

        // anything else is parsed as markup
        return $this->loadStr($str, $options);
    }
135
136
    /**
     * Reads the given file (or any location file_get_contents() can read)
     * and parses its contents.
     *
     * NOTE(review): the result of file_get_contents() is passed on
     * unchecked, so a failed read is not reported by this method.
     *
     * @param string $file    Location handed to file_get_contents().
     * @param array  $options Options forwarded to loadStr().
     * @return $this
     */
    public function loadFromFile($file, $options = [])
    {
        $contents = file_get_contents($file);

        return $this->loadStr($contents, $options);
    }
147
148
    /**
     * Fetches the content of a url through a curl interface implementation
     * and parses it.
     *
     * @param string        $url     Address passed to the curl interface's get().
     * @param array         $options Options forwarded to loadStr().
     * @param CurlInterface $curl    Optional fetcher; a new Curl is used when omitted.
     * @return $this
     */
    public function loadFromUrl($url, $options = [], CurlInterface $curl = null)
    {
        // fall back to the default curl interface when none was supplied
        $client = ($curl === null) ? new Curl : $curl;

        return $this->loadStr($client->get($url), $options);
    }
167
168
    /**
     * Parses the html of the given string. Used by load(), loadFromFile(),
     * and loadFromUrl().
     *
     * A fresh Options object is built for every call: the global options
     * are applied first and the per-call options second. The raw string
     * and its length are recorded, the string is cleaned, and the cleaned
     * markup is then parsed and its charset detected.
     *
     * @param string $str    The html markup to parse.
     * @param array  $option Per-call options, applied after the global options.
     * @return $this
     */
    public function loadStr($str, $option)
    {
        $this->options = new Options;
        $this->options->setOptions($this->globalOptions)
                      ->setOptions($option);

        $this->rawSize = strlen($str);
        $this->raw     = $str;

        $html = $this->clean($str);

        // size is the length of the cleaned markup, not of the raw input
        $this->size    = strlen($html);
        $this->content = new Content($html);

        $this->parse();
        $this->detectCharset();

        return $this;
    }
195
196
    /**
     * Replaces the global options array. loadStr() applies these options
     * before the options passed to the individual load call.
     *
     * @param array $options The new global options.
     * @return $this
     */
    public function setOptions(array $options)
    {
        $this->globalOptions = $options;

        return $this;
    }
208
209
    /**
     * Runs a css selector search starting at the root node.
     *
     * @param string $selector Css selector handed to the root node.
     * @param int    $nth      Optional index of a single match to return.
     * @return mixed Result of the root node's find().
     *               NOTE(review): appears to be a collection when $nth is
     *               omitted and a single node when it is given - confirm
     *               against HtmlNode::find().
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function find($selector, $nth = null)
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->find($selector, $nth);
    }
222
223
    /**
     * Looks up an element through the root node's findById().
     *
     * @param int $id Element Id
     * @return mixed Whatever the root node's findById() returns.
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function findById($id)
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->findById($id);
    }
236
237
    /**
     * Appends one tag name, or every value of an array of tag names, to
     * the list of tags that are always treated as self closing.
     *
     * @param string|array $tag A single tag name or an array of tag names.
     * @return $this
     */
    public function addSelfClosingTag($tag)
    {
        // normalise a single tag name into a one element list
        $tags = is_array($tag) ? $tag : [$tag];
        foreach ($tags as $tagName) {
            $this->selfClosing[] = $tagName;
        }

        return $this;
    }
255
256
    /**
     * Drops one tag name, or every tag name in an array, from the list of
     * tags that are always treated as self closing.
     *
     * The list is rebuilt with array_diff(), so the keys of the remaining
     * entries are kept as they were.
     *
     * @param string|array $tag A single tag name or an array of tag names.
     * @return $this
     */
    public function removeSelfClosingTag($tag)
    {
        // normalise a single tag name into a one element list
        $tags = is_array($tag) ? $tag : [$tag];

        $this->selfClosing = array_diff($this->selfClosing, $tags);

        return $this;
    }
272
273
    /**
     * Empties the list of tags that are always treated as self closing.
     *
     * @return $this
     */
    public function clearSelfClosingTags()
    {
        $this->selfClosing = [];

        return $this;
    }
284
285
    /**
     * Returns the result of firstChild() on the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function firstChild()
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->firstChild();
    }
296
297
    /**
     * Returns the result of lastChild() on the root node.
     *
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function lastChild()
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->lastChild();
    }
308
309
    /**
     * Returns the result of countChildren() on the root node.
     *
     * @return int
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function countChildren()
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->countChildren();
    }
320
321
    /**
     * Returns the result of getChildren() on the root node.
     *
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getChildren()
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->getChildren();
    }
332
333
    /**
     * Returns the result of hasChildren() on the root node.
     *
     * @return bool
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function hasChildren()
    {
        $this->isLoaded();
        $rootNode = $this->root;

        return $rootNode->hasChildren();
    }
344
345
    /**
     * Finds an element by its id by running the selector '#<id>' through
     * find() and asking for the first match (index 0).
     *
     * NOTE(review): static analysis reports the find() result as
     * array|AbstractNode, which is wider than the single node documented
     * here - confirm what find() yields when nothing matches.
     *
     * @param string $id Id of the element, without the leading '#'.
     * @return \PHPHtmlParser\Dom\AbstractNode
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementById($id)
    {
        $this->isLoaded();
        $selector = '#'.$id;

        return $this->find($selector, 0);
    }
358
359
    /**
     * Finds all elements with the given tag name by passing the name to
     * find() as the selector.
     *
     * @param string $name Tag name used verbatim as the selector.
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementsByTag($name)
    {
        $this->isLoaded();
        $selector = $name;

        return $this->find($selector);
    }
372
373
    /**
     * Finds all elements with the given class name by running the
     * selector '.<class>' through find().
     *
     * @param string $class Class name, without the leading '.'.
     * @return array
     * @throws NotLoadedException When no content has been loaded yet.
     */
    public function getElementsByClass($class)
    {
        $this->isLoaded();
        $selector = '.'.$class;

        return $this->find($selector);
    }
386
387
    /**
     * Guards the query methods: throws unless a load method has already
     * populated the content property.
     *
     * @throws NotLoadedException When the content property is still null.
     */
    protected function isLoaded()
    {
        if ($this->content === null) {
            throw new NotLoadedException('Content is not loaded!');
        }
    }
398
399
    /**
     * Strips non-html information from the markup before it is parsed.
     *
     * When the 'cleanupInput' option is not truthy the string is returned
     * untouched. Otherwise the steps run in this order:
     *  1. whitespace between a closing quote and '>' is removed,
     *  2. line breaks become a space, or '&#10;' with 'preserveLineBreaks',
     *  3. the doctype, comments and CDATA sections are removed,
     *  4. script blocks are removed when 'removeScripts' is truthy,
     *  5. style blocks are removed when 'removeStyles' is truthy,
     *  6. server side '<? ... ?>' blocks are removed when the
     *     'serverSideScriptis' option is truthy,
     *  7. smarty style '{x ... }' blocks are removed.
     *
     * NOTE(review): the option key 'serverSideScriptis' looks like a
     * misspelling - confirm against the Options class before renaming it.
     *
     * @param string $str The raw markup.
     * @return string The cleaned markup.
     */
    protected function clean($str)
    {
        if ( ! $this->options->get('cleanupInput')) {
            // skip entire cleanup step
            return $str;
        }

        // remove white space before closing tags
        $str = mb_eregi_replace("'\s+>", "'>", $str);
        $str = mb_eregi_replace('"\s+>', '">', $str);

        // clean out the \n\r
        $lineBreak = $this->options->get('preserveLineBreaks') ? '&#10;' : ' ';
        $str       = str_replace(["\r\n", "\r", "\n"], $lineBreak, $str);

        // collect, in order, every pattern whose matches are deleted outright:
        // doctype, comments and cdata are always stripped
        $patterns = [
            "<!doctype(.*?)>",
            "<!--(.*?)-->",
            "<!\[CDATA\[(.*?)\]\]>",
        ];

        // <script> tags
        if ($this->options->get('removeScripts')) {
            $patterns[] = "<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>";
            $patterns[] = "<\s*script\s*>(.*?)<\s*/\s*script\s*>";
        }

        // <style> tags
        if ($this->options->get('removeStyles')) {
            $patterns[] = "<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>";
            $patterns[] = "<\s*style\s*>(.*?)<\s*/\s*style\s*>";
        }

        // server side scripts
        if ($this->options->get('serverSideScriptis')) {
            $patterns[] = "(<\?)(.*?)(\?>)";
        }

        // smarty scripts
        $patterns[] = "(\{\w)(.*?)(\})";

        foreach ($patterns as $pattern) {
            $str = mb_eregi_replace($pattern, '', $str);
        }

        return $str;
    }
454
455
    /**
     * Attempts to parse the html in content into a node tree under a new
     * root node.
     *
     * The content is consumed alternately as text (everything up to the
     * next '<') and as tags (via parseTag()). Opening tags become children
     * of the currently active node and, unless self closing, become the
     * active node themselves. A closing tag moves the active node to the
     * parent of the nearest ancestor with the same tag name. A closing tag
     * with no matching open ancestor is ignored, so a stray closing tag
     * neither pops a level nor ends parsing early.
     *
     * Parsing stops when parseTag() reports a false status or when the
     * active node becomes null (the root has been closed).
     */
    protected function parse()
    {
        // add the root node
        $this->root = new HtmlNode('root');
        $activeNode = $this->root;
        while ( ! is_null($activeNode)) {
            $str = $this->content->copyUntil('<');
            if ($str == '') {
                $info = $this->parseTag();
                if ( ! $info['status']) {
                    // we are done here
                    $activeNode = null;
                    continue;
                }

                // check if it was a closing tag
                if ($info['closing']) {
                    // walk up from the active node looking for the opening tag
                    $openingNode = $activeNode;
                    while ( ! is_null($openingNode) &&
                        $openingNode->getTag()->name() != $info['tag']
                    ) {
                        $openingNode = $openingNode->getParent();
                    }

                    // only leave the current level when the opening tag was
                    // found; otherwise the closing tag is skipped
                    if ( ! is_null($openingNode)) {
                        $activeNode = $openingNode->getParent();
                    }
                    continue;
                }

                if ( ! isset($info['node'])) {
                    // a tag was consumed but produced no node
                    continue;
                }

                /** @var AbstractNode $node */
                $node = $info['node'];
                $activeNode->addChild($node);

                // descend into the node unless it is self closing
                if ( ! $node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } else if ($this->options->whitespaceTextNode ||
                trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str);
                $activeNode->addChild($textNode);
            }
        }
    }
511
512
    /**
     * Attempts to parse a single tag at the current position of content.
     *
     * The returned array always has the keys 'status', 'closing' and
     * 'node':
     *  - 'status' is false when the current character is not '<',
     *    true otherwise;
     *  - 'closing' is true for a closing tag whose name is not in the
     *    self closing list, in which case 'tag' holds the lower-cased name;
     *  - 'node' holds the new HtmlNode for an opening tag, null otherwise.
     *
     * @return array
     * @throws StrictException In strict mode, when an attribute has no value
     *                         or a tag from the self closing list is not
     *                         written as self closing.
     */
    protected function parseTag()
    {
        $return = [
            'status'  => false,
            'closing' => false,
            'node'    => null,
        ];
        if ($this->content->char() != '<') {
            // we are not at the beginning of a tag
            return $return;
        }

        // check if this is a closing tag
        if ($this->content->fastForward(1)->char() == '/') {
            // end tag
            $closingName = $this->content->fastForward(1)
                                         ->copyByToken('slash', true);
            // move to end of tag
            $this->content->copyUntil('>');
            $this->content->fastForward(1);

            $closingName      = strtolower($closingName);
            $return['status'] = true;

            // closing tags of always-self-closing elements do not count
            if ( ! in_array($closingName, $this->selfClosing)) {
                $return['closing'] = true;
                $return['tag']     = $closingName;
            }

            return $return;
        }

        $tag  = strtolower($this->content->copyByToken('slash', true));
        $node = new HtmlNode($tag);

        // attributes
        while ($this->content->char() != '>' &&
            $this->content->char() != '/') {
            $space = $this->content->skipByToken('blank', true);
            if (empty($space)) {
                // nothing was skipped, step over the current character
                $this->content->fastForward(1);
                continue;
            }

            $name = $this->content->copyByToken('equal', true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $this->content->fastForward(1);
                continue;
            }

            $this->content->skipByToken('blank');
            if ($this->content->char() != '=') {
                // no value attribute
                if ($this->options->strict) {
                    // can't have this in strict html
                    $character = $this->content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->$name = [
                    'value'       => null,
                    'doubleQuote' => true,
                ];
                if ($this->content->char() != '>') {
                    $this->content->rewind(1);
                }
                continue;
            }

            $attr = [];
            $this->content->fastForward(1)
                          ->skipByToken('blank');
            $opening = $this->content->char();
            if ($opening == '"' || $opening == "'") {
                // quoted value: remember which quote character was used
                $attr['doubleQuote'] = ($opening == '"');
                $quote               = $attr['doubleQuote'] ? '"' : "'";

                $this->content->fastForward(1);
                $string = $this->content->copyUntil($quote, true, true);
                // keep appending for as long as copyUntilUnless() returns more
                do {
                    $moreString = $this->content->copyUntilUnless($quote, '=>');
                    $string .= $moreString;
                } while ( ! empty($moreString));
                $attr['value'] = $string;
                // step over the closing quote
                $this->content->fastForward(1);
            } else {
                // unquoted value
                $attr['doubleQuote'] = true;
                $attr['value']       = $this->content->copyByToken('attr', true);
            }
            $node->getTag()->$name = $attr;
        }

        $this->content->skipByToken('blank');
        if ($this->content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $this->content->fastForward(1);
        } elseif (in_array($tag, $this->selfClosing)) {

            // Should be a self closing tag, check if we are strict
            if ($this->options->strict) {
                $character = $this->content->getPosition();
                throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();
        }

        // step over the closing '>'
        $this->content->fastForward(1);

        $return['status'] = true;
        $return['node']   = $node;

        return $return;
    }
653
654
    /**
     * Attempts to detect the charset that the html was sent in and
     * propagates the resulting encoding to the node tree.
     *
     * An Encode object is set up to convert from and to the default
     * charset. If a 'meta[http-equiv=Content-Type]' element exists and its
     * content attribute contains 'charset=...', that value becomes the
     * source charset.
     *
     * NOTE(review): when the enforceEncoding option is set, the Encode
     * object is configured with it but is not propagated to the root node
     * before returning. Static analysis also reports that the option is
     * documented as boolean although a charset string is expected here -
     * confirm against the Options class.
     *
     * @return bool True only when a charset was read from the meta tag.
     */
    protected function detectCharset()
    {
        // set the default
        $encode = new Encode;
        $encode->from($this->defaultCharset);
        $encode->to($this->defaultCharset);

        $enforced = $this->options->enforceEncoding;
        if ($enforced !== null) {
            //  they want to enforce the given encoding
            $encode->from($this->options->enforceEncoding);
            $encode->to($this->options->enforceEncoding);

            return false;
        }

        $detected = false;
        $meta     = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if ( ! is_null($meta)) {
            $content = $meta->content;
            $matches = [];
            // only a non-empty content attribute is searched for a charset
            if ( ! empty($content) && preg_match('/charset=(.+)/', $content, $matches)) {
                $encode->from(trim($matches[1]));
                $detected = true;
            }
        }

        // detected or not, the encoding is always pushed down the tree
        $this->root->propagateEncoding($encode);

        return $detected;
    }
701
}
702