Completed
Push — master ( 162761...d36330 )
by Lars
01:44
created

HtmlDomParser::getElementsById()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 1
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[][]
50
     */
51
    protected static $domLinkReplaceHelper = [
52
        'orig' => ['[', ']', '{', '}'],
53
        'tmp'  => [
54
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
55
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
56
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
57
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
58
        ],
59
    ];
60
61
    /**
62
     * @var string[][]
63
     */
64
    protected static $domReplaceHelper = [
65
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
66
        'tmp'  => [
67
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
68
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
69
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
70
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
71
            '____SIMPLE_HTML_DOM__VOKU__AT____',
72
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
73
        ],
74
    ];
75
76
    /**
77
     * @var string
78
     */
79
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
80
81
    /**
82
     * @var string
83
     */
84
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
85
86
    /**
87
     * @var array
88
     */
89
    protected static $domBrokenReplaceHelper = [];
90
91
    /**
92
     * @var callable
93
     */
94
    protected static $callback;
95
96
    /**
97
     * @var string[]
98
     */
99
    protected $namespaces = [];
100
101
    /**
102
     * @var \DOMDocument
103
     */
104
    protected $document;
105
106
    /**
107
     * @var string
108
     */
109
    protected $encoding = 'UTF-8';
110
111
    /**
112
     * @var bool
113
     */
114
    protected $isDOMDocumentCreatedWithoutHtml = false;
115
116
    /**
117
     * @var bool
118
     */
119
    protected $isDOMDocumentCreatedWithoutWrapper = false;
120
121
    /**
122
     * @var bool
123
     */
124
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
125
126
    /**
127
     * @var bool
128
     */
129
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
130
131
    /**
132
     * @var bool
133
     */
134
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
135
136
    /**
137
     * @var bool
138
     */
139
    protected $keepBrokenHtml;
140
141
    /**
     * Build a parser around a fresh DOMDocument and optionally fill it.
     *
     * - SimpleHtmlDomInterface: its node (getNode()) is imported.
     * - \DOMNode: deep-imported into the new document and appended.
     * - any other non-null value: handed to loadHtml().
     *
     * The static $domBrokenReplaceHelper map is reset on every construction.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // reset
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // unwrap our own element wrapper down to the raw DOM node
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
177
178
    /**
     * Dispatch legacy method names (see $functionAliases) to their real
     * implementation. The lookup is done on the lower-cased name.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
194
195
    /**
     * Static factory shortcuts kept for API compatibility:
     * "str_get_html" and "file_get_html".
     *
     * The first argument (default: '') is passed on as HTML string / file
     * name, the second one (default: null) as extra libxml options.
     *
     * The method name is matched case-insensitively, the same way __call()
     * treats its aliases.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \RuntimeException
     * @throws \BadMethodCallException if $name is neither of the two shortcuts
     *
     * @return static result of loadHtml() / loadHtmlFile() called on a new static() instance
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        // PHP method names are case-insensitive, so normalise before matching.
        switch (\strtolower($name)) {
            case 'str_get_html':
                $parser = new static();

                return $parser->loadHtml($arguments0, $arguments1);

            case 'file_get_html':
                $parser = new static();

                return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist');
    }
224
225
    /**
     * Give the cloned parser its own copy of the DOMDocument instead of
     * sharing the instance held by the original.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
229
230
    /** @noinspection MagicMethodsValidityInspection */
231
232
    /**
     * Magic read access for the legacy properties. The name is compared
     * case-insensitively:
     * - "outerhtml" / "outertext" => html()
     * - "innerhtml" / "innertext" => innerHtml()
     * - "text" / "plaintext"      => text()
     *
     * @param string $name
     *
     * @return string|null null for every other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
255
256
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
266
267
    /**
     * String conversion yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
274
275
    /**
     * Does nothing and always reports success; kept only for
     * API-compatibility reasons.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
286
287
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * 1. Optionally pre-process broken markup (keepBrokenHtml()).
     * 2. Record what the input was missing (no markup at all, no leading tag,
     *    no "<html", no "<head>", only escaped "<\/script>" end tags), so that
     *    fixHtmlOutput() can strip what the parser adds on its own.
     * 3. Apply the script-tag workarounds.
     * 4. Try a strict XML parse via SimpleXML; if that fails or reports
     *    libxml errors, fall back to DOMDocument::loadHTML() with the
     *    "<?xml encoding=...>" prefix hack.
     *
     * libxml error handling (and, before PHP 8.0, the entity loader switch)
     * is restored to its previous state before returning.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument the document stored in $this->document
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0,
        // so it is only toggled (and later restored) on older runtimes.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            // only take over the document if the import really produced one
            if ($domElementTmp && $domElementTmp->ownerDocument instanceof \DOMDocument) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            // Only prepend the declaration when the input does not already start with one
            // (haystack first, needle second).
            $xmlHackUsed = false;
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
415
416
    /**
     * Decode HTML entities and raw-url-encoded sequences in $content.
     *
     * If the class \voku\helper\UTF8 is available, the work is delegated to
     * UTF8::rawurldecode(). Otherwise html_entity_decode() followed by
     * rawurldecode() is applied: once, or repeatedly until the string no
     * longer changes when $multiDecodeNewHtmlEntity is true.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        if (!$multiDecodeNewHtmlEntity) {
            return \rawurldecode(
                \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5)
            );
        }

        // keep decoding until a pass leaves the string untouched
        do {
            $previous = $content;
            $decoded = \html_entity_decode($previous, \ENT_QUOTES | \ENT_HTML5);
            $content = \rawurldecode($decoded);
        } while ($previous !== $content);

        return $content;
    }
457
458
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the current
     * document. Registered namespaces are made known to the XPath engine
     * before the query runs, so prefixed selectors can resolve.
     *
     * @param string   $selector
     * @param int|null $idx null: return all matches; otherwise the zero-based
     *                      position of the wanted match, negative values count
     *                      from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     *                      all matches (a SimpleHtmlDomNodeBlank if there are none), or the
     *                      single match at $idx (a SimpleHtmlDomBlank if there is none)
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);

        // register the namespaces (must happen before the query is evaluated)
        foreach ($this->namespaces as $namespace => $url) {
            $xPath->registerNamespace($namespace, $url);
        }

        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() yields false for an expression it cannot evaluate
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
500
501
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
512
513
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
524
525
    /**
     * Clean up serialized DOM output so that it resembles the original input.
     *
     * Based on the flags recorded while the document was created, wrapper
     * tags that were not part of the input are stripped again. Afterwards
     * internal helper tags are removed, duplicated head tags and "<br></br>"
     * are collapsed, entities are decoded and the preserved placeholders are
     * put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // Tags to strip, collected in the order in which they are removed.
        $tagsToStrip = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $tagsToStrip[] = '<body>';
            $tagsToStrip[] = '</body>';
            $tagsToStrip[] = '<html>';
            $tagsToStrip[] = '</html>';
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $tagsToStrip[] = '<head>';
            $tagsToStrip[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $tagsToStrip[] = '</script>';
        }

        $content = \str_replace($tagsToStrip, '', $content);

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // leading "<p>" only, but every "</p>"
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextLeftovers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextLeftovers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupSearch = [
            '<simpleHtmlDomP>',
            '</simpleHtmlDomP>',
            '<head><head>',
            '</head></head>',
            '<br></br>',
        ];
        $cleanupReplace = [
            '',
            '',
            '<head>',
            '</head>',
            '<br>',
        ];
        $content = \trim(\str_replace($cleanupSearch, $cleanupReplace, $content));

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
614
615
    /**
     * Get the DOMDocument instance this parser currently works on.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }
622
623
    /**
     * Return elements by .class.
     *
     * Shortcut for findMulti() with the selector ".<class>".
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
634
635
    /**
     * Return element by #id.
     *
     * Shortcut for findOne() with the selector "#<id>".
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
646
647
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface the wrapped first match, or a SimpleHtmlDomBlank if there is none
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
664
665
    /**
     * Returns elements by #id.
     *
     * Shortcut for find() with the selector "#<id>".
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx passed through to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
677
678
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx null: return all matches; otherwise the zero-based
     *                      position of the wanted match, negative values count
     *                      from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     *                      all matches (a SimpleHtmlDomNodeBlank if there are none), or the
     *                      single match at $idx (a SimpleHtmlDomBlank if there is none)
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a miss is a blank *element* (as in find()),
        // not a blank node list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
713
714
    /**
     * Get the encoding to use.
     *
     * Returns the $encoding property; the constructor and
     * createDOMDocument() use it for the DOMDocument.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        return $this->encoding;
    }
723
724
    /**
     * Whether the parsed input contained no "<head>" tag
     * (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
731
732
    /**
     * Whether the parsed input contained no "<" at all, i.e. no markup
     * (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
739
740
    /**
     * Whether the parsed input contained no "<html" tag
     * (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
747
748
    /**
     * Whether the parsed input contained markup but did not start with a
     * tag after leading whitespace (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
755
756
    /**
     * Get the list of registered namespaces as an array.
     *
     * These are the namespaces find() registers on its XPath object.
     *
     * @return array
     *   An array in form ['prefix' => 'namespace-uri']
     */
    public function getNamespaces(): array
    {
        return $this->namespaces;
    }
766
767
    /**
     * Get dom node's outer html.
     *
     * If a static callback is set, it is invoked first with a one-element
     * array containing this parser. When the input had no "<html" wrapper,
     * only the document element is serialized, otherwise the whole document.
     * The result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string if serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
792
793
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Escapes every "</" inside the content of <script> elements as "<\/".
     * $html is modified in place. If the regular expression engine fails
     * (preg_replace_callback() returns null), $html is left untouched
     * instead of being overwritten with null.
     *
     * @param string $html
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';
        $replaced = \preg_replace_callback(
            $regExSpecialScript,
            static function ($scripts) {
                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // null signals a PCRE error (e.g. backtrack limit hit): keep the input as is
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
811
812
    /**
     * Get dom node's inner html.
     *
     * Serializes every child node of the document element, concatenates the
     * results and passes them through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
832
833
    /**
     * Replace broken markup fragments with placeholder tokens.
     *
     * Works in three steps:
     * 1. Repeatedly mask the angle brackets of matching start/end tag pairs
     *    with temporary "°lt…°" / "°gt…°" tokens, until nothing changes.
     * 2. Repeatedly look for what is still matched as a broken fragment,
     *    unmask the temporary tokens inside it, remember it in
     *    self::$domBrokenReplaceHelper ('orig' => fragment, 'tmp' => token)
     *    and replace it with a token built from a fixed prefix and crc32()
     *    of the fragment, until nothing changes.
     * 3. Turn the remaining temporary tokens back into real angle brackets.
     *
     * @param string $html
     *
     * @return string the HTML with broken fragments replaced by tokens
     */
    protected function keepBrokenHtml(string $html): string
    {
        // step 1: mask well-formed <tag ...>...</tag> pairs, one pair per pass
        do {
            $original = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );
        } while ($original !== $html);

        // step 2: swap each remaining broken fragment for a token and remember the original
        do {
            $original = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) {
                    // the stored fragment must contain real brackets, not the temporary masks
                    $matches['broken'] = \str_replace(
                        ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
                        ['</', '<', '>'],
                        $matches['broken']
                    );

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );
        } while ($original !== $html);

        // step 3: unmask everything that was only masked in step 1
        return \str_replace(
            ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
            ['</', '<', '>'],
            $html
        );
    }
883
884
    /**
     * Rename <script type="text/html"> blocks to the special helper tag.
     *
     * Each matched block has its "<script" prefix and "</script>" suffix
     * replaced by the tag name stored in self::$domHtmlSpecialScriptHelper,
     * and escaped closing tags ("<\/") inside it are unescaped.
     *
     * @param string $html modified in place (passed by reference)
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $pattern = '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU';

        $found = [];
        \preg_match_all($pattern, $html, $found);

        if (!isset($found[0])) {
            return;
        }

        $helperTag = self::$domHtmlSpecialScriptHelper;
        $prefixLength = \strlen('<script');
        $suffixLength = \strlen('</script>');

        foreach ($found[0] as $scriptBlock) {
            // every match starts with "<script" and ends with "</script>",
            // so slicing both off leaves the attributes and the content
            $attributesAndContent = \substr($scriptBlock, $prefixLength, -$suffixLength);
            $replacement = '<' . $helperTag . $attributesAndContent . '</' . $helperTag . '>';

            // remove the html5 fallback
            $replacement = \str_replace('<\/', '</', $replacement);

            $html = \str_replace($scriptBlock, $replacement, $html);
        }
    }
905
906
    /**
     * Load HTML from string.
     *
     * Builds a new document via createDOMDocument() and stores it as the
     * current document of this parser.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions forwarded unchanged to createDOMDocument()
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
920
921
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" are not checked for
     * existence; every other path must exist on the filesystem. The content
     * is read with UTF8::file_get_contents() when the voku UTF8 helper class
     * is available, otherwise with the native file_get_contents(), and is
     * then handed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // chain the original exception so the root cause is not lost
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
958
959
    /**
     * Swap the temporary placeholders in the given html back to their originals.
     *
     * The placeholder/original lists are assembled once per request from
     * self::$domLinkReplaceHelper, self::$domReplaceHelper, the html wrapper
     * tag and the special script tag, and are then kept in a static cache.
     * Tokens recorded in self::$domBrokenReplaceHelper are restored first.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            // both lists must keep the same element order: str_replace() pairs them by position
            $DOM_REPLACE__HELPER_CACHE = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . self::$domHtmlWrapperHelper . '>',
                        'html_wrapper__end'          => '</' . self::$domHtmlWrapperHelper . '>',
                        'html_special_script__start' => '<' . self::$domHtmlSpecialScriptHelper,
                        'html_special_script__end'   => '</' . self::$domHtmlSpecialScriptHelper . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        $hasBrokenTokens = isset(self::$domBrokenReplaceHelper['tmp'])
                           && \count(self::$domBrokenReplaceHelper['tmp']) > 0;

        if ($hasBrokenTokens) {
            $html = \str_replace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }
1001
1002
    /**
     * Register a namespace to be used in xpath queries.
     *
     * Stores the url in $this->namespaces under the given prefix; an entry
     * that already exists for the prefix is overwritten.
     *
     * @param string $prefix
     *   Namespace prefix to register
     * @param string $url
     *   Canonical URL for this namespace prefix
     */
    protected function registerNamespace($prefix, $url)
    {
        $this->namespaces[$prefix] = $url;
    }
1014
1015
    /**
     * Replace special sequences in the html with temporary placeholders.
     *
     * Always applies the self::$domReplaceHelper 'orig' => 'tmp' mapping.
     * When the html contains URLs, each URL is additionally replaced by a
     * copy in which the self::$domLinkReplaceHelper mapping has been applied;
     * those URL replacements run before the general mapping.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // default: only the general replacement table
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';

            $urlMatches = [];
            \preg_match_all($regExUrl, $html, $urlMatches);

            if (!empty($urlMatches[1])) {
                $urls = (array) $urlMatches[1];

                $maskedUrls = [];
                foreach ($urls as $index => $url) {
                    $maskedUrls[$index] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $url
                    );
                }

                // URL pairs go first so they are replaced before the general table
                $search = \array_merge($urls, $search);
                $replace = \array_merge($maskedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
1055
1056
    /**
     * Save the html-dom as string.
     *
     * Returns the result of innerHtml(). When a non-empty file path is given,
     * the same string is also written to that file under an exclusive lock.
     *
     * @param string $filepath target file, or '' to skip writing
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $html = $this->innerHtml();

        if ($filepath === '') {
            return $html;
        }

        \file_put_contents($filepath, $html, \LOCK_EX);

        return $html;
    }
1072
1073
    /**
     * Store the given callable in the static $callback property.
     *
     * The property is resolved through late static binding (static::), and
     * because it is static the value is not tied to this instance.
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }
1080
1081
    /**
     * Get dom node's plain text.
     *
     * Reads the document's textContent and runs it through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1092
1093
    /**
     * Set the "keep broken html" flag of this parser.
     *
     * @param bool $keepBrokenHtml value stored in $this->keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1104
1105
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialized with saveXML(). With $htmlToXml the result
     * goes through fixHtmlOutput(); otherwise html entities are decoded via
     * decodeHtmlEntity() and the placeholders are restored with
     * putReplacedBackToPreserveHtmlEntities().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                select the fixHtmlOutput() path
     * @param bool $removeXmlHeader          strip "<?xml ... ?>" and trim leading whitespace
     * @param int  $options                  passed as second argument to saveXML()
     *
     * @return string
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $xml);
            $xml = \ltrim($withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $decoded = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
1137
}
1138