Completed
Push — master ( 593209...b6a8dc )
by Lars
02:22
created

HtmlDomParser::find()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 31

Duplication

Lines 31
Ratio 100 %

Code Coverage

Tests 15
CRAP Score 6

Importance

Changes 0
Metric Value
nc 8
dl 31
loc 31
ccs 15
cts 15
cp 1
c 0
b 0
f 0
cc 6
nop 2
crap 6
rs 8.8017
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * The properties specified for each special script tag is an array.
60
     *
61
     * ```php
62
     * protected $specialScriptTags = [
63
     *     'text/html',
64
     *     'text/x-custom-template',
65
     *     'text/x-handlebars-template'
66
     * ]
67
     * ```
68
     *
69
     * @var string[]
70
     */
71
    protected $specialScriptTags = [
72
        'text/html',
73
        'text/x-custom-template',
74
        'text/x-handlebars-template',
75
    ];
76
77
    /**
78
     * @var string[]
79
     */
80
    protected $selfClosingTags = [
81
        'area',
82
        'base',
83
        'br',
84
        'col',
85
        'command',
86
        'embed',
87
        'hr',
88
        'img',
89
        'input',
90
        'keygen',
91
        'link',
92
        'meta',
93
        'param',
94
        'source',
95
        'track',
96
        'wbr',
97
    ];
98
99
    /**
100
     * @var bool
101
     */
102
    protected $isDOMDocumentCreatedWithoutHtml = false;
103
104
    /**
105
     * @var bool
106
     */
107
    protected $isDOMDocumentCreatedWithoutWrapper = false;
108
109
    /**
110
     * @var bool
111
     */
112
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
113
114
    /**
115
     * @var bool
116
     */
117
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
118
119
    /**
120
     * @var bool
121
     */
122
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
123
124
    /**
125
     * @var bool
126
     */
127
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
128
129
    /**
130
     * @var bool
131
     */
132
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
133
134
    /**
135
     * @var bool
136
     */
137
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
138
139
    /**
140
     * @var bool
141
     */
142
    protected $keepBrokenHtml = false;
143
144
    /**
     * Create a parser around a fresh DOMDocument, optionally seeded with content.
     *
     * A SimpleHtmlDomInterface is unwrapped to its node first. A \DOMNode is
     * deep-imported and appended to the new document; any other non-null value
     * is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;

        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }

            return;
        }

        if ($source === null) {
            return;
        }

        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);
    }
175
176
    /**
     * Dispatch a legacy method alias (see self::$functionAliases) to the real method.
     *
     * The lookup is case-insensitive because the name is lower-cased first.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, self::$functionAliases[$alias]], $arguments);
    }
192
193
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The first argument (default '') is the HTML string or the file path, the
     * second (default null) is passed on as the extra libxml options.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        switch ($name) {
            case 'str_get_html':
                return (new static())->loadHtml($source, $libXMLExtraOptions);

            case 'file_get_html':
                return (new static())->loadHtmlFile($source, $libXMLExtraOptions);
        }

        throw new \BadMethodCallException('Method does not exist');
    }
222
223
    /** @noinspection MagicMethodsValidityInspection */
224
225
    /**
     * Magic read access for the legacy properties (case-insensitive).
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
     * innerHtml(), and "text"/"plaintext" to text(). Anything else yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
248
249
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
256
257
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        $cleared = true;

        return $cleared;
    }
268
269
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     *  1. strip any content in front of the DOCTYPE,
     *  2. optionally pre-process broken HTML (keepBrokenHtml),
     *  3. record which wrappers (html/head/body/p/...) are missing from the input
     *     in the isDOMDocumentCreatedWith* flags,
     *  4. normalise the markup (trailing content after </html>, script tags,
     *     self-closing tags),
     *  5. try to parse via SimpleXML and fall back to DOMDocument::loadHTML(),
     *  6. restore the libxml settings that were changed for parsing.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument the document that is also stored in $this->document
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                // The capture is anchored at the start of the input, so cut it off by
                // (byte) length. A plain str_replace() would also delete every later
                // occurrence of the same text inside the document.
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // No tag at all vs. leading plain text in front of the first tag.
        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped end tag ("<\/script>") is present, no real "</script>".
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // Move content that follows "</html>" in front of the closing tag.
        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): both helpers ignore their return value here, so they
            // presumably modify $html by reference — confirm in the parent class.
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // Rewrite "<br>" style void tags to "<br/>".
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Wrap the input into a helper element so that it has a single root.
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict XML parsing; only accepted when libxml reported no errors.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // Fallback: the HTML parser of DOMDocument.
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the stripos() arguments look swapped (haystack/needle); as
            // written the encoding prefix is added for practically every non-empty
            // input. Left unchanged on purpose — confirm the intent before "fixing" it.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            if ($html !== '') {
                $this->document->loadHTML($html, $optionsXml);
            }

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
473
474
    /**
     * Find list of nodes with a CSS selector.
     *
     * Without $idx the whole result list is returned (a SimpleHtmlDomNodeBlank
     * when nothing matched). With $idx a single element is returned; negative
     * values count from the end, and a missing position yields a SimpleHtmlDomBlank.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $found = new SimpleHtmlDomNode();
        if ($matches) {
            foreach ($matches as $domNode) {
                $found[] = new SimpleHtmlDom($domNode);
            }
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($found) + $idx : $idx;

            return $found[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
    }
513
514
    /**
     * Find nodes with a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
525
526
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $allMatches = $this->find($selector, null);

        return $allMatches instanceof SimpleHtmlDomNodeBlank ? false : $allMatches;
    }
543
544
    /**
     * Find one node with a CSS selector (the first match of find()).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
555
556
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
573
574
    /**
     * Strip the markup that the parser added around the original input.
     *
     * Each wrapper (html/head/body/p/doctype/...) is removed only when the
     * matching isDOMDocumentCreatedWith* flag reports that the input did not
     * contain it. Afterwards the internal helper tags are dropped, HTML
     * entities are decoded and the preserved entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // leading "<p>" first, then every "</p>"
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $closingVoidTags = [];
        foreach ($this->selfClosingTags as $tagName) {
            $closingVoidTags[] = '</' . $tagName . '>';
        }
        $content = \str_replace($closingVoidTags, '', $content);

        // drop the internal helper tags and collapse doubled head tags
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanup), \array_values($cleanup), $content)
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
690
691
    /**
     * Return elements by ".class".
     *
     * The class name is prefixed with "." and passed to findMulti() as a CSS selector.
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->findMulti('.' . $class);
    }
702
703
    /**
     * Return element by #id.
     *
     * The id is prefixed with "#" and passed to findOne() as a CSS selector.
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->findOne('#' . $id);
    }
714
715
    /**
     * Return element by tag name.
     *
     * Only the first element with that tag name is used; when there is none,
     * a SimpleHtmlDomBlank is returned.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first === null ? new SimpleHtmlDomBlank() : new SimpleHtmlDom($first);
    }
732
733
    /**
     * Returns elements by "#id".
     *
     * The id is prefixed with "#" and passed to find() together with $idx.
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->find('#' . $id, $idx);
    }
745
746
    /**
     * Returns elements by tag name.
     *
     * Without $idx the whole result list is returned (a SimpleHtmlDomNodeBlank
     * when nothing matched). With $idx a single element is returned; negative
     * values count from the end, and a missing position yields a
     * SimpleHtmlDomBlank, the same single-element fallback that find() uses.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; the fallback is the blank *element*, not the blank
        // *list*, because the caller asked for a single element
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
781
782
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked first. When the input
     * had no html wrapper only the document element is serialised, otherwise
     * the whole document. A failed serialisation yields an empty string; any
     * other result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
807
808
    /**
     * Load HTML from string.
     *
     * Clears the static broken-HTML replacement helper, then rebuilds the
     * document via createDOMDocument().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
825
826
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" (case-insensitive) skip the
     * file_exists() check; any other path must exist. The content is read with
     * UTF8::file_get_contents() when the class \voku\helper\UTF8 exists, and
     * with the native file_get_contents() otherwise, then handed to loadHtml().
     *
     * NOTE(review): a static-analysis report flagged this method as duplicated
     * elsewhere in the project.
     *
     * @param string   $filePath           local path or http(s) URL
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException if a non-URL path does not exist, or the content cannot be read
     *
     * @return HtmlDomParser this instance, as returned by loadHtml()
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $isHttpUrl = \preg_match("/^https?:\/\//i", $filePath) === 1;
        if (!$isHttpUrl && !\file_exists($filePath)) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause is not lost.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
866
867
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialised with DOMDocument::saveXML(). Depending on
     * $htmlToXml the result either goes through fixHtmlOutput(), or is entity-
     * decoded and has the preserved HTML entities put back.
     *
     * NOTE(review): a static-analysis report flagged this method as duplicated
     * elsewhere in the project.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process via fixHtmlOutput();
     *                                       false: decode entities and restore preserved ones
     * @param bool $removeXmlHeader          strip the "<?xml ... ?>" declaration and leading whitespace
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string the resulting XML, or '' when saveXML() returns false
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $serialized = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $serialized));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity)
        );
    }
902
903
    /**
     * Allow the parser object to be called like a function; delegates to find().
     *
     * NOTE(review): a static-analysis report could not parse the generic
     * `SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>` part of the return
     * doc-type below; it is kept as written.
     *
     * @param string $selector passed through to find()
     * @param int    $idx      passed through to find(); defaults to null
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        return $this->find($selector, $idx);
    }
913
914
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutHeadWrapper flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded
     * markup had no <head> wrapper; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
921
922
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutPTagWrapper flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded
     * markup had no <p> wrapper; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
929
930
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutHtml flag.
     *
     * NOTE(review): judging by the name, the flag records that the input
     * contained no HTML markup; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
937
938
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutBodyWrapper flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded
     * markup had no <body> wrapper; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
945
946
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutHtmlWrapper flag.
     *
     * html() consults this value to decide whether to serialise only the
     * document element or the whole document.
     *
     * NOTE(review): judging by the name, the flag records that the loaded
     * markup had no <html> wrapper; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
953
954
    /**
     * Read accessor for the $isDOMDocumentCreatedWithoutWrapper flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded
     * markup had no wrapping element; it is assigned outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
961
962
    /**
     * Read accessor for the $isDOMDocumentCreatedWithFakeEndScript flag.
     *
     * NOTE(review): judging by the name, the flag records that a fake closing
     * script tag was involved when the document was created; it is assigned
     * outside this section — confirm there.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
969
970
    /**
     * Replace unpaired closing-tag fragments with placeholder tokens so they
     * can be restored later.
     *
     * Works in three steps:
     *  1. Repeatedly rewrite the angle brackets of elements that have a matching
     *     open/close pair into "°…_simple_html_dom__voku_°" markers, until
     *     nothing changes any more.
     *  2. Repeatedly take what still matches the "broken" pattern (closing-tag
     *     fragments), record the original text in
     *     self::$domBrokenReplaceHelper['orig'] and substitute a token built from
     *     self::$domHtmlBrokenHtmlHelper plus the crc32 of the fragment (recorded
     *     in self::$domBrokenReplaceHelper['tmp']).
     *  3. Turn the markers from step 1 back into real angle brackets.
     *
     * If PCRE reports an error (preg_replace_callback() returns null, e.g. when
     * the backtrack limit is hit) the affected step stops and the last good
     * markup is kept, instead of the input being collapsed to an empty string.
     *
     * @param string $html markup that may contain broken html
     *
     * @return string markup with broken fragments replaced by tokens
     */
    protected function keepBrokenHtml(string $html): string
    {
        $markers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Step 1: mask every properly paired element.
        do {
            $previous = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error: keep the last good markup rather than wiping it.
                break;
            }

            $html = $masked;
        } while ($previous !== $html);

        // Step 2: swap the remaining broken fragments for tokens.
        do {
            $previous = $html;

            $tokenized = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($markers, $brackets) {
                    // The fragment may contain markers from step 1; store it
                    // with real angle brackets.
                    $matches['broken'] = \str_replace($markers, $brackets, $matches['broken']);

                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);
                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $matches['start'] . $token . $matches['end'];
                },
                $html
            );

            if ($tokenized === null) {
                // PCRE error: keep the last good markup rather than wiping it.
                break;
            }

            $html = $tokenized;
        } while ($previous !== $html);

        // Step 3: unmask the paired elements again.
        return \str_replace($markers, $brackets, $html);
    }
1020
1021
    /**
     * Protect the content of "special" script tags before parsing.
     *
     * Matches <script ... type="..."> elements whose type is one of
     * $this->specialScriptTags and rewrites them in place:
     *  - if the inner content contains one of
     *    $this->templateLogicSyntaxInSpecialScriptTags, the content is stored in
     *    self::$domBrokenReplaceHelper and replaced by a token built from
     *    self::$domHtmlBrokenHtmlHelper plus the crc32 of the content;
     *  - otherwise the <script> / </script> tag names are replaced by
     *    self::$domHtmlSpecialScriptHelper.
     * In both cases the escaped closing-tag form "<\/" is normalised to "</".
     *
     * If PCRE reports an error (preg_replace_callback() returns null) $html is
     * left untouched instead of being overwritten with an empty string.
     *
     * @param string $html markup to process; modified by reference
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $quotedTypes = [];
        foreach ($this->specialScriptTags as $scriptType) {
            $quotedTypes[] = \preg_quote($scriptType, '/');
        }
        $tags = \implode('|', $quotedTypes);

        $processed = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntax) {
                    if (\strpos($matches['innerContent'], $logicSyntax) === false) {
                        continue;
                    }

                    // remove the html5 fallback
                    $innerContent = \str_replace('<\/', '</', $matches['innerContent']);

                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($innerContent);
                    self::$domBrokenReplaceHelper['orig'][] = $innerContent;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $matches['start'] . $token . $matches['end'];
                }

                // remove the html5 fallback
                $element = \str_replace('<\/', '</', $matches[0]);

                // Swap the opening "<script" and the closing "</script>" for the helper tag name.
                $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($element, \strlen('<script'));

                return \substr($renamed, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        if ($processed !== null) {
            $html = $processed;
        }
    }
1063
1064
    /**
     * Store the given value in the $keepBrokenHtml setting.
     *
     * NOTE(review): presumably controls whether keepBrokenHtml() is applied
     * while a document is created — the consumer is outside this section; confirm there.
     *
     * @param bool $keepBrokenHtml new value of the setting
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1075
1076
    /**
     * Replace the list of template-logic markers searched for inside special
     * script tags (see keepSpecialScriptTags()).
     *
     * Every entry must be a string; the list is only stored once all entries
     * have been checked.
     *
     * NOTE(review): the exception message names
     * "setTemplateLogicSyntaxInSpecialScriptTags", which differs from this
     * method's name; it is kept unchanged for backward compatibility.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1093
1094
    /**
     * Replace the list of script "type" values that keepSpecialScriptTags()
     * treats as special.
     *
     * Every entry must be a string; the list is only stored once all entries
     * have been checked.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        foreach ($specialScriptTags as $scriptType) {
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1111
}
1112