Completed
Push — master ( 3dc188...ddb6ef )
by Lars
01:45
created

HtmlDomParser::setCallbackXPathBeforeQuery()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
ccs 3
cts 3
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var callable|null
38
     *
39
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
40
     */
41
    private $callbackXPathBeforeQuery;
42
43
    /**
44
     * @var callable|null
45
     *
46
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
47
     */
48
    private $callbackBeforeCreateDom;
49
50
    /**
51
     * @var string[]
52
     */
53
    protected static $functionAliases = [
54
        'outertext' => 'html',
55
        'outerhtml' => 'html',
56
        'innertext' => 'innerHtml',
57
        'innerhtml' => 'innerHtml',
58
        'load'      => 'loadHtml',
59
        'load_file' => 'loadHtmlFile',
60
    ];
61
62
    /**
63
     * @var string[]
64
     */
65
    protected $templateLogicSyntaxInSpecialScriptTags = [
66
        '+',
67
        '<%',
68
        '{%',
69
        '{{',
70
    ];
71
72
    /**
73
     * The properties specified for each special script tag is an array.
74
     *
75
     * ```php
76
     * protected $specialScriptTags = [
77
     *     'text/html',
78
     *     'text/x-custom-template',
79
     *     'text/x-handlebars-template'
80
     * ]
81
     * ```
82
     *
83
     * @var string[]
84
     */
85
    protected $specialScriptTags = [
86
        'text/html',
87
        'text/x-custom-template',
88
        'text/x-handlebars-template',
89
    ];
90
91
    /**
92
     * @var string[]
93
     */
94
    protected $selfClosingTags = [
95
        'area',
96
        'base',
97
        'br',
98
        'col',
99
        'command',
100
        'embed',
101
        'hr',
102
        'img',
103
        'input',
104
        'keygen',
105
        'link',
106
        'meta',
107
        'param',
108
        'source',
109
        'track',
110
        'wbr',
111
    ];
112
113
    /**
114
     * @var bool
115
     */
116
    protected $isDOMDocumentCreatedWithoutHtml = false;
117
118
    /**
119
     * @var bool
120
     */
121
    protected $isDOMDocumentCreatedWithoutWrapper = false;
122
123
    /**
124
     * @var bool
125
     */
126
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
127
128
    /**
129
     * @var bool
130
     */
131
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
132
133
    /**
134
     * @var bool
135
     */
136
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
137
138
    /**
139
     * @var bool
140
     */
141
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
142
143
    /**
144
     * @var bool
145
     */
146
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
147
148
    /**
149
     * @var bool
150
     */
151
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
152
153
    /**
154
     * @var bool
155
     */
156
    protected $keepBrokenHtml = false;
157
158
    /**
     * Create a parser, optionally seeded with markup or with an existing node.
     *
     * A fresh DOMDocument is always created first. A SimpleHtmlDomInterface is
     * unwrapped to its underlying node, a \DOMNode is imported (deep) into the
     * new document, and any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // Unwrap the library's element wrapper so that only a raw node, markup or null remains.
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // The node belongs to another document, so it is copied (including children) before attaching.
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
189
190
    /**
     * Forward calls made through a legacy method name to the real method
     * registered for it in self::$functionAliases.
     *
     * @param string $name      method name as called; looked up in lower case
     * @param array  $arguments arguments passed through unchanged
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);
        $target = self::$functionAliases[$name] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $name);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
206
207
    /**
     * Static entry points kept for API compatibility: "str_get_html" and
     * "file_get_html" create a new parser and load the given input into it.
     *
     * The name is matched case-insensitively, like PHP method names in general
     * and like the instance-level __call() of this class.
     *
     * @param string $name      "str_get_html" or "file_get_html"
     * @param array  $arguments [0] HTML string or file path (defaults to ''),
     *                          [1] extra libxml options (defaults to null)
     *
     * @throws \BadMethodCallException if the name is not one of the supported aliases
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';

        $arguments1 = $arguments[1] ?? null;

        switch (\strtolower($name)) {
            case 'str_get_html':
                $parser = new static();

                return $parser->loadHtml($arguments0, $arguments1);

            case 'file_get_html':
                $parser = new static();

                return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        // Name the missing method, consistent with the message used by __call().
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
236
237
    /** @noinspection MagicMethodsValidityInspection */
238
239
    /**
     * Magic read access for the legacy property names.
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" map to
     * innerHtml() and "text"/"plaintext" map to text(). The name is compared
     * in lower case; any other name yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
262
263
    /**
     * String conversion: the result of html() with its default arguments.
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
270
271
    /**
     * No-op that only exists for API compatibility; it always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // Nothing to clean up here.
        return true;
    }
282
283
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     *  1. run the optional "before create DOM" callback on the input,
     *  2. drop any content that precedes the DOCTYPE declaration,
     *  3. record (in the isDOMDocumentCreated* flags) which wrapper elements
     *     the input is missing,
     *  4. normalise the markup (text after </html>, script tags, void elements),
     *  5. try to parse the result as XML via SimpleXML and fall back to
     *     DOMDocument::loadHTML() when that fails or reports errors,
     *  6. restore the libxml error/entity-loader settings that were changed.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->callbackBeforeCreateDom) {
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                // The captured group is anchored at the start of the input, so cut exactly
                // that prefix. A global str_replace() would also delete every later
                // occurrence of the same text inside the document.
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // no tag at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // there is markup, but the input starts with plain text
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only escaped closing script tags ("<\/script>") are present.
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // Move anything that follows the closing </html> tag in front of it.
        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // Rewrite bare void elements ("<br>") into their self-closed form ("<br/>").
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Wrap the input in the helper element so that it has a single root.
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict XML parsing, accepted only when libxml reported no error.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // Fallback: the HTML parser of DOMDocument.
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the arguments look swapped (haystack '<?xml', needle $html), so this
            // condition is true for practically every input and the prefix is almost always
            // added. Left unchanged because swapping them alters encoding handling - confirm
            // the intent before touching it.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            if ($html !== '') {
                $this->document->loadHTML($html, $optionsXml);
            }

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
491
492
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath; the optional "before query" callback
     * may replace the XPath expression before it is executed.
     *
     * @param string   $selector
     * @param int|null $idx      null: return every match; otherwise the position of the
     *                           single match to return (negative values count from the end)
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *                           a blank node list / blank element when nothing matches
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);

        if ($this->callbackXPathBeforeQuery) {
            $xPathQuery = \call_user_func($this->callbackXPathBeforeQuery, $selector, $xPathQuery, $xPath, $this);
        }

        $nodesList = $xPath->query($xPathQuery);

        $elements = new SimpleHtmlDomNode();

        // A failed query yields a falsy result; treat it like "no matches".
        foreach ($nodesList ?: [] as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($elements) + $idx : $idx;

            return $elements[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
    }
537
538
    /**
     * Find nodes with a CSS selector.
     *
     * Delegates to find() without an index, i.e. asks for every match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector, null);

        return $matches;
    }
549
550
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        // find() signals "nothing matched" with the blank node list.
        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
567
568
    /**
     * Find one node with a CSS selector.
     *
     * Delegates to find() with index 0, i.e. asks for the first match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
579
580
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        // find() signals "nothing matched" with the blank element.
        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
597
598
    /**
     * Clean up serialized markup before it is returned to the caller.
     *
     * Depending on the flags recorded while the document was created, the
     * html/head/body/p tags, the closing script tag and the default HTML 4.0
     * doctype are stripped again. Closing tags of void elements and the
     * internal helper elements are always removed. Finally HTML entities are
     * decoded and the preserved entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Only a leading <p> is removed, but every closing </p>.
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $closingVoidTags = [];
        foreach ($this->selfClosingTags as $tagName) {
            $closingVoidTags[] = '</' . $tagName . '>';
        }
        $content = \str_replace($closingVoidTags, '', $content);

        // Helper elements are dropped, doubled head tags are collapsed (applied in this order).
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanup), \array_values($cleanup), $content)
        );

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
714
715
    /**
     * Return elements by ".class".
     *
     * Builds the CSS selector ".{class}" and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2.
        return $this->findMulti('.' . $class);
    }
726
727
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#{id}" and delegates to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2.
        return $this->findOne('#' . $id);
    }
738
739
    /**
     * Return element by tag name.
     *
     * Only the first element with that tag name is returned; a blank element
     * is returned when the document contains none.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
756
757
    /**
     * Returns elements by "#id".
     *
     * Builds the CSS selector "#{id}" and delegates to find().
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx passed through to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2.
        return $this->find('#' . $id, $idx);
    }
769
770
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null: return every match; otherwise the position of the
     *                       single match to return (negative values count from the end)
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *                       a blank node list when nothing matches and no index was given,
     *                       a blank element when the requested index does not exist
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: a missing index yields the blank *element*, as find() does,
        // not the blank node list, which is a collection type
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
805
806
    /**
     * Get dom node's outer html.
     *
     * When a static callback is registered it is invoked first with an array
     * holding this parser. If the input had no html wrapper, only the document
     * element is serialized; otherwise the whole document is. The serialized
     * markup is then passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string empty string when serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
831
832
    /**
     * Load HTML from string.
     *
     * Resets the shared "broken html" replacement helper, builds a new
     * document from the markup and returns the parser itself for chaining.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions passed through to createDOMDocument()
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
849
850
    /**
     * Load HTML from file.
     *
     * The path is read with \voku\helper\UTF8::file_get_contents() when that
     * class is available, otherwise with the native file_get_contents(), and
     * the resulting string is handed to loadHtml().
     *
     * @param string   $filePath           local path, or an http(s) URL
     * @param int|null $libXMLExtraOptions passed through to loadHtml()
     *
     * @throws \RuntimeException if a non-http(s) path does not exist, or if the
     *                           content could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        // Only non-http(s) paths are checked for existence up front.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible to callers.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
889
890
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: run the XML through fixHtmlOutput();
     *                                       false: only decode entities and put the
     *                                       preserved replacements back
     * @param bool $removeXmlHeader          strip the "<?xml ... ?>" declaration and
     *                                       left-trim the result
     * @param int  $options                  libxml options handed to DOMDocument::saveXML()
     *
     * @return string the XML string, or an empty string when saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if (!$htmlToXml) {
            return self::putReplacedBackToPreserveHtmlEntities(
                $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity)
            );
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
925
926
    /**
     * Allow the parser to be called like a function; the call is delegated
     * to find() with the same arguments.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
936
937
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that had no <head> element - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
944
945
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that was not wrapped in a <p> element - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
952
953
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that contained no HTML markup - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
960
961
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that had no <body> element - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
968
969
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * html() consults this flag to decide whether only the document element
     * or the whole document is serialized.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that had no <html> element - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
976
977
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input that had no wrapping element at all - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
984
985
    /**
     * Read accessor for the "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it
     * marks input containing an escaped closing script tag - confirm where it is set.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
992
993
    /**
     * Replace broken closing-tag fragments by placeholder tokens so that they
     * can be restored later.
     *
     * Works in three steps:
     *  1. Repeatedly mask every "<tag ...>value</tag>" pair (closing name equal
     *     to the opening name) by swapping its angle brackets for temporary
     *     markers, until nothing changes anymore.
     *  2. Repeatedly look for the closing-tag fragments matched by the second
     *     pattern, store each fragment in self::$domBrokenReplaceHelper['orig'],
     *     store its token (self::$domHtmlBrokenHtmlHelper + crc32 of the
     *     fragment) in ['tmp'], and put the token into the markup instead.
     *  3. Turn the remaining temporary markers back into angle brackets.
     *
     * If PCRE fails (preg_replace_callback() returns null, e.g. when the
     * backtrack limit is hit) the affected step stops and the markup produced
     * so far is kept, instead of collapsing the whole input into an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        // temporary markers and the markup tokens they stand for (same order)
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $htmlTokens = ['</', '<', '>'];

        // step 1: mask matching open/close pairs
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );
            if ($replaced === null) {
                // PCRE error: keep the last good markup
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // step 2: move the remaining broken fragments into the replace helper
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $htmlTokens) {
                    // the fragment may contain markers from step 1; store the real markup
                    $brokenChunk = \str_replace($placeholders, $htmlTokens, $matches['broken']);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($brokenChunk);

                    self::$domBrokenReplaceHelper['orig'][] = $brokenChunk;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $matches['start'] . $token . $matches['end'];
                },
                $html
            );
            if ($replaced === null) {
                // PCRE error: keep the last good markup
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // step 3: restore the masked, well-formed markup
        return \str_replace($placeholders, $htmlTokens, $html);
    }
1043
1044
    /**
     * Rewrite "special" script tags in place so that their content is kept.
     *
     * Every <script ... type="..."> element whose type matches one of
     * $this->specialScriptTags is handled in one of two ways:
     *  - if its content contains one of the configured template-logic markers,
     *    the content is stored in self::$domBrokenReplaceHelper and replaced by
     *    a token (self::$domHtmlBrokenHtmlHelper + crc32 of the content);
     *  - otherwise the "script" tag name is replaced by
     *    self::$domHtmlSpecialScriptHelper in both the opening and closing tag.
     * In both cases the escaped closing-tag sequence (backslash before the
     * slash) is unescaped first.
     *
     * If PCRE fails (preg_replace_callback() returns null) $html is left
     * untouched instead of being replaced by an empty string.
     *
     * @param string $html markup to rewrite, modified by reference
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        $rewritten = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {

                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $innerContent = \str_replace('<\/', '</', $matches['innerContent']);
                        $token = self::$domHtmlBrokenHtmlHelper . \crc32($innerContent);

                        self::$domBrokenReplaceHelper['orig'][] = $innerContent;
                        self::$domBrokenReplaceHelper['tmp'][] = $token;

                        return $matches['start'] . $token . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $scriptElement = \str_replace('<\/', '</', $matches[0]);

                // swap the tag name in the opening tag ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($scriptElement, \strlen('<script'));

                // ... and in the closing tag
                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }
1086
1087
    /**
     * Store the "keep broken html" switch on this parser.
     *
     * @param bool $keepBrokenHtml new value of the switch
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $enabled = $keepBrokenHtml;
        $this->keepBrokenHtml = $enabled;

        return $this;
    }
1098
1099
    /**
     * Replace the list of template-logic markers that keepSpecialScriptTags()
     * searches for inside special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        $nonStrings = \array_filter(
            $templateLogicSyntaxInSpecialScriptTags,
            static function ($candidate) {
                return !\is_string($candidate);
            }
        );
        if ($nonStrings !== []) {
            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1116
1117
    /**
     * Replace the list of script "type" values that keepSpecialScriptTags()
     * treats as special.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser this instance, to allow chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        $nonStrings = \array_filter(
            $specialScriptTags,
            static function ($candidate) {
                return !\is_string($candidate);
            }
        );
        if ($nonStrings !== []) {
            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1134
1135
    /**
     * Register the callable stored as the "XPath before query" callback.
     *
     * NOTE(review): judging by the name and the phpstan signature it is
     * invoked before an XPath query runs and returns the XPath string to use -
     * confirm at the call site.
     *
     * @param callable $callbackXPathBeforeQuery
     *
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
     *
     * @return void
     */
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery)
    {
        $callback = $callbackXPathBeforeQuery;

        $this->callbackXPathBeforeQuery = $callback;
    }
1144
1145
    /**
     * Register the callable stored as the "before create DOM" callback.
     *
     * NOTE(review): judging by the name and the phpstan signature it receives
     * the HTML string before the DOM is created and returns the HTML to parse -
     * confirm at the call site.
     *
     * @param callable $callbackBeforeCreateDom
     *
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
     *
     * @return void
     */
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom)
    {
        $callback = $callbackBeforeCreateDom;

        $this->callbackBeforeCreateDom = $callback;
    }
1154
}
1155