Completed
Push — master ( ae518a...b71ea1 )
by Lars
01:46
created

HtmlDomParser::find()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 31

Duplication

Lines 31
Ratio 100 %

Code Coverage

Tests 15
CRAP Score 6

Importance

Changes 0
Metric Value
nc 8
dl 31
loc 31
ccs 15
cts 15
cp 1
c 0
b 0
f 0
cc 6
nop 2
crap 6
rs 8.8017
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
     * Lower-cased legacy method names mapped to the method that __call() forwards to.
     *
     * @var string[]
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * NOTE(review): not referenced in the visible part of this class; judging by the name these
     * are template-syntax markers looked for inside special script tags - confirm against the
     * code that consumes this list.
     *
     * @var string[]
     */
    protected $templateLogicSyntaxInSpecialScriptTags = [
        '+',
        '<%',
        '{%',
        '{{',
    ];

    /**
     * The properties specified for each special script tag is an array.
     *
     * ```php
     * protected $specialScriptTags = [
     *     'text/html',
     *     'text/x-custom-template',
     *     'text/x-handlebars-template'
     * ]
     * ```
     *
     * createDOMDocument() calls keepSpecialScriptTags() when the input contains a "<script" and
     * one of these strings.
     *
     * @var string[]
     */
    protected $specialScriptTags = [
        'text/html',
        'text/x-custom-template',
        'text/x-handlebars-template',
    ];

    /**
     * Tag names that createDOMDocument() rewrites from "<tag>" to "<tag/>" before parsing and
     * whose "</tag>" form fixHtmlOutput() strips from the generated output.
     *
     * @var string[]
     */
    protected $selfClosingTags = [
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr',
    ];

    /**
     * Set by createDOMDocument() when the input contains no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by createDOMDocument() when the input contains a "<" but does not start with one
     * (ignoring leading whitespace).
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by createDOMDocument() when the input starts with "<!--" (ignoring leading whitespace).
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithCommentWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<head " nor "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<p " nor "<p>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<html " nor "<html>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<body " nor "<body>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;

    /**
     * Set by createDOMDocument() when the input has no "</script>" but does contain the escaped
     * form "<\/script>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * When true, createDOMDocument() passes the trimmed input through keepBrokenHtml() and, if
     * no DOCTYPE was found, wraps it in the helper tag before parsing.
     *
     * @var bool
     */
    protected $keepBrokenHtml = false;
143
144
    /**
     * Create a parser around a fresh DOMDocument, optionally seeded with content.
     *
     * NOTE(review): static analysis flagged this constructor as duplicated elsewhere in the project.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code to parse, or a node
     *                                                             (SimpleHtmlDomInterface, \DOMNode)
     *                                                             that is deep-copied into the new document
     */
    public function __construct($element = null)
    {
        // DOMDocument settings
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // Unwrap the library's own element wrapper down to the underlying DOM node.
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if (!$source instanceof \DOMNode) {
            // Anything that is neither a node nor null is handed to loadHtml().
            if ($source !== null) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->loadHtml($source);
            }

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
175
176
    /**
     * Forward calls made under a legacy method name to the current implementation.
     *
     * The called name is lower-cased and looked up in self::$functionAliases.
     *
     * @param string $name      called method name
     * @param array  $arguments arguments handed on unchanged to the aliased method
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, self::$functionAliases[$alias]], $arguments);
    }
192
193
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * Both create a new parser instance and load the first argument into it; the optional second
     * argument is passed on as extra libxml options. The name is matched case-insensitively, the
     * same way __call() treats instance aliases.
     *
     * NOTE(review): static analysis flagged this method as duplicated elsewhere in the project.
     *
     * @param string $name      called static method name
     * @param array  $arguments [0] HTML string or file path (defaults to ''), [1] extra libxml options
     *
     * @throws \BadMethodCallException if the name is neither "str_get_html" nor "file_get_html"
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        // PHP method names are case-insensitive, so normalise before comparing.
        $method = \strtolower($name);

        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($method === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($source, $libXMLExtraOptions);
        }

        if ($method === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // Same message format as __call() so the offending name shows up in the error.
        throw new \BadMethodCallException('Method does not exist: ' . $method);
    }
222
223
    /** @noinspection MagicMethodsValidityInspection */
224
225
    /**
     * Read-only magic properties; the property name is matched case-insensitively.
     *
     * - "outerHtml" / "outerText" return html()
     * - "innerHtml" / "innerText" return innerHtml()
     * - "text" / "plaintext" return text()
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
248
249
    /**
     * String conversion yields the same markup as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
256
257
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        $cleared = true;

        return $cleared;
    }
268
269
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * 1. strip anything that precedes the DOCTYPE declaration,
     * 2. optionally run the input through keepBrokenHtml(),
     * 3. record in the isDOMDocumentCreated* flags which wrapper elements the input lacks
     *    (fixHtmlOutput() uses them to remove what the parser adds),
     * 4. move content that follows "</html>" in front of it, pre-process script tags and
     *    self-close void elements,
     * 5. parse with SimpleXML first and fall back to DOMDocument::loadHTML() when that fails or
     *    reports libxml errors.
     *
     * The libxml error / entity-loader settings changed here are restored before returning.
     *
     * NOTE(review): static analysis flagged the SimpleXML import and the loadHTML() fallback as
     * duplicated elsewhere in the project.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the default options
     *
     * @return \DOMDocument the document now stored in $this->document
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                // The capture is anchored at the start of the input, so cut exactly that prefix.
                // (A str_replace() here would also delete every later occurrence of the same text.)
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\stripos($html, '</html>') !== false) {
            // The "U" modifier inverts greediness, so "(.*?)" captures everything after the
            // first "</html>"; non-blank trailing content is moved in front of the closing tag.
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): both helpers are called without using a return value, so they
            // presumably take $html by reference - confirm against their definitions.
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // "<br>" -> "<br/>" etc. for every name in $this->selfClosingTags
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only touch (and later
        // restore) the setting on older versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            // Give the parser a single root element; fixHtmlOutput() strips the helper tag again.
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict XML parsing; only accepted when libxml reported no errors at all.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the arguments look swapped (the literal is the haystack), so the
            // prefix is added for practically every input. Left unchanged because skipping it
            // for inputs that already start with "<?xml" could alter encoding detection -
            // confirm the intent before "fixing".
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
467
468
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath by SelectorConverter::toXPath() and evaluated against
     * the current document. Without an index the whole match list is returned (a blank list
     * object when nothing matched); with an index a single element is returned (a blank element
     * object when that position does not exist).
     *
     * NOTE(review): static analysis flagged this method as duplicated elsewhere in the project.
     *
     * @param string   $selector
     * @param int|null $idx      position of the wanted match; negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $found = $xPath->query($query);

        $matches = new SimpleHtmlDomNode();
        if ($found) {
            foreach ($found as $domNode) {
                $matches[] = new SimpleHtmlDom($domNode);
            }
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($matches) + $idx : $idx;

            return $matches[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
    }
507
508
    /**
     * Find nodes with a CSS selector.
     *
     * Same as find() without an index.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector, null);

        return $matches;
    }
519
520
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() returning a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
537
538
    /**
     * Find one node with a CSS selector.
     *
     * Same as find() with index 0, i.e. the first match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
549
550
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() returning a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
567
568
    /**
     * Clean up markup produced by DOMDocument so it resembles the original input again.
     *
     * DOMDocument wraps fragments in elements such as html / head / body / p; every wrapper that
     * the flags recorded as absent from the input is removed here. Afterwards the closing tags
     * of self-closing elements and the internal helper tags are stripped, the result is trimmed,
     * HTML entities are decoded and the preserved entity placeholders are put back.
     *
     * @param string $content                  markup as produced by DOMDocument
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // drops a leading "<p>" and every "</p>"
            foreach (['/^<p>/', '/<\/p>/'] as $pattern) {
                $content = (string) \preg_replace($pattern, '', $content);
            }
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $closingTags = [];
        foreach ($this->selfClosingTags as $tagName) {
            $closingTags[] = '</' . $tagName . '>';
        }
        $content = \str_replace($closingTags, '', $content);

        // search => replacement, applied in this order
        /** @noinspection HtmlRequiredTitleElement */
        $helperMarkup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($helperMarkup), \array_values($helperMarkup), $content)
        );

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
684
685
    /**
     * Return elements by ".class".
     *
     * Builds the CSS selector "." + $class and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
696
697
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#" + $id and delegates to findOne().
     *
     * @param string $id id value without the leading "#"
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
708
709
    /**
     * Return element by tag name.
     *
     * Looks the tag up directly on the DOMDocument and wraps the first match; a blank element
     * object is returned when the document has no such tag.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
726
727
    /**
     * Returns elements by "#id".
     *
     * Builds the CSS selector "#" + $id and delegates to find(), passing the index through.
     *
     * @param string   $id  id value without the leading "#"
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
739
740
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole match list is returned (a blank list object when nothing
     * matched); with an index a single element is returned (a blank element object when that
     * position does not exist), mirroring find().
     *
     * @param string   $name
     * @param int|null $idx  position of the wanted match; negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a missing single element is represented by the blank *element*
        // (as in find() and getElementByTagName()), not by the blank list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
775
776
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked first with [$this] as its single
     * argument. The document is then serialised (only the document element when the input had
     * no html wrapper) and passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string empty string when serialisation fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
801
802
    /**
     * Load HTML from string.
     *
     * Resets the shared broken-HTML replacement helper, rebuilds the document from the given
     * markup via createDOMDocument() and returns this parser for chaining.
     *
     * NOTE(review): a static analyser reported `return $this` as incompatible with the `self`
     * return type declared by DomParserInterface::loadHtml - confirm against the interface.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
819
820
    /**
     * Load HTML from a local file or an http(s) URL and parse it.
     *
     * Resets the shared "broken html" replacement helper, reads the content
     * (through UTF8::file_get_contents() when the voku\helper\UTF8 class exists,
     * otherwise through the native file_get_contents()) and hands the result to
     * loadHtml().
     *
     * NOTE(review): static analysis reports this method as duplicated elsewhere
     * in the project; consider extracting the shared logic.
     *
     * @param string   $filePath           local path, or a string starting with http:// or https://
     * @param int|null $libXMLExtraOptions passed through unchanged to loadHtml()
     *
     * @throws \RuntimeException if a local path does not exist or the content cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        // Only local paths are checked for existence; http(s) sources are
        // handed straight to the reader below.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Keep the original failure reachable through getPrevious().
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
860
861
    /**
     * Serialize the document with saveXML() and post-process the result.
     *
     * NOTE(review): static analysis reports this method as duplicated elsewhere
     * in the project.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: run the result through fixHtmlOutput();
     *                                       false: only decode entities and put preserved
     *                                       replacements back
     * @param bool $removeXmlHeader          true: strip `<?xml ... ?>` and left-trim the result
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string the processed XML, or an empty string if saveXML() failed
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
896
897
    /**
898
     * @param string $selector
899
     * @param int    $idx
900
     *
901
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
902
     */
903 3
    public function __invoke($selector, $idx = null)
    {
        // Calling the parser object like a function is a shortcut for find().
        $found = $this->find($selector, $idx);

        return $found;
    }
907
908
    /**
     * Read the "created without head wrapper" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHeadWrapper
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $withoutHeadWrapper = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $withoutHeadWrapper;
    }
915
916
    /**
     * Read the "created without p-tag wrapper" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutPTagWrapper
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        $withoutPTagWrapper = $this->isDOMDocumentCreatedWithoutPTagWrapper;

        return $withoutPTagWrapper;
    }
923
924
    /**
     * Read the "created without html" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHtml
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

        return $withoutHtml;
    }
931
932
    /**
     * Read the "created without body wrapper" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutBodyWrapper
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        $withoutBodyWrapper = $this->isDOMDocumentCreatedWithoutBodyWrapper;

        return $withoutBodyWrapper;
    }
939
940
    /**
     * Read the "created without html wrapper" flag of this parser.
     *
     * html() consults this flag to decide whether only the document element or
     * the whole document is serialized.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHtmlWrapper
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $withoutHtmlWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $withoutHtmlWrapper;
    }
947
948
    /**
     * Read the "created without wrapper" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutWrapper
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $withoutWrapper = $this->isDOMDocumentCreatedWithoutWrapper;

        return $withoutWrapper;
    }
955
956
    /**
     * Read the "created with fake end script" flag of this parser.
     *
     * NOTE(review): the flag is only read here; presumably it is set while the
     * DOM document is created — confirm where it is assigned.
     *
     * @return bool current value of $isDOMDocumentCreatedWithFakeEndScript
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        $withFakeEndScript = $this->isDOMDocumentCreatedWithFakeEndScript;

        return $withFakeEndScript;
    }
963
964
    /**
     * Replace "broken" closing-tag fragments with hash tokens so they can be
     * restored later.
     *
     * Works in three steps:
     *  1. Repeatedly masks element pairs (an opening tag followed by a closing
     *     tag of the same name) by swapping their angle brackets for placeholder
     *     markers, until nothing changes any more.
     *  2. Repeatedly matches runs of closing tags that still carry real angle
     *     brackets, records each run in self::$domBrokenReplaceHelper ('orig')
     *     and replaces it with self::$domHtmlBrokenHtmlHelper + crc32 of the run
     *     (recorded under 'tmp').
     *  3. Turns the remaining placeholder markers back into angle brackets.
     *
     * If PCRE fails (preg_replace_callback() returns null, e.g. when a backtrack
     * limit is hit), the last successfully processed markup is kept instead of
     * being replaced by an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Step 1: mask balanced element pairs.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error: $html still holds the last good state, stop here.
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // Step 2: swap the remaining (unmasked) closing-tag runs for hash tokens.
        do {
            $original = $html;

            $tokenized = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $brackets) {
                    // Store the fragment with real angle brackets, not placeholders.
                    $matches['broken'] = \str_replace($placeholders, $brackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($tokenized === null) {
                // PCRE error: $html still holds the last good state, stop here.
                break;
            }

            $html = $tokenized;
        } while ($original !== $html);

        // Step 3: unmask everything that was protected in step 1.
        return \str_replace($placeholders, $brackets, $html);
    }
1014
1015
    /**
     * Protect special (template-like) script tags inside $html, in place.
     *
     * Matches `<script ... type=...>` elements whose type is one of
     * $this->specialScriptTags, e.g.
     * [<script id="elements-image-1" type="text/html">...</script>].
     *
     * - If the inner content contains one of the configured template logic
     *   markers, the content is recorded in self::$domBrokenReplaceHelper and
     *   replaced by self::$domHtmlBrokenHtmlHelper + crc32 of the content.
     * - Otherwise the `script` element name is swapped for
     *   self::$domHtmlSpecialScriptHelper.
     *
     * If PCRE fails (preg_replace_callback() returns null), $html is left
     * untouched instead of being overwritten with an empty string.
     *
     * @param string $html markup to rewrite (modified by reference)
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // Alternation of the configured script types, escaped for use in the pattern.
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        $rewritten = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {

                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                        self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // Swap the leading "<script" for the helper element name ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                // ... and the trailing "</script>" for the matching closing tag.
                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // null signals a PCRE error; keep the caller's markup in that case.
        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }
1057
1058
    /**
     * Store the "keep broken html" switch on this parser.
     *
     * @param bool $keepBrokenHtml value assigned to $this->keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1069
1070
    /**
     * Replace the list of template logic markers that are looked for inside
     * special script tags.
     *
     * Every entry is validated before anything is stored, so the current list
     * stays untouched when the input is rejected.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        foreach ($templateLogicSyntaxInSpecialScriptTags as $tmp) {
            if (!\is_string($tmp)) {
                // The message names this method (it previously referred to a
                // non-existent "setTemplateLogicSyntaxInSpecialScriptTags").
                throw new \InvalidArgumentException('overwriteTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
            }
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1087
1088
    /**
     * Replace the list of script types that are treated as special script tags.
     *
     * The list is only stored when every entry is a string.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        $nonStrings = \array_filter(
            $specialScriptTags,
            static function ($candidate) {
                return !\is_string($candidate);
            }
        );

        if ($nonStrings !== []) {
            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1105
}
1106