Completed
Push — master ( 612387...00fe6a )
by Lars
01:51
created

HtmlDomParser::find()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 31

Duplication

Lines 31
Ratio 100 %

Code Coverage

Tests 15
CRAP Score 6

Importance

Changes 0
Metric Value
nc 8
dl 31
loc 31
ccs 15
cts 15
cp 1
c 0
b 0
f 0
cc 6
nop 2
crap 6
rs 8.8017
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * The properties specified for each special script tag is an array.
60
     *
61
     * ```php
62
     * protected $specialScriptTags = [
63
     *     'text/html',
64
     *     'text/x-custom-template',
65
     *     'text/x-handlebars-template'
66
     * ]
67
     * ```
68
     *
69
     * @var string[]
70
     */
71
    protected $specialScriptTags = [
72
        'text/html',
73
        'text/x-custom-template',
74
        'text/x-handlebars-template',
75
    ];
76
77
    /**
78
     * @var string[]
79
     */
80
    protected $selfClosingTags = [
81
        'area',
82
        'base',
83
        'br',
84
        'col',
85
        'command',
86
        'embed',
87
        'hr',
88
        'img',
89
        'input',
90
        'keygen',
91
        'link',
92
        'meta',
93
        'param',
94
        'source',
95
        'track',
96
        'wbr',
97
    ];
98
99
    /**
100
     * @var bool
101
     */
102
    protected $isDOMDocumentCreatedWithoutHtml = false;
103
104
    /**
105
     * @var bool
106
     */
107
    protected $isDOMDocumentCreatedWithoutWrapper = false;
108
109
    /**
110
     * @var bool
111
     */
112
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
113
114
    /**
115
     * @var bool
116
     */
117
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
118
119
    /**
120
     * @var bool
121
     */
122
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
123
124
    /**
125
     * @var bool
126
     */
127
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
128
129
    /**
130
     * @var bool
131
     */
132
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
133
134
    /**
135
     * @var bool
136
     */
137
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
138
139
    /**
140
     * @var bool
141
     */
142
    protected $keepBrokenHtml = false;
143
144
    /**
     * Create a parser, optionally seeded with existing content.
     *
     * A SimpleHtmlDomInterface argument is unwrapped through getNode(); a \DOMNode is
     * deep-imported into the freshly created \DOMDocument; any other non-null value
     * is passed on to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        // nothing to load: keep the empty document
        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep copy, so the imported node belongs to this parser's own document
        $imported = $this->document->importNode($source, true);

        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
175
176
    /**
     * Dispatch a call to an unknown instance method through the alias table.
     *
     * The requested name is lower-cased and looked up in self::$functionAliases;
     * a hit is forwarded to the aliased method with the original arguments.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the lower-cased name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
192
193
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The first argument (default '') is the HTML string or the file path, the
     * second (default null) is passed on as extra libxml options. The name is
     * compared case-sensitively.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if $name is neither "str_get_html" nor "file_get_html"
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($name === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($source, $libXMLExtraOptions);
        }

        if ($name === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // name the offending method, like the instance-side __call() does
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
222
223
    /** @noinspection MagicMethodsValidityInspection */
224
225
    /**
     * Magic read access to the html / text pseudo-properties.
     *
     * The property name is matched case-insensitively:
     * "outerhtml" / "outertext" => html(), "innerhtml" / "innertext" => innerHtml(),
     * "text" / "plaintext" => text(). Any other name yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
248
249
    /**
     * String conversion: the same output as html() with its default argument.
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
256
257
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // intentionally nothing to release here
        return true;
    }
268
269
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * 1. drop any non-blank content in front of the doctype,
     * 2. record which wrappers (html / head / body / p / comment / fake end-script)
     *    the input lacks, in the isDOMDocumentCreated* flags,
     * 3. move non-blank content found after "</html>" in front of it,
     * 4. rewrite self-closing tags from "<tag>" to "<tag/>",
     * 5. parse with simplexml first; if that fails or libxml reported errors,
     *    fall back to \DOMDocument::loadHTML() with the "xml encoding" prefix hack.
     *
     * The libxml error handling (and, on PHP < 8, the entity loader) is switched for
     * the duration of the call and restored before returning.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        if (\stripos($html, '<!DOCTYPE') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                $html = \str_replace($matches_before_doctype[1], '', $html);
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // no tag at all => plain text; text in front of the first tag => input without wrapper
        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only an escaped "<\/script>" is present, no real closing script tag
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // move non-blank content that follows "</html>" in front of the closing tag
        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): both helpers are defined outside this view and their return
            // values are unused here - presumably they take $html by reference; confirm.
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // "<br>" => "<br/>" (and so on), so that the xml based parsing below can succeed
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 (external entity
        // loading is disabled by default there), so only toggle it on older versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // give the parser one single root element for these inputs
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first try: the input is accepted only if it parses as xml without any libxml error
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // second try: the forgiving html parser
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the stripos() arguments are (haystack, needle) = ('<?xml', $html),
            // so this looks for $html inside '<?xml' and the prefix is added for practically
            // every input. Kept as-is to preserve behavior - confirm whether that is intended.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
462
463
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath by SelectorConverter::toXPath() and
     * evaluated against the current document.
     *
     * @param string   $selector
     * @param int|null $idx null returns every match; otherwise the match at this position,
     *                      where a negative value counts from the end of the list
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $expression = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($expression);

        $found = new SimpleHtmlDomNode();
        if ($matches !== false) {
            foreach ($matches as $match) {
                $found[] = new SimpleHtmlDom($match);
            }
        }

        // one element requested
        if ($idx !== null) {
            $position = $idx < 0 ? \count($found) + $idx : $idx;

            return $found[$position] ?? new SimpleHtmlDomBlank();
        }

        // all elements requested
        return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
    }
502
503
    /**
     * Find nodes with a CSS selector.
     *
     * Thin wrapper around find() without an index, i.e. all matches are returned.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $nodes = $this->find($selector);

        return $nodes;
    }
514
515
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "No element" is detected by find() answering with a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
532
533
    /**
     * Find one node with a CSS selector.
     *
     * Asks find() for the match at index 0.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $first = $this->find($selector, 0);

        return $first;
    }
544
545
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "No element" is detected by find() answering with a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $first = $this->find($selector, 0);

        return $first instanceof SimpleHtmlDomBlank ? false : $first;
    }
562
563
    /**
     * Strip the wrapper markup that was added while parsing from serialized output.
     *
     * Which tags are removed depends on the flags reported by the
     * getIsDOMDocumentCreated*() getters. The removals run in a fixed order:
     * html / head / body / script end-tags, then the p-tag handling, the default
     * doctype, the closing tags of self-closing elements and finally the internal
     * helper wrappers. The trimmed result is entity-decoded and the preserved html
     * entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // str_replace() works through an array of needles one after another, so an
        // ordered needle list behaves like the equivalent chain of single calls
        $wrapperTags = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $wrapperTags[] = '<html>';
            $wrapperTags[] = '</html>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $wrapperTags[] = '<head>';
            $wrapperTags[] = '</head>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $wrapperTags[] = '<body>';
            $wrapperTags[] = '</body>';
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $wrapperTags[] = '</script>';
        }

        $content = \str_replace($wrapperTags, '', $content);

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        $leftovers = [];

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $leftovers[] = '<p>';
            $leftovers[] = '</p>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $leftovers[] = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">';
        }

        // https://bugs.php.net/bug.php?id=73175
        foreach ($this->selfClosingTags as $tagName) {
            $leftovers[] = '</' . $tagName . '>';
        }

        $content = \str_replace($leftovers, '', $content);

        /** @noinspection HtmlRequiredTitleElement */
        $helperSearch = [
            '<simpleHtmlDomHtml>',
            '</simpleHtmlDomHtml>',
            '<simpleHtmlDomP>',
            '</simpleHtmlDomP>',
            '<head><head>',
            '</head></head>',
        ];
        $helperReplace = [
            '',
            '',
            '',
            '',
            '<head>',
            '</head>',
        ];
        $content = \trim(\str_replace($helperSearch, $helperReplace, $content));

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
679
680
    /**
     * Return elements by ".class".
     *
     * Builds the CSS selector "." + $class and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // "{$var}" instead of "${var}": the latter interpolation form is deprecated since PHP 8.2
        return $this->findMulti(".{$class}");
    }
691
692
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#" + $id and delegates to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // "{$var}" instead of "${var}": the latter interpolation form is deprecated since PHP 8.2
        return $this->findOne("#{$id}");
    }
703
704
    /**
     * Return element by tag name.
     *
     * Takes the first entry of \DOMDocument::getElementsByTagName(); when there is
     * none, a SimpleHtmlDomBlank is returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first !== null ? new SimpleHtmlDom($first) : new SimpleHtmlDomBlank();
    }
721
722
    /**
     * Returns elements by "#id".
     *
     * Builds the CSS selector "#" + $id and delegates to find(), passing $idx through.
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // "{$var}" instead of "${var}": the latter interpolation form is deprecated since PHP 8.2
        return $this->find("#{$id}", $idx);
    }
734
735
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null returns every match; otherwise the match at this position,
     *                       where a negative value counts from the end of the list
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: a missing single element is represented by the
        // element-level blank (as in find()), not by the blank node list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
770
771
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked first, with an array holding
     * this parser as its single argument. Input that came without an html wrapper is
     * serialized from the document element only, everything else as a whole document.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string the cleaned-up markup, or '' when serialization failed
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
796
797
    /**
     * Load HTML from string.
     *
     * Clears the shared self::$domBrokenReplaceHelper list, then replaces the current
     * document with the one built by createDOMDocument().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // start from a clean replace-helper state
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
814
815
    /**
     * Load HTML from file.
     *
     * The existence check via file_exists() is skipped for paths that start
     * with "http://" or "https://". The content is read with
     * UTF8::file_get_contents() when the "voku\helper\UTF8" class is available,
     * otherwise with \file_get_contents(), and is then handed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions forwarded to loadHtml()
     *
     * @throws \RuntimeException if the file was not found or could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // chain the original exception so the root cause is not lost
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
855
856
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process the XML with fixHtmlOutput();
     *                                       false: only decode html entities and put the
     *                                       preserved replacements back
     * @param bool $removeXmlHeader          true: strip "<?xml ... ?>" and left-trim the result
     * @param int  $options                  libxml options passed to DOMDocument::saveXML()
     *
     * @return string the XML, or an empty string if saveXML() failed
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $xml);
            $xml = \ltrim($withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity)
        );
    }
891
892
    /**
     * Allow the parser object to be called like a function; this is a
     * shortcut for find() with the same arguments.
     *
     * @param string $selector
     * @param int    $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        return $this->find($selector, $idx);
    }
902
903
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
910
911
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
918
919
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
926
927
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
934
935
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
942
943
    /**
     * Get the current value of the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
950
951
    /**
     * Get the current value of the "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
958
959
    /**
     * Replace broken closing-tag fragments in the html with placeholders so
     * that they can be restored later.
     *
     * Works in three steps:
     * 1. Elements that have both an opening and a matching closing tag get the
     *    angle brackets of those tags masked with temporary marker strings
     *    (repeated until nothing changes any more).
     * 2. Closing-tag fragments that are still present as real markup are
     *    recorded in self::$domBrokenReplaceHelper ("orig" => fragment,
     *    "tmp" => placeholder) and replaced by that placeholder
     *    (again repeated until nothing changes any more).
     * 3. The remaining temporary markers are converted back into "<", "</" and ">".
     *
     * @param string $html
     *
     * @return string the html with the recorded fragments replaced by placeholders
     */
    protected function keepBrokenHtml(string $html): string
    {
        $markers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $markup = ['</', '<', '>'];

        // step 1: mask the tags of properly paired elements
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($m) {
                    $openingTag = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $closingTag = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $m['start'] . $openingTag . $m['value'] . $closingTag . $m['end'];
                },
                $html
            );
        } while ($before !== $html);

        // step 2: swap the remaining closing-tag fragments for placeholders
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($m) use ($markers, $markup) {
                    // the fragment may contain masked tags from step 1, store it as real markup
                    $fragment = \str_replace($markers, $markup, $m['broken']);
                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($fragment);

                    self::$domBrokenReplaceHelper['orig'][] = $fragment;
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $m['start'] . $placeholder . $m['end'];
                },
                $html
            );
        } while ($before !== $html);

        // step 3: unmask everything that was masked in step 1
        return \str_replace($markers, $markup, $html);
    }
1009
1010
    /**
     * Protect the content of "special" script tags (script tags whose type is
     * listed in $this->specialScriptTags). The html is modified in place.
     *
     * For every matching script element:
     * - if its content contains one of the configured template-logic syntaxes,
     *   the content is recorded in self::$domBrokenReplaceHelper and replaced
     *   by a placeholder, the script tags themselves stay untouched;
     * - otherwise the "script" tag name is replaced by the helper tag name
     *   from self::$domHtmlSpecialScriptHelper.
     *
     * @param string $html passed by reference and overwritten with the result
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $quotedTypes = [];
        foreach ($this->specialScriptTags as $specialScriptTag) {
            $quotedTypes[] = \preg_quote($specialScriptTag, '/');
        }
        $typePattern = \implode('|', $quotedTypes);

        $html = (string) \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $typePattern . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                $containsTemplateLogic = false;
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntax) {
                    if (\strpos($matches['innerContent'], $logicSyntax) !== false) {
                        $containsTemplateLogic = true;

                        break;
                    }
                }

                if ($containsTemplateLogic) {
                    // remove the html5 fallback
                    $innerContent = \str_replace('<\/', '</', $matches['innerContent']);
                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($innerContent);

                    self::$domBrokenReplaceHelper['orig'][] = $innerContent;
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $matches['start'] . $placeholder . $matches['end'];
                }

                // remove the html5 fallback
                $scriptElement = \str_replace('<\/', '</', $matches[0]);

                // swap the leading "<script" for the helper tag name ...
                $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($scriptElement, \strlen('<script'));

                // ... and the trailing "</script>" for the matching helper end tag
                return \substr($renamed, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );
    }
1052
1053
    /**
     * Set the "keepBrokenHtml" option of this parser.
     *
     * NOTE(review): the option is presumably evaluated while html is loaded;
     * that code is not part of this section.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for method chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1064
1065
    /**
     * Replace the list of template-logic syntaxes that are searched for inside
     * special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if the array contains a non-string value
     *
     * @return HtmlDomParser this instance, for method chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        $invalidEntries = \array_filter(
            $templateLogicSyntaxInSpecialScriptTags,
            static function ($entry) {
                return !\is_string($entry);
            }
        );

        if ($invalidEntries !== []) {
            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1082
1083
    /**
     * Replace the list of script "type" values that mark a script tag as special.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if the array contains a non-string value
     *
     * @return HtmlDomParser this instance, for method chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        $invalidEntries = \array_filter(
            $specialScriptTags,
            static function ($entry) {
                return !\is_string($entry);
            }
        );

        if ($invalidEntries !== []) {
            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1100
}
1101