Completed
Pull Request — master (#55)
by Volodymyr
02:41
created

HtmlDomParser::__invoke()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
     * Lower-cased legacy method names mapped to the method "__call()" forwards them to.
     *
     * @var string[]
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Template-syntax markers for special script tags.
     *
     * NOTE(review): not read in the visible part of this class; presumably used by
     * the script-tag helpers defined elsewhere - confirm.
     *
     * @var string[]
     */
    protected $templateLogicSyntaxInSpecialScriptTags = [
        '+',
        '<%',
        '{%',
        '{{',
    ];

    /**
     * The properties specified for each special script tag is an array.
     *
     * "createDOMDocument()" searches the raw input for each of these strings and
     * calls "keepSpecialScriptTags()" when one is present.
     *
     * ```php
     * protected $specialScriptTags = [
     *     'text/html',
     *     'text/x-custom-template',
     *     'text/x-handlebars-template'
     * ]
     * ```
     *
     * @var string[]
     */
    protected $specialScriptTags = [
        'text/html',
        'text/x-custom-template',
        'text/x-handlebars-template',
    ];

    /**
     * Set by "createDOMDocument()" when the input contains no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by "createDOMDocument()" when the input contains "<" but, once
     * left-trimmed, does not start with it.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by "createDOMDocument()" when the left-trimmed input starts with "<!--".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithCommentWrapper = false;

    /**
     * Set by "createDOMDocument()" when the input has neither "<head " nor "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by "createDOMDocument()" when the input has neither "<p " nor "<p>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;

    /**
     * Set by "createDOMDocument()" when the input has neither "<html " nor "<html>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Set by "createDOMDocument()" when the input has neither "<body " nor "<body>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;

    /**
     * Set by "createDOMDocument()" when the input has no "</script>" but does
     * contain the escaped form "<\/script>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * When truthy, "createDOMDocument()" passes the input through "keepBrokenHtml()"
     * and wraps it in the helper wrapper element. No default is declared here, so
     * the value is null until something assigns it.
     *
     * @var bool
     */
    protected $keepBrokenHtml;
121
122
    /**
     * Create a parser, optionally seeded with an element or an HTML string.
     *
     * A fresh DOMDocument (whitespace preserved, formatted output) is always
     * created first. A SimpleHtmlDomInterface is unwrapped to its node; a \DOMNode
     * is deep-imported and appended; any other non-null value is handed to
     * "loadHtml()".
     *
     * NOTE(review): static analysis reported this method as duplicated elsewhere
     * in the project; consider sharing one implementation.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep-copy the foreign node into our own document before attaching it
        $imported = $this->document->importNode($source, true);

        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
153
154
    /**
     * Forward a legacy method name (matched case-insensitively) to the method
     * registered for it in "self::$functionAliases".
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return $this->{$target}(...$arguments);
    }
170
171
    /**
     * Static entry points "str_get_html" and "file_get_html": create a new parser
     * and load the first argument through "loadHtml()" / "loadHtmlFile()".
     *
     * The first argument defaults to '' and the second (extra libxml options) to
     * null when they are not supplied. The name is matched exactly (case-sensitive).
     *
     * NOTE(review): static analysis reported this method as duplicated elsewhere
     * in the project; consider sharing one implementation.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        $loaders = [
            'str_get_html'  => 'loadHtml',
            'file_get_html' => 'loadHtmlFile',
        ];

        if (!isset($loaders[$name])) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $parser = new static();

        return $parser->{$loaders[$name]}($source, $libXMLExtraOptions);
    }
200
201
    /** @noinspection MagicMethodsValidityInspection */
202
203
    /**
     * Magic read access for the legacy properties (matched case-insensitively):
     * "outerHtml"/"outerText" -> html(), "innerHtml"/"innerText" -> innerHtml(),
     * "text"/"plaintext" -> text().
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if (\in_array($property, ['outerhtml', 'outertext'], true)) {
            return $this->html();
        }

        if (\in_array($property, ['innerhtml', 'innertext'], true)) {
            return $this->innerHtml();
        }

        if (\in_array($property, ['text', 'plaintext'], true)) {
            return $this->text();
        }

        return null;
    }
226
227
    /**
     * String conversion yields the result of "html()".
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
234
235
    /**
     * No-op kept only for API compatibility; it never touches the document.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // intentionally nothing to clean up
        return true;
    }
246
247
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     *  1. Optionally pre-process broken HTML ("keepBrokenHtml").
     *  2. Record "isDOMDocumentCreated..." flags describing which wrapper tags the
     *     raw input lacks (presumably read back later through the matching
     *     "getIs...()" getters, which are defined outside this view).
     *  3. Run the script-tag helpers when the input contains "<script".
     *  4. Try to parse the input as XML via SimpleXML; if that fails or libxml
     *     reported errors, fall back to "DOMDocument::loadHTML()" with an
     *     "<?xml encoding=...>" hint prepended, then remove that hint node again.
     *  5. Restore the libxml error / entity-loader settings, even when parsing throws.
     *
     * NOTE(review): static analysis reported both parsing branches (SimpleXML import
     * and loadHTML fallback) as duplicated elsewhere in the project.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only the JS-escaped closing tag is present, no real one
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): the return values are ignored, so these helpers presumably
            // modify $html by reference - they are defined outside this view; confirm.
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 (which requires
        // libxml >= 2.9.0, where external entity loading is disabled by default),
        // so it is only called on older PHP versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            // fragments, leading comments and "broken" input are wrapped in a helper element
            if (
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->isDOMDocumentCreatedWithCommentWrapper
                ||
                $this->keepBrokenHtml
            ) {
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            // first attempt: strict XML parsing, accepted only if libxml reported no errors
            $documentFound = false;
            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $domElementTmp = \dom_import_simplexml($sxe);
                if (
                    $domElementTmp
                    &&
                    $domElementTmp->ownerDocument !== null
                ) {
                    $documentFound = true;
                    $this->document = $domElementTmp->ownerDocument;
                }
            }

            // second attempt: lenient HTML parsing
            if ($documentFound === false) {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                $xmlHackUsed = false;

                // NOTE(review): the arguments look swapped (the haystack is the literal
                // '<?xml'), so this condition holds for practically every input and the
                // encoding hint is always prepended. Left unchanged because correcting it
                // would alter behaviour for input that starts with an XML declaration -
                // confirm the intent before changing.
                /** @noinspection StringFragmentMisplacedInspection */
                if (\stripos('<?xml', $html) !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack (first processing-instruction child only)
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            /** @noinspection UnusedFunctionResultInspection */
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings, also when parsing threw
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);

            if ($disableEntityLoader !== null) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
408
409
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the current
     * document. Without an index the whole collection is returned (a
     * SimpleHtmlDomNodeBlank when nothing matched). With an index the single
     * element at that position is returned, a negative index counting from the
     * end; a SimpleHtmlDomBlank stands in for a missing position.
     *
     * NOTE(review): static analysis reported this method as duplicated elsewhere
     * in the project; consider sharing one implementation.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $collection = new SimpleHtmlDomNode();

        // query() yields false on failure; treat that like "no matches"
        foreach ($matches ?: [] as $domNode) {
            $collection[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
        }

        // handle negative values
        $position = $idx < 0 ? \count($collection) + $idx : $idx;

        // return one element
        return $collection[$position] ?? new SimpleHtmlDomBlank();
    }
448
449
    /**
     * Find nodes with a CSS selector.
     *
     * Delegates to "find()" without an index, so the full collection is returned.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $nodes = $this->find($selector, null);

        return $nodes;
    }
460
461
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "No element" is detected by "find()" handing back a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
478
479
    /**
     * Find one node with a CSS selector.
     *
     * Delegates to "find()" with index 0, i.e. the first match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $first = $this->find($selector, 0);

        return $first;
    }
490
491
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "No element" is detected by "find()" handing back a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $first = $this->find($selector, 0);

        return $first instanceof SimpleHtmlDomBlank ? false : $first;
    }
508
509
    /**
     * Strip from serialized output the wrapper markup that the original input did
     * not contain, then decode entities and restore the preserved placeholders.
     *
     * Which fragments are stripped depends on the "getIsDOMDocumentCreated...()"
     * getters (defined outside this view; presumably they expose the flags recorded
     * while the document was created).
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        $strip = static function (array $fragments, string $subject): string {
            return \str_replace($fragments, '', $subject);
        };

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = $strip(['<html>', '</html>'], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = $strip(['<head>', '</head>'], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = $strip(['<body>', '</body>'], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = $strip(['</script>'], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // a leading "<p>" only, but every "</p>"
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = $strip(['<p>', '</p>'], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = $strip(
                ['<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'],
                $content
            );
        }

        // helper wrappers are dropped, doubled head tags and "<br></br>" are collapsed
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];

        $content = \str_replace(\array_keys($cleanup), \array_values($cleanup), $content);
        $content = \trim($content);

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
621
622
    /**
     * Return elements by ".class".
     *
     * Builds the selector ".<class>" and delegates to "findMulti()".
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->findMulti(".{$class}");
    }
633
634
    /**
     * Return element by #id.
     *
     * Builds the selector "#<id>" and delegates to "findOne()".
     *
     * @param string $id id without the leading "#"
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->findOne("#{$id}");
    }
645
646
    /**
     * Return element by tag name.
     *
     * Only the first element with that tag name is wrapped and returned; a
     * SimpleHtmlDomBlank is returned when the document contains none.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first === null ? new SimpleHtmlDomBlank() : new SimpleHtmlDom($first);
    }
663
664
    /**
     * Returns elements by "#id".
     *
     * Builds the selector "#<id>" and delegates to "find()", passing the index through.
     *
     * @param string   $id  id without the leading "#"
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->find("#{$id}", $idx);
    }
676
677
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole collection is returned (a SimpleHtmlDomNodeBlank
     * when nothing matched). With an index the element at that position is
     * returned, a negative index counting from the end.
     *
     * NOTE(review): for a missing position this falls back to
     * SimpleHtmlDomNodeBlank, whereas "find()" falls back to SimpleHtmlDomBlank in
     * the same situation - confirm which one callers expect.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $matches = $this->document->getElementsByTagName($name);

        $collection = new SimpleHtmlDomNode();

        foreach ($matches as $domNode) {
            $collection[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
        }

        // handle negative values
        $position = $idx < 0 ? \count($collection) + $idx : $idx;

        // return one element
        return $collection[$position] ?? new SimpleHtmlDomNodeBlank();
    }
712
713
    /**
     * Get dom node's outer html.
     *
     * If "static::$callback" is set it is invoked first with "[$this]" as its only
     * argument. When the input had no html wrapper (per the getter), only the
     * document element is serialized, otherwise the whole document. The result is
     * passed through "fixHtmlOutput()"; a failed serialization yields ''.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
738
739
    /**
     * Load HTML from string.
     *
     * Clears the static "$domBrokenReplaceHelper" list, then replaces the current
     * document with the one built by "createDOMDocument()".
     *
     * NOTE(review): static analysis reported that returning $this (HtmlDomParser)
     * is incompatible with the "self" return type declared by
     * DomParserInterface::loadHtml - confirm against the interface.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
756
757
    /**
758
     * Load HTML from file.
759
     *
760
     * @param string   $filePath
761
     * @param int|null $libXMLExtraOptions
762
     *
763
     * @throws \RuntimeException
764
     *
765
     * @return HtmlDomParser
766
     */
767 13 View Code Duplication
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // Start from an empty static replacement registry before a new document is loaded.
        self::$domBrokenReplaceHelper = [];

        // Only non-http(s) paths are checked for existence; URLs go straight to the reader.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            // Use the UTF8 helper's reader when that class is available,
            // otherwise fall back to the native function.
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the underlying cause stays visible to callers.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // A false result signals that the content could not be read.
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
797
798
    /**
799
     * Get the HTML as XML or plain XML if needed.
800
     *
801
     * @param bool $multiDecodeNewHtmlEntity
802
     * @param bool $htmlToXml
803
     * @param bool $removeXmlHeader
804
     * @param int  $options
805
     *
806
     * @return string
807
     */
808 2 View Code Duplication
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            // Serialization failed: report an empty document.
            return '';
        }

        if ($removeXmlHeader) {
            // Drop the "<?xml ... ?>" declaration, then any whitespace left in front.
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim($withoutHeader);
        }

        if ($htmlToXml) {
            // Run the serialized markup through the regular html output fix-up.
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        // Plain xml: decode entities, then put the preserved replacements back.
        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity)
        );
    }
833
834
    /**
835
     * @param string $selector
836
     * @param int    $idx
837
     *
838
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
839
     */
840 3
    public function __invoke($selector, $idx = null)
    {
        // Calling the parser object like a function forwards to find().
        $found = $this->find($selector, $idx);

        return $found;
    }
844
845
    /**
846
     * @return bool
847
     */
848 122
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        // Read-only accessor for the "created without head wrapper" flag.
        $withoutHeadWrapper = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $withoutHeadWrapper;
    }
852
853
    /**
854
     * @return bool
855
     */
856 122
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        // Read-only accessor for the "created without p-tag wrapper" flag.
        $withoutPTagWrapper = $this->isDOMDocumentCreatedWithoutPTagWrapper;

        return $withoutPTagWrapper;
    }
860
861
    /**
862
     * @return bool
863
     */
864 122
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        // Read-only accessor for the "created without html" flag.
        $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

        return $withoutHtml;
    }
868
869
    /**
870
     * @return bool
871
     */
872 122
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        // Read-only accessor for the "created without body wrapper" flag.
        $withoutBodyWrapper = $this->isDOMDocumentCreatedWithoutBodyWrapper;

        return $withoutBodyWrapper;
    }
876
877
    /**
878
     * @return bool
879
     */
880 122
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        // Read-only accessor for the "created without html wrapper" flag.
        $withoutHtmlWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $withoutHtmlWrapper;
    }
884
885
    /**
886
     * @return bool
887
     */
888 122
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        // Read-only accessor for the "created without wrapper" flag.
        $withoutWrapper = $this->isDOMDocumentCreatedWithoutWrapper;

        return $withoutWrapper;
    }
892
893
    /**
894
     * @return bool
895
     */
896 122
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        // Read-only accessor for the "created with fake end script" flag.
        $withFakeEndScript = $this->isDOMDocumentCreatedWithFakeEndScript;

        return $withFakeEndScript;
    }
900
901
    /**
902
     * @param string $html
903
     *
904
     * @return string
905
     */
906 3
    protected function keepBrokenHtml(string $html): string
    {
        // Placeholder tokens standing in for "</", "<" and ">" while element pairs are masked.
        $maskTokens = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $markupTokens = ['</', '<', '>'];

        // Pass 1: rewrite the brackets of an element whose opening tag has a matching
        // closing tag into placeholder tokens; repeat until the markup stops changing.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );

            // preg_replace_callback() returns null when PCRE fails; casting that to string
            // would silently empty the markup, so keep the last good state and stop this pass.
            if ($masked === null) {
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // Pass 2: fragments that still match the "broken" pattern are recorded in the static
        // registry ('orig' => fragment, 'tmp' => crc32-based key) and replaced by that key.
        do {
            $original = $html;

            $swapped = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($maskTokens, $markupTokens) {
                    // Restore real brackets inside the fragment before it is stored.
                    $matches['broken'] = \str_replace($maskTokens, $markupTokens, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            // Same guard as above: never replace the markup with '' on a PCRE failure.
            if ($swapped === null) {
                break;
            }

            $html = $swapped;
        } while ($original !== $html);

        // Turn the remaining placeholder tokens back into real brackets.
        return \str_replace($maskTokens, $markupTokens, $html);
    }
951
952
    /**
953
     * @param string $html
954
     *
955
     * @return void
956
     */
957 6
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        // Each configured type is regex-quoted and joined into one alternation.
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        $replaced = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        // Record the script body in the static registry and put a
                        // crc32-based key in its place; the <script> tags themselves stay.
                        self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                        self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // No template logic found: rename the element from "script" to the helper tag name,
                // first in the opening tag, then in the closing tag.
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // preg_replace_callback() returns null when PCRE fails; casting that to string would
        // blank the caller's markup, so the by-reference $html is only updated on success.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
993
994
    /**
995
     * @param bool $keepBrokenHtml
996
     *
997
     * @return HtmlDomParser
998
     */
999 3
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        // Store the switch on this parser and return the parser itself for chaining.
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
1005
1006
    /**
1007
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
1008
     *
1009
     * @return HtmlDomParser
1010
     */
1011 2
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // Validate first: one non-string entry rejects the whole list and nothing is stored.
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
            if (\is_string($syntaxMarker)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        // Replace (not merge) the configured markers.
        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1023
1024
1025
    /**
1026
     * @param string[] $specialScriptTags
1027
     *
1028
     * @return HtmlDomParser
1029
     */
1030
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // Validate first: one non-string entry rejects the whole list and nothing is stored.
        foreach ($specialScriptTags as $candidate) {
            if (\is_string($candidate)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        // Replace (not merge) the configured script types.
        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1042
}
1043