Completed
Push — master ( 444898...1d5830 )
by Lars
01:59
created

HtmlDomParser::__invoke()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * The properties specified for each special script tag is an array.
60
     *
61
     * ```php
62
     * protected $specialScriptTags = [
63
     *     'text/html',
64
     *     'text/x-custom-template',
65
     *     'text/x-handlebars-template'
66
     * ]
67
     * ```
68
     *
69
     * @var string[]
70
     */
71
    protected $specialScriptTags = [
72
        'text/html',
73
        'text/x-custom-template',
74
        'text/x-handlebars-template',
75
    ];
76
77
    /**
78
     * @var bool
79
     */
80
    protected $isDOMDocumentCreatedWithoutHtml = false;
81
82
    /**
83
     * @var bool
84
     */
85
    protected $isDOMDocumentCreatedWithoutWrapper = false;
86
87
    /**
88
     * @var bool
89
     */
90
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
91
92
    /**
93
     * @var bool
94
     */
95
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
96
97
    /**
98
     * @var bool
99
     */
100
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
101
102
    /**
103
     * @var bool
104
     */
105
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
106
107
    /**
108
     * @var bool
109
     */
110
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
111
112
    /**
113
     * @var bool
114
     */
115
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
116
117
    /**
118
     * @var bool
119
     */
120
    protected $keepBrokenHtml;
121
122
    /**
     * Set up the internal DOMDocument and optionally populate it.
     *
     * <p>A SimpleHtmlDomInterface is unwrapped via getNode(); a \DOMNode is deep-imported
     * and appended to the fresh document; any other non-null value is handed to loadHtml().</p>
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        // DOMDocument settings
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep copy, so the node belongs to this parser's own document
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
153
154
    /**
     * Dispatch legacy method names (case-insensitive) to their current implementation.
     *
     * <p>The lookup table is self::$functionAliases.</p>
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);
        $target = self::$functionAliases[$alias] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
170
171
    /**
     * Static factory aliases: "str_get_html" and "file_get_html".
     *
     * <p>The first argument (default: empty string) is the HTML string or the file path,
     * the second (default: null) is forwarded as extra libxml options.</p>
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if $name is neither "str_get_html" nor "file_get_html"
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($name === 'str_get_html') {
            return (new static())->loadHtml($source, $libXMLExtraOptions);
        }

        if ($name === 'file_get_html') {
            return (new static())->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // include the offending name, consistent with the message thrown by __call()
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
200
201
    /** @noinspection MagicMethodsValidityInspection */
202
203
    /**
     * Magic read access for the legacy properties (names are matched case-insensitively).
     *
     * <p>"outerhtml" / "outertext" map to html(), "innerhtml" / "innertext" to innerHtml(),
     * "text" / "plaintext" to text(). Any other name yields null.</p>
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
226
227
    /**
     * String conversion: the result of html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
234
235
    /**
     * No-op that is kept only for API compatibility; it always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
246
247
    /**
     * Create DOMDocument from HTML.
     *
     * <p>First the raw markup is inspected and the isDOMDocumentCreated* flags are set
     * for every wrapper that is absent from the input. Then the markup is parsed:
     * strictly via SimpleXML first and, if that fails or reports libxml errors,
     * leniently via DOMDocument::loadHTML().</p>
     *
     * <p>The libxml error handling (and, before PHP 8.0, the entity loader) is switched
     * for the duration of the parsing and restored afterwards, even if parsing throws.</p>
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the default options
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only an escaped end-tag ("<\/script>") is present, no real "</script>"
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only used on older versions
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->isDOMDocumentCreatedWithCommentWrapper
                ||
                $this->keepBrokenHtml
            ) {
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            // 1st try: strict XML parsing, accepted only if libxml reported no errors at all
            $documentFound = false;
            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $domElementTmp = \dom_import_simplexml($sxe);
                if (
                    $domElementTmp
                    &&
                    $domElementTmp->ownerDocument
                ) {
                    $documentFound = true;
                    $this->document = $domElementTmp->ownerDocument;
                }
            }

            // 2nd try: lenient HTML parsing
            if ($documentFound === false) {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                //
                // The declaration is only prepended when the markup does not already start with one;
                // stripos() takes the haystack first, then the needle.
                $xmlHackUsed = false;
                if (\stripos($html, '<?xml') !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            /** @noinspection UnusedFunctionResultInspection */
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);
            if ($disableEntityLoader !== null) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
407
408
    /**
     * Find list of nodes with a CSS selector.
     *
     * <p>The selector is converted to XPath via SelectorConverter::toXPath() and run against
     * the current document. Without $idx the whole result list is returned (a
     * SimpleHtmlDomNodeBlank if it is empty); with $idx a single element is returned
     * (a SimpleHtmlDomBlank if that position does not exist).</p>
     *
     * @param string   $selector
     * @param int|null $idx      position of the wanted element, negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $elements = new SimpleHtmlDomNode();
        if ($matches instanceof \DOMNodeList) {
            foreach ($matches as $match) {
                $elements[] = new SimpleHtmlDom($match);
            }
        }

        // return all elements
        if ($idx === null) {
            return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
        }

        // handle negative values
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // return one element
        return $elements[$position] ?? new SimpleHtmlDomBlank();
    }
447
448
    /**
     * Find nodes with a CSS selector.
     *
     * <p>Delegates to find() without an index, i.e. the whole result list is returned.</p>
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector, null);

        return $matches;
    }
459
460
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * <p>"Not found" is detected by find() handing back a SimpleHtmlDomNodeBlank.</p>
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
477
478
    /**
     * Find one node with a CSS selector.
     *
     * <p>Delegates to find() with index 0, i.e. the first match is returned.</p>
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
489
490
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * <p>"Not found" is detected by find() handing back a SimpleHtmlDomBlank.</p>
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
507
508
    /**
     * Clean up serialized HTML.
     *
     * <p>Depending on the getIsDOMDocumentCreated*() flags, wrapper tags are stripped from
     * the output again; afterwards the internal helper tags are removed, HTML entities are
     * decoded and the preserved entities are put back.</p>
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // the opening tag is only removed at the very start, closing tags everywhere
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // search => replacement, applied in this order
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];

        $content = \str_replace(\array_keys($cleanupMap), \array_values($cleanupMap), $content);
        $content = \trim($content);

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
619
620
    /**
     * Return elements by ".class".
     *
     * <p>Shortcut for findMulti() with the class name turned into a CSS class selector.</p>
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation syntax is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
631
632
    /**
     * Return element by #id.
     *
     * <p>Shortcut for findOne() with the id turned into a CSS id selector.</p>
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation syntax is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
643
644
    /**
     * Return element by tag name.
     *
     * <p>Only the first element with that tag name is used; a SimpleHtmlDomBlank is
     * returned if there is none.</p>
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstMatch);
    }
661
662
    /**
     * Returns elements by "#id".
     *
     * <p>Shortcut for find() with the id turned into a CSS id selector.</p>
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx forwarded to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation syntax is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
674
675
    /**
     * Returns elements by tag name.
     *
     * <p>Without $idx the whole result list is returned (a SimpleHtmlDomNodeBlank if it is
     * empty); with $idx a single element is returned (a SimpleHtmlDomBlank if that position
     * does not exist), which mirrors find().</p>
     *
     * @param string   $name
     * @param int|null $idx  position of the wanted element, negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: a missing element is represented by the single-element
        // placeholder (as in find()), not by the list placeholder
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
710
711
    /**
     * Get dom node's outer html.
     *
     * <p>If static::$callback is set, it is invoked first with an array that contains this
     * parser. The document is then serialized and passed through fixHtmlOutput(); an empty
     * string is returned if the serialization fails.</p>
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // without an <html> wrapper in the input, only the root element is serialized
        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
736
737
    /**
738
     * Load HTML from string.
739
     *
740
     * @param string   $html
741
     * @param int|null $libXMLExtraOptions
742
     *
743
     * @return HtmlDomParser
744
     */
745 196
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
746
    {
747
        // reset
748 196
        self::$domBrokenReplaceHelper = [];
749
750 196
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);
751
752 196
        return $this;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this; (voku\helper\HtmlDomParser) is incompatible with the return type declared by the interface voku\helper\DomParserInterface::loadHtml of type self.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
753
    }
754
755
    /**
     * Load HTML from a file path or an http(s) URL.
     *
     * Local paths are checked with file_exists() first; paths starting with
     * "http://" or "https://" skip that check. The content is read via
     * UTF8::file_get_contents() when the "\voku\helper\UTF8" class is available,
     * otherwise via the native file_get_contents(), and is then handed to loadHtml().
     *
     * @param string   $filePath           local path or http(s) URL
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException if a local file does not exist, if reading throws,
     *                           or if the read returns false
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" replaces the "${var}" form, which is deprecated since PHP 8.2;
            // the resulting message is identical.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Keep the original failure reachable via getPrevious() instead of dropping it.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
795
796
    /**
     * Serialize the current document as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput() /
     *                                       decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process the output with fixHtmlOutput();
     *                                       false: only decode entities and put the preserved
     *                                       replacements back
     * @param bool $removeXmlHeader          strip "<?xml ... ?>" declarations and left-trim
     *                                       the result
     * @param int  $options                  options handed to the document's saveXML() call
     *
     * @return string the XML string, or an empty string when saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
831
832
    /**
     * Allow the parser object to be called like a function; the call is
     * forwarded to find() with the same arguments.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
842
843
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded markup
     * had no head wrapper; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
850
851
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded markup
     * had no wrapping p-tag; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
858
859
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded input
     * contained no html; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
866
867
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded markup
     * had no body wrapper; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
874
875
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded markup
     * had no html wrapper; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
882
883
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): judging by the name, the flag records that the loaded markup
     * had no wrapping element; where it is set is not visible here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
890
891
    /**
     * Read accessor for the "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * NOTE(review): judging by the name, the flag records that a fake script end
     * was involved while creating the document; where it is set is not visible
     * here - confirm.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
898
899
    /**
     * Swap broken markup fragments for placeholders and remember the originals.
     *
     * Works in three steps:
     *  1. Repeatedly mask matched "<tag ...>...</tag>" pairs by replacing their
     *     "<", "</" and ">" with placeholder tokens, until nothing changes anymore.
     *  2. Repeatedly replace every fragment captured by the "broken" group of the
     *     second pattern with a placeholder (helper prefix + crc32 of the fragment).
     *     The un-masked fragment is appended to self::$domBrokenReplaceHelper['orig']
     *     and its placeholder to self::$domBrokenReplaceHelper['tmp'].
     *  3. Turn all remaining placeholder tokens back into "<", "</" and ">".
     *
     * @param string $html
     *
     * @return string the markup with broken fragments replaced by placeholders
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskOpen = '°lt_simple_html_dom__voku_°';
        $maskClose = '°lt/_simple_html_dom__voku_°';
        $maskEnd = '°gt_simple_html_dom__voku_°';

        $maskTokens = [$maskClose, $maskOpen, $maskEnd];
        $plainTokens = ['</', '<', '>'];

        // Step 1: mask matched element pairs; repeat until a pass changes nothing.
        $before = null;
        while ($before !== $html) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($found) use ($maskOpen, $maskClose, $maskEnd) {
                    $maskedOpeningTag = $maskOpen . $found['element_start'] . $found['element_start_addon'] . $maskEnd;
                    $maskedClosingTag = $maskClose . $found['element_end'] . $maskEnd;

                    return $found['start'] . $maskedOpeningTag . $found['value'] . $maskedClosingTag . $found['end'];
                },
                $html
            );
        }

        // Step 2: whatever the second pattern still captures as "broken" is stored
        // and replaced by a placeholder; repeat until a pass changes nothing.
        $before = null;
        while ($before !== $html) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($found) use ($maskTokens, $plainTokens) {
                    // The fragment may contain tokens masked in step 1; store it un-masked.
                    $brokenFragment = \str_replace($maskTokens, $plainTokens, $found['broken']);
                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($brokenFragment);

                    self::$domBrokenReplaceHelper['orig'][] = $brokenFragment;
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $found['start'] . $placeholder . $found['end'];
                },
                $html
            );
        }

        // Step 3: un-mask everything that was only masked temporarily.
        return \str_replace($maskTokens, $plainTokens, $html);
    }
949
950
    /**
     * Protect the content of "special" script tags; $html is modified in place.
     *
     * A script tag is handled here when its "type" attribute matches one of the
     * entries in $this->specialScriptTags, e.g.
     * [<script id="elements-image-1" type="text/html">...</script>].
     *
     * - If the tag content contains one of the configured template-logic markers
     *   (like [<% _.each(tierPrices, function(item, key) { %>]), the content is
     *   stored in self::$domBrokenReplaceHelper and replaced by a placeholder,
     *   because such templates often do not look like valid html.
     * - Otherwise the "<script" / "</script>" pair is renamed to the tag name held
     *   in self::$domHtmlSpecialScriptHelper.
     *
     * In both cases the html5 fallback "<\/" is turned back into "</".
     *
     * @param string $html passed by reference
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $quotedTypes = [];
        foreach ($this->specialScriptTags as $scriptType) {
            $quotedTypes[] = \preg_quote($scriptType, '/');
        }
        $tags = \implode('|', $quotedTypes);

        $pattern = '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU';

        $html = (string) \preg_replace_callback(
            $pattern,
            function ($found) {
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicMarker) {
                    if (\strpos($found['innerContent'], $logicMarker) === false) {
                        continue;
                    }

                    // remove the html5 fallback
                    $templateContent = \str_replace('<\/', '</', $found['innerContent']);
                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($templateContent);

                    self::$domBrokenReplaceHelper['orig'][] = $templateContent;
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $found['start'] . $placeholder . $found['end'];
                }

                // No template logic found: rename the script tag itself.
                // remove the html5 fallback
                $wholeTag = \str_replace('<\/', '</', $found[0]);

                $withRenamedStart = '<' . self::$domHtmlSpecialScriptHelper . \substr($wholeTag, \strlen('<script'));
                $withoutScriptEnd = \substr($withRenamedStart, 0, -\strlen('</script>'));

                return $withoutScriptEnd . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );
    }
992
993
    /**
     * Set the "keepBrokenHtml" option of this parser (fluent setter).
     *
     * @param bool $keepBrokenHtml new value of the option
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1004
1005
    /**
     * Replace the list of template-logic markers that are looked for inside
     * special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        $invalidEntries = \array_filter(
            $templateLogicSyntaxInSpecialScriptTags,
            static function ($entry) {
                return !\is_string($entry);
            }
        );

        // Nothing is stored unless every entry is a string.
        if ($invalidEntries !== []) {
            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1022
1023
    /**
     * Replace the list of script "type" values that are treated as special
     * script tags.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        $invalidEntries = \array_filter(
            $specialScriptTags,
            static function ($entry) {
                return !\is_string($entry);
            }
        );

        // Nothing is stored unless every entry is a string.
        if ($invalidEntries !== []) {
            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1040
}
1041