Completed
Push — master ( 29d020...693ab0 )
by Lars
01:48
created

HtmlDomParser::__get()   B

Complexity

Conditions 7
Paths 7

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 7

Importance

Changes 0
Metric Value
dl 0
loc 18
ccs 12
cts 12
cp 1
rs 8.8333
c 0
b 0
f 0
cc 7
nc 7
nop 1
crap 7
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * The properties specified for each special script tag is an array.
60
     *
61
     * ```php
62
     * protected $specialScriptTags = [
63
     *     'text/html',
64
     *     'text/x-custom-template',
65
     *     'text/x-handlebars-template'
66
     * ]
67
     * ```
68
     *
69
     * @var string[]
70
     */
71
    protected $specialScriptTags = [
72
        'text/html',
73
        'text/x-custom-template',
74
        'text/x-handlebars-template',
75
    ];
76
77
    /**
78
     * @var bool
79
     */
80
    protected $isDOMDocumentCreatedWithoutHtml = false;
81
82
    /**
83
     * @var bool
84
     */
85
    protected $isDOMDocumentCreatedWithoutWrapper = false;
86
87
    /**
88
     * @var bool
89
     */
90
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
91
92
    /**
93
     * @var bool
94
     */
95
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
96
97
    /**
98
     * @var bool
99
     */
100
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
101
102
    /**
103
     * @var bool
104
     */
105
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
106
107
    /**
108
     * @var bool
109
     */
110
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
111
112
    /**
113
     * @var bool
114
     */
115
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
116
117
    /**
118
     * @var bool
119
     */
120
    protected $keepBrokenHtml;
121
122
    /**
123
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
124
     */
125 214 View Code Duplication
    public function __construct($element = null)
    {
        // Always start from a fresh, empty document configured for this parser.
        $this->document = new \DOMDocument('1.0', $this->getEncoding());
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // An element wrapper is unwrapped via getNode() before further inspection.
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        // DOM node input: deep-import it into our own document and stop here.
        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }

            return;
        }

        // No input: keep the empty document.
        if ($source === null) {
            return;
        }

        // Everything else is passed to loadHtml() as markup.
        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);
    }
153
154
    /**
155
     * @param string $name
156
     * @param array  $arguments
157
     *
158
     * @return bool|mixed
159
     */
160 76
    public function __call($name, $arguments)
    {
        // Aliases are looked up case-insensitively; the keys of $functionAliases are lower-case.
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        // Forward the call, with the caller's arguments, to the real method behind the alias.
        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
170
171
    /**
172
     * @param string $name
173
     * @param array  $arguments
174
     *
175
     * @throws \BadMethodCallException
176
     * @throws \RuntimeException
177
     *
178
     * @return HtmlDomParser
179
     */
180 27 View Code Duplication
    public static function __callStatic($name, $arguments)
    {
        // Static entry points "str_get_html" and "file_get_html": each creates a new
        // parser (late static binding) and loads the given source into it.
        // Missing arguments fall back to an empty source and no extra libxml options.
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($name === 'str_get_html') {
            return (new static())->loadHtml($source, $libXMLExtraOptions);
        }

        if ($name === 'file_get_html') {
            return (new static())->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // Name the offending method, as __call() does, so the failure can be diagnosed.
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
200
201
    /** @noinspection MagicMethodsValidityInspection */
202
203
    /**
204
     * @param string $name
205
     *
206
     * @return string|null
207
     */
208 15
    public function __get($name)
    {
        // Property names are matched case-insensitively.
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        // Any other property name yields null.
        return null;
    }
226
227
    /**
228
     * @return string
229
     */
230 20
    public function __toString()
    {
        // String conversion yields whatever html() returns with its default argument.
        $markup = $this->html();

        return $markup;
    }
234
235
    /**
236
     * does nothing (only for api-compatibility-reasons)
237
     *
238
     * @return bool
239
     *
240
     * @deprecated
241
     */
242 6
    public function clear(): bool
    {
        // Intentional no-op kept only for API compatibility; it always reports success.
        return true;
    }
246
247
    /**
248
     * Create DOMDocument from HTML.
249
     *
250
     * @param string   $html
251
     * @param int|null $libXMLExtraOptions
252
     *
253
     * @return \DOMDocument
254
     */
255 198
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Builds $this->document from $html. First records which structural parts
        // (html/head/body/p wrappers, leading comment, ...) the input contained,
        // then parses the input with SimpleXML and falls back to DOMDocument::loadHTML().
        // NOTE(review): the flags below are only ever raised in this method, never reset.

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // No tag at all vs. text that merely does not start with a tag.
        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped "<\/script>" is present, no real closing script tag.
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (\preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // Collect libxml errors internally while parsing; previous settings are restored at the end.
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and triggers a
        // deprecation notice there, so it is only called on older PHP versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        // Optional flags are added only when the running libxml build defines them.
        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Wrap the input in the helper element for these three cases.
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: SimpleXML, accepted only if it parsed without any libxml error.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // Second attempt: DOMDocument::loadHTML().
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the stripos() arguments look swapped (haystack/needle), which makes
            // this condition true for practically every input, so the prefix is always added.
            // Left as-is because correcting it would change parsing behaviour - confirm intent.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack (first processing-instruction child only)
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
414
415
    /**
416
     * Find list of nodes with a CSS selector.
417
     *
418
     * @param string   $selector
419
     * @param int|null $idx
420
     *
421
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
422
     */
423 145 View Code Duplication
    public function find(string $selector, $idx = null)
    {
        // Translate the CSS selector to XPath and run it against the current document.
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);
        $found = new SimpleHtmlDomNode();

        // A falsy query result is treated as "no matches".
        foreach ($matches ?: [] as $domNode) {
            $found[] = new SimpleHtmlDom($domNode);
        }

        // No index: return the whole list, or the blank list when nothing matched.
        if ($idx === null) {
            return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
        }

        // A negative index counts from the end of the list.
        $position = $idx < 0 ? \count($found) + $idx : $idx;

        // Single element, or the blank element when the position does not exist.
        return $found[$position] ?? new SimpleHtmlDomBlank();
    }
454
455
    /**
456
     * Find nodes with a CSS selector.
457
     *
458
     * @param string $selector
459
     *
460
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
461
     */
462 12
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        // A null index asks find() for the complete result list.
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
466
467
    /**
468
     * Find nodes with a CSS selector or false, if no element is found.
469
     *
470
     * @param string $selector
471
     *
472
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type false|SimpleHtmlDomInter...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 57. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
473
     */
474 4
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        // find() signals "nothing matched" with the blank list; map that to false.
        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
484
485
    /**
486
     * Find one node with a CSS selector.
487
     *
488
     * @param string $selector
489
     *
490
     * @return SimpleHtmlDomInterface
491
     */
492 32
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        // Index 0 asks find() for the first match only.
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
496
497
    /**
498
     * Find one node with a CSS selector or false, if no element is found.
499
     *
500
     * @param string $selector
501
     *
502
     * @return false|SimpleHtmlDomInterface
503
     */
504 6
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        // find() signals "no such element" with the blank element; map that to false.
        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
514
515
    /**
516
     * @param string $content
517
     * @param bool   $multiDecodeNewHtmlEntity
518
     *
519
     * @return string
520
     */
521 124
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...
        // The steps run in a fixed order; each one is gated by the matching
        // "created without ..." flag getter.

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Opening tag only at the very start; closing tags anywhere.
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        // Search => replacement pairs, applied in this order: closing tags of void
        // elements and the internal helper tags are dropped, doubled <head> tags collapsed.
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '</area>'              => '',
            '</base>'              => '',
            '</br>'                => '',
            '</col>'               => '',
            '</command>'           => '',
            '</embed>'             => '',
            '</hr>'                => '',
            '</img>'               => '',
            '</input>'             => '',
            '</keygen>'            => '',
            '</link>'              => '',
            '</meta>'              => '',
            '</param>'             => '',
            '</source>'            => '',
            '</track>'             => '',
            '</wbr>'               => '',
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanup), \array_values($cleanup), $content)
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
655
656
    /**
657
     * Return elements by ".class".
658
     *
659
     * @param string $class
660
     *
661
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
662
     */
663
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // Builds the ".class" selector by concatenation: the former "${class}"
        // interpolation form is deprecated as of PHP 8.2.
        return $this->findMulti('.' . $class);
    }
667
668
    /**
669
     * Return element by #id.
670
     *
671
     * @param string $id
672
     *
673
     * @return SimpleHtmlDomInterface
674
     */
675 3
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // Builds the "#id" selector by concatenation: the former "${id}"
        // interpolation form is deprecated as of PHP 8.2.
        return $this->findOne('#' . $id);
    }
679
680
    /**
681
     * Return element by tag name.
682
     *
683
     * @param string $name
684
     *
685
     * @return SimpleHtmlDomInterface
686
     */
687 1
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        // Only the first element with this tag name is of interest.
        $first = $this->document->getElementsByTagName($name)->item(0);

        // item() gives null when there is no such element; answer with the blank element then.
        return $first === null ? new SimpleHtmlDomBlank() : new SimpleHtmlDom($first);
    }
697
698
    /**
699
     * Returns elements by "#id".
700
     *
701
     * @param string   $id
702
     * @param int|null $idx
703
     *
704
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
705
     */
706
    public function getElementsById(string $id, $idx = null)
    {
        // Builds the "#id" selector by concatenation: the former "${id}"
        // interpolation form is deprecated as of PHP 8.2.
        return $this->find('#' . $id, $idx);
    }
710
711
    /**
712
     * Returns elements by tag name.
713
     *
714
     * @param string   $name
715
     * @param int|null $idx
716
     *
717
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
718
     */
719 6
    public function getElementsByTagName(string $name, $idx = null)
    {
        // Wrap every element with the given tag name found in the current document.
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements (or the blank list when nothing matched)
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values: count from the end of the list
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a missing position yields the blank *element*
        // (not the blank list), matching what find() returns for a single index
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
746
747
    /**
748
     * Get dom node's outer html.
749
     *
750
     * @param bool $multiDecodeNewHtmlEntity
751
     *
752
     * @return string
753
     */
754 91
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        // Give a registered callback the chance to act before the document is serialised.
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // Without an <html> wrapper in the input only the document element is
        // serialised; otherwise the whole document is.
        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        // saveHTML() reports failure with false; that becomes an empty string.
        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
772
773
    /**
774
     * Load HTML from string.
775
     *
776
     * @param string   $html
777
     * @param int|null $libXMLExtraOptions
778
     *
779
     * @return HtmlDomParser
780
     */
781 198
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // Reset the shared static helper before parsing new input.
        self::$domBrokenReplaceHelper = [];

        // Parse the markup and make the result this parser's document.
        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
790
791
    /**
     * Load HTML from file.
     *
     * <p>A path that does not start with "http://" or "https://" must exist on
     * disk; URLs skip the existence check. The content is read through
     * "UTF8::file_get_contents()" when that class is available, otherwise through
     * the native "file_get_contents()", and then handed to "loadHtml()".</p>
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException
     *                           <p>If the local file does not exist, or the content could not be read.</p>
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": same output, but the latter is deprecated since PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // chain the original exception so the root cause stays visible to the caller
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
831
832
    /**
     * Serialize the current document as XML.
     *
     * <p>The document is dumped via "saveXML()"; an empty string is returned
     * when that dump fails.</p>
     *
     * @param bool $multiDecodeNewHtmlEntity
     *                                       <p>Passed through to the output-fixing / entity-decoding helper.</p>
     * @param bool $htmlToXml
     *                                       <p>If true the dump goes through "fixHtmlOutput()", otherwise it is only
     *                                       entity-decoded and the preserved replacements are put back.</p>
     * @param bool $removeXmlHeader
     *                                       <p>If true the XML declaration is stripped and leading whitespace is trimmed.</p>
     * @param int  $options
     *                                       <p>libxml options handed to "saveXML()".</p>
     *
     * @return string
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $dump = $this->document->saveXML(null, $options);
        if ($dump === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $dump);
            $dump = \ltrim((string) $withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($dump, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($dump, $multiDecodeNewHtmlEntity);
    }
867
868
    /**
     * Make the parser object callable: the call is forwarded unchanged to "find()".
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
878
879
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
886
887
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
894
895
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
902
903
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
910
911
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
918
919
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
926
927
    /**
     * Read accessor for the "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * @return bool
     *              <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
934
935
    /**
     * Swap broken closing-tag fragments for tokens and remember the originals.
     *
     * <p>Works in three steps:</p>
     * <ol>
     *   <li>Repeatedly mask the angle brackets of every opening tag that has a
     *       closing tag of the same name, using temporary placeholders, until
     *       nothing changes anymore.</li>
     *   <li>Repeatedly match the closing-tag fragments that are still written
     *       with real angle brackets, store each fragment in
     *       "self::$domBrokenReplaceHelper['orig']" and its token (helper prefix
     *       + crc32 of the fragment) in "self::$domBrokenReplaceHelper['tmp']",
     *       and put the token into the html instead.</li>
     *   <li>Turn the remaining placeholders back into real angle brackets.</li>
     * </ol>
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $ltClosePlaceholder = '°lt/_simple_html_dom__voku_°';
        $ltPlaceholder = '°lt_simple_html_dom__voku_°';
        $gtPlaceholder = '°gt_simple_html_dom__voku_°';

        $placeholders = [$ltClosePlaceholder, $ltPlaceholder, $gtPlaceholder];
        $brackets = ['</', '<', '>'];

        // step 1: mask matching open/close tag pairs until the html is stable
        while (true) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($m) use ($ltClosePlaceholder, $ltPlaceholder, $gtPlaceholder) {
                    $maskedOpenTag = $ltPlaceholder . $m['element_start'] . $m['element_start_addon'] . $gtPlaceholder;
                    $maskedCloseTag = $ltClosePlaceholder . $m['element_end'] . $gtPlaceholder;

                    return $m['start'] . $maskedOpenTag . $m['value'] . $maskedCloseTag . $m['end'];
                },
                $html
            );

            if ($before === $html) {
                break;
            }
        }

        // step 2: replace the left-over (unmasked) closing-tag fragments by tokens
        while (true) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($m) use ($placeholders, $brackets) {
                    // the fragment may contain placeholders from step 1, store it with real brackets
                    $brokenFragment = \str_replace($placeholders, $brackets, $m['broken']);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($brokenFragment);

                    self::$domBrokenReplaceHelper['orig'][] = $brokenFragment;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $m['start'] . $token . $m['end'];
                },
                $html
            );

            if ($before === $html) {
                break;
            }
        }

        // step 3: unmask everything that is still a placeholder
        return \str_replace($placeholders, $brackets, $html);
    }
985
986
    /**
     * Protect the content of "special" script tags (matched by their type attribute).
     *
     * <p>For every "script" element whose "type" matches one of
     * "$this->specialScriptTags":</p>
     * <ul>
     *   <li>if its content contains one of the configured template-logic syntax
     *       markers, the content is stored in "self::$domBrokenReplaceHelper"
     *       and replaced by a token (helper prefix + crc32 of the content);</li>
     *   <li>otherwise the "script" open/close tags are renamed to the special
     *       script helper tag name.</li>
     * </ul>
     *
     * @param string $html
     *                     <p>Modified in place (passed by reference).</p>
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $quotedTypes = [];
        foreach ($this->specialScriptTags as $scriptType) {
            $quotedTypes[] = \preg_quote($scriptType, '/');
        }
        $tags = \implode('|', $quotedTypes);

        $pattern = '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU';

        $html = (string) \preg_replace_callback(
            $pattern,
            function ($m) {
                $inner = $m['innerContent'];

                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
                    if (\strpos($inner, $syntaxMarker) === false) {
                        continue;
                    }

                    // remove the html5 fallback
                    $inner = \str_replace('<\/', '</', $inner);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($inner);

                    self::$domBrokenReplaceHelper['orig'][] = $inner;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $m['start'] . $token . $m['end'];
                }

                // remove the html5 fallback
                $wholeElement = \str_replace('<\/', '</', $m[0]);

                // swap the leading "<script" for the helper tag name ...
                $renamedOpen = '<' . self::$domHtmlSpecialScriptHelper . \substr($wholeElement, \strlen('<script'));

                // ... and the trailing "</script>" for the matching helper end tag
                return \substr($renamedOpen, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );
    }
1028
1029
    /**
     * Set the "keepBrokenHtml" option (fluent setter).
     *
     * @param bool $keepBrokenHtml
     *                             <p>New value of the option.</p>
     *
     * @return HtmlDomParser
     *                       <p>The same instance, for method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1040
1041
    /**
     * Replace the list of template-logic syntax markers that are looked up
     * inside special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException
     *                                   <p>If any entry of the array is not a string; the current list is left untouched.</p>
     *
     * @return HtmlDomParser
     *                       <p>The same instance, for method chaining.</p>
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // validate every entry before anything is stored
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
            if (\is_string($syntaxMarker)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1058
1059
    /**
     * Replace the list of script "type" values that are treated as special
     * script tags.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException
     *                                   <p>If any entry of the array is not a string; the current list is left untouched.</p>
     *
     * @return HtmlDomParser
     *                       <p>The same instance, for method chaining.</p>
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // validate every entry before anything is stored
        foreach ($specialScriptTags as $scriptType) {
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1076
}
1077