Completed
Pull Request — master (#58)
by
unknown
04:25
created

HtmlDomParser::getElementsById()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 2
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
use DOMDocument;
8
use SimpleXMLElement;
9
use function array_map;
10
use function count;
11
use function defined;
12
use function dom_import_simplexml;
13
use function libxml_clear_errors;
14
use function libxml_disable_entity_loader;
15
use function libxml_get_errors;
16
use function libxml_use_internal_errors;
17
use function ltrim;
18
use function preg_match;
19
use function simplexml_load_string;
20
use function str_replace;
21
use function stripos;
22
use function strpos;
23
use function trim;
24
use const LIBXML_BIGLINES;
25
use const LIBXML_COMPACT;
26
use const LIBXML_DTDATTR;
27
use const LIBXML_DTDLOAD;
28
use const LIBXML_HTML_NODEFDTD;
29
use const LIBXML_NONET;
30
use const PHP_VERSION_ID;
31
32
/**
33
 * @property-read string $outerText
34
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
35
 * @property-read string $outerHtml
36
 *                                 <p>Get dom node's outer html.</p>
37
 * @property-read string $innerText
38
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
39
 * @property-read string $innerHtml
40
 *                                 <p>Get dom node's inner html.</p>
41
 * @property-read string $plaintext
42
 *                                 <p>Get dom node's plain text.</p>
43
 *
44
 * @method string outerText()
45
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
46
 * @method string outerHtml()
47
 *                                 <p>Get dom node's outer html.</p>
48
 * @method string innerText()
49
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
50
 * @method HtmlDomParser load(string $html)
51
 *                                 <p>Load HTML from string.</p>
52
 * @method HtmlDomParser load_file(string $html)
53
 *                                 <p>Load HTML from file.</p>
54
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
55
 *                                 <p>Load HTML from file.</p>
56
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
57
 *                                 <p>Load HTML from string.</p>
58
 */
59
class HtmlDomParser extends AbstractDomParser
60
{
61
    /**
62
     * @var string[]
63
     */
64
    protected static $functionAliases = [
65
        'outertext' => 'html',
66
        'outerhtml' => 'html',
67
        'innertext' => 'innerHtml',
68
        'innerhtml' => 'innerHtml',
69
        'load'      => 'loadHtml',
70
        'load_file' => 'loadHtmlFile',
71
    ];
72
73
    /**
74
     * @var string[]
75
     */
76
    protected $templateLogicSyntaxInSpecialScriptTags = [
77
        '+',
78
        '<%',
79
        '{%',
80
        '{{',
81
    ];
82
83
    /**
84
     * The properties specified for each special script tag is an array.
85
     *
86
     * ```php
87
     * protected $specialScriptTags = [
88
     *     'text/html',
89
     *     'text/x-custom-template',
90
     *     'text/x-handlebars-template'
91
     * ]
92
     * ```
93
     *
94
     * @var string[]
95
     */
96
    protected $specialScriptTags = [
97
        'text/html',
98
        'text/x-custom-template',
99
        'text/x-handlebars-template',
100
    ];
101
102
    /**
103
     * @var string[]
104
     */
105
    protected $selfClosingTags = [
106
        'area',
107
        'base',
108
        'br',
109
        'col',
110
        'command',
111
        'embed',
112
        'hr',
113
        'img',
114
        'input',
115
        'keygen',
116
        'link',
117
        'meta',
118
        'param',
119
        'source',
120
        'track',
121
        'wbr',
122
    ];
123
124
    /**
125
     * @var bool
126
     */
127
    protected $isDOMDocumentCreatedWithoutHtml = false;
128
129
    /**
130
     * @var bool
131
     */
132
    protected $isDOMDocumentCreatedWithoutWrapper = false;
133
134
    /**
135
     * @var bool
136
     */
137
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
138
139
    /**
140
     * @var bool
141
     */
142
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
143
144
    /**
145
     * @var bool
146
     */
147
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
148
149
    /**
150
     * @var bool
151
     */
152
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
153
154
    /**
155
     * @var bool
156
     */
157
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
158
159
    /**
160
     * @var bool
161
     */
162
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
163
164
    /**
165
     * @var bool
166
     */
167
    protected $keepBrokenHtml = false;
168
169
    /**
     * Set up a new DOMDocument and optionally fill it from the given element.
     *
     * A SimpleHtmlDomInterface is first unwrapped via getNode(). A \DOMNode is
     * deep-imported and appended to the new document; any other non-null value
     * is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep copy of the foreign node into our own document
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
200
201
    /**
     * Forward calls to legacy method names listed in self::$functionAliases.
     *
     * The lookup is case-insensitive because the name is lower-cased first.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
217
218
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The first argument (default '') is the HTML string or file path, the
     * second (default null) is passed on as the extra libxml options.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXmlOptions = $arguments[1] ?? null;

        switch ($name) {
            case 'str_get_html':
                return (new static())->loadHtml($source, $libXmlOptions);

            case 'file_get_html':
                return (new static())->loadHtmlFile($source, $libXmlOptions);

            default:
                throw new \BadMethodCallException('Method does not exist');
        }
    }
247
248
    /** @noinspection MagicMethodsValidityInspection */

    /**
     * Resolve the read-only magic properties (case-insensitive).
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
     * innerHtml() and "text"/"plaintext" to text(); anything else yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
273
274
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
281
282
    /**
     * No-op that always reports success; it exists only for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
293
294
  /**
   * Create DOMDocument from HTML.
   *
   * Records which wrapper tags (html / head / body / p, ...) the input was
   * missing, normalises the markup, then first tries simplexml_load_string()
   * and falls back to DOMDocument::loadHTML() when that fails or libxml
   * reported errors. The "isDOMDocumentCreated..." flags are only ever
   * switched on in this method, never reset.
   *
   * @param string|null $html               markup to parse; an empty() value yields a new, empty DOMDocument
   * @param int|null    $libXMLExtraOptions extra LIBXML_* flags that are OR-ed into the default options
   *
   * @return DOMDocument
   */
    protected function createDOMDocument(?string $html, $libXMLExtraOptions = null): DOMDocument
    {
        if (empty($html)) {
            return new DOMDocument();
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                trim($matches_before_doctype[1])
            ) {
                $html = str_replace($matches_before_doctype[1], '', $html);
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(trim($html));
        }

        // no tag at all vs. text in front of the first tag
        if (strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (strpos(ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (strpos(ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            strpos($html, '<html ') === false
            &&
            strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            strpos($html, '<body ') === false
            &&
            strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            strpos($html, '<head ') === false
            &&
            strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            strpos($html, '<p ') === false
            &&
            strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only an escaped closing script tag ("<\/script>") is present
        if (
            strpos($html, '</script>') === false
            &&
            strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // move content that follows "</html>" in front of the closing tag
        if (stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                trim($matches_after_html[1])
            ) {
                $html = str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (strpos($html, '<script') !== false) {
            // NOTE(review): the return values are ignored, so both helpers presumably
            // take $html by reference - confirm against the parent class.
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // turn "<br>" & co. into their self-closed form "<br/>"
        $html = str_replace(
            array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = libxml_use_internal_errors(true);
        // Stays null on PHP >= 8.0, where libxml_disable_entity_loader() is deprecated;
        // otherwise it holds the previous setting so that it can be restored below.
        $disableEntityLoader = null;
        if (PHP_VERSION_ID < 80000) {
            $disableEntityLoader = libxml_disable_entity_loader(true);
        }
        libxml_clear_errors();

        $optionsXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

        if (defined('LIBXML_BIGLINES')) {
            $optionsXml |= LIBXML_BIGLINES;
        }

        if (defined('LIBXML_COMPACT')) {
            $optionsXml |= LIBXML_COMPACT;
        }

        if (defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // wrap the input into the helper tag, so that the parser sees one root element
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: strict XML parsing, accepted only if libxml reported no error
        $documentFound = false;
        $sxe = simplexml_load_string($html, SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && count(libxml_get_errors()) === 0) {
            $domElementTmp = dom_import_simplexml($sxe);
            if (
                $domElementTmp
                &&
                $domElementTmp->ownerDocument
            ) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // second attempt: the lenient HTML parser
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): haystack and needle look swapped here, so the condition holds
            // for nearly every input and the hack is applied almost always. It is kept
            // as it is on purpose, because changing it would alter the parsing result.
            /** @noinspection StringFragmentMisplacedInspection */
            if (stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        libxml_clear_errors();
        libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
500
501
    /**
     * Find list of nodes with a CSS selector.
     *
     * Without an index the whole result list is returned (a SimpleHtmlDomNodeBlank
     * if nothing matched). With an index a single element is returned, or a
     * SimpleHtmlDomBlank if that position does not exist; a negative index
     * counts from the end of the list.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($xPathQuery);

        $elements = new SimpleHtmlDomNode();
        // query() may yield a falsy result, which is treated as "no match"
        foreach ($matches ?: [] as $match) {
            $elements[] = new SimpleHtmlDom($match);
        }

        // return one element
        if ($idx !== null) {
            $position = $idx < 0 ? count($elements) + $idx : $idx;

            return $elements[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
    }
540
541
    /**
     * Find nodes with a CSS selector.
     *
     * Shortcut for find() without an index.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector);

        return $matches;
    }
552
553
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "Nothing found" is detected by find() handing back a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
570
571
    /**
     * Find one node with a CSS selector.
     *
     * Shortcut for find() with index 0.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
582
583
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "Nothing found" is detected by find() handing back a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
600
601
    /**
     * Strip the wrapper tags that were not part of the parsed input from the
     * serialized output again, then decode the html entities.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // a leading "<p>" and every "</p>" are dropped, one pattern after the other
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $closingTags = [];
        foreach ($this->selfClosingTags as $tagName) {
            $closingTags[] = '</' . $tagName . '>';
        }
        $content = str_replace($closingTags, '', $content);

        // drop the internal helper tags and collapse a doubled head tag
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = trim(
            str_replace(\array_keys($cleanup), \array_values($cleanup), $content)
        );

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity)
        );
    }
717
718
    /**
     * Return elements by ".class".
     *
     * The class name is prefixed with "." and passed to findMulti() as a selector.
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->findMulti('.' . $class);
    }
729
730
    /**
     * Return element by #id.
     *
     * The id is prefixed with "#" and passed to findOne() as a selector.
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->findOne('#' . $id);
    }
741
742
    /**
     * Return element by tag name.
     *
     * Only the first element with that tag name is used; if there is none,
     * a SimpleHtmlDomBlank is returned.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
759
760
    /**
     * Returns elements by "#id".
     *
     * The id is prefixed with "#" and passed to find() together with the index.
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->find('#' . $id, $idx);
    }
772
773
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole result list is returned (a SimpleHtmlDomNodeBlank
     * if nothing matched). With an index a single element is returned, or a
     * SimpleHtmlDomBlank if that position does not exist; a negative index
     * counts from the end of the list.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = count($elements) + $idx;
        }

        // return one element; a missing single element is the blank *element*
        // (as in find() and getElementByTagName()), not the blank node list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
808
809
    /**
     * Get dom node's outer html.
     *
     * Invokes static::$callback (if set) with an array containing this parser,
     * serializes the document and cleans the result up via fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string the html, or an empty string if serializing failed
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // without an <html> wrapper in the input only the root element is serialized
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
834
835
  /**
   * Load HTML from string.
   *
   * Empties self::$domBrokenReplaceHelper, replaces the current document with
   * the result of createDOMDocument() and returns this parser for chaining.
   *
   * @param string|null $html
   * @param int|null    $libXMLExtraOptions
   *
   * @return HtmlDomParser
   */
    public function loadHtml($html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);

        return $this;
    }
852
853
    /**
     * Load HTML from file.
     *
     * Paths that start with "http://" or "https://" (case-insensitive) skip
     * the local existence check; every other path must exist on disk. The
     * content is read with UTF8::file_get_contents() when the
     * voku\helper\UTF8 class is available and with \file_get_contents()
     * otherwise, then handed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException if the local file does not exist or the content cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $isHttpUrl = (bool) preg_match("/^https?:\/\//i", $filePath);
        if (!$isHttpUrl && !\file_exists($filePath)) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible to callers.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
893
894
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialized with saveXML(). Optionally the XML
     * declaration is removed (and leading whitespace trimmed). The result then
     * either goes through fixHtmlOutput() or, when $htmlToXml is false, through
     * decodeHtmlEntity() followed by putReplacedBackToPreserveHtmlEntities().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process with fixHtmlOutput()
     * @param bool $removeXmlHeader          true: strip the "<?xml ... ?>" declaration
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string the processed XML, or '' when saveXML() returns false
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $serialized = ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $serialized));
        }

        if (!$htmlToXml) {
            // Plain XML: only decode entities and put the preserved placeholders back.
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
929
930
    /**
931
     * @param string $selector
932
     * @param int    $idx
933
     *
934
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
935
     */
936 3
    public function __invoke($selector, $idx = null)
    {
        // Calling the parser like a function is shorthand for find().
        $found = $this->find($selector, $idx);

        return $found;
    }
940
941
    /**
     * Read accessor for the "created without head wrapper" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHeadWrapper
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
948
949
    /**
     * Read accessor for the "created without p-tag wrapper" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutPTagWrapper
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
956
957
    /**
     * Read accessor for the "created without html" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHtml
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
964
965
    /**
     * Read accessor for the "created without body wrapper" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutBodyWrapper
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
972
973
    /**
     * Read accessor for the "created without html wrapper" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutHtmlWrapper
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
980
981
    /**
     * Read accessor for the "created without wrapper" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithoutWrapper
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
988
989
    /**
     * Read accessor for the "created with fake end script" flag.
     *
     * @return bool current value of $isDOMDocumentCreatedWithFakeEndScript
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
996
997
    /**
     * Replace unmatched closing-tag fragments with placeholders so they can be
     * restored later.
     *
     * Pass 1 repeatedly rewrites the angle brackets of an element whose opening
     * and closing tag names match into placeholder tokens. Pass 2 repeatedly
     * looks for remaining closing-tag-like fragments, records each one in
     * self::$domBrokenReplaceHelper ('orig' = the fragment, 'tmp' = the
     * replacement token built from self::$domHtmlBrokenHtmlHelper and the
     * crc32 of the fragment) and substitutes the token. Finally all bracket
     * placeholders from pass 1 are turned back into real brackets.
     *
     * If PCRE reports an error (preg_replace_callback() returns null), the
     * affected pass stops and the markup produced so far is kept instead of
     * being replaced by an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Pass 1: mask the brackets of properly paired elements.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error: keep the current markup rather than wiping it.
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // Pass 2: swap every remaining closing-tag fragment for a hash token.
        do {
            $original = $html;

            $swapped = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $brackets) {
                    // The fragment may contain masked brackets from pass 1; store it with real ones.
                    $matches['broken'] = str_replace($placeholders, $brackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($swapped === null) {
                // PCRE error: keep the current markup rather than wiping it.
                break;
            }

            $html = $swapped;
        } while ($original !== $html);

        // Undo the bracket masking from pass 1.
        return str_replace($placeholders, $brackets, $html);
    }
1047
1048
    /**
     * Protect the content of special script tags from the html parser.
     *
     * Every "<script ... type=...>" element whose type matches one of
     * $this->specialScriptTags is rewritten in place:
     * - if its inner content contains one of
     *   $this->templateLogicSyntaxInSpecialScriptTags, the inner content is
     *   recorded in self::$domBrokenReplaceHelper and replaced by a token built
     *   from self::$domHtmlBrokenHtmlHelper and the crc32 of that content;
     * - otherwise the "script" tag name of the opening and closing tag is
     *   replaced by self::$domHtmlSpecialScriptHelper.
     * In both cases the "<\/" html5 fallback is turned back into "</".
     *
     * If PCRE reports an error (preg_replace_callback() returns null), $html is
     * left untouched instead of being replaced by an empty string.
     *
     * @param string $html modified by reference
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $tags = \implode('|', array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        $replaced = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) === false) {
                        continue;
                    }

                    // remove the html5 fallback
                    $matches['innerContent'] = str_replace('<\/', '</', $matches['innerContent']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                }

                // remove the html5 fallback
                $matches[0] = str_replace('<\/', '</', $matches[0]);

                // Swap the leading "<script" and the trailing "</script>" for the helper tag name.
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // null signals a PCRE error; keep the caller's markup in that case.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
1090
1091
    /**
     * Store the "keep broken html" switch on this parser.
     *
     * @param bool $keepBrokenHtml value assigned to $this->keepBrokenHtml
     *
     * @return HtmlDomParser this instance, so calls can be chained
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1102
1103
    /**
     * Replace the list of template-logic markers that are looked for inside
     * special script tags.
     *
     * The list is validated first; nothing is stored when validation fails.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser this instance, so calls can be chained
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1120
1121
    /**
     * Replace the list of script "type" values that are treated as special
     * script tags.
     *
     * The list is validated first; nothing is stored when validation fails.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser this instance, so calls can be chained
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        foreach ($specialScriptTags as $scriptType) {
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1138
}
1139