Completed
Push — master ( 899793...d65cdf )
by Lars
02:01 queued 17s
created

HtmlDomParser::loadHtmlFile()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 30

Duplication

Lines 30
Ratio 100 %

Code Coverage

Tests 11
CRAP Score 6.1308

Importance

Changes 0
Metric Value
dl 30
loc 30
ccs 11
cts 13
cp 0.8462
rs 8.8177
c 0
b 0
f 0
cc 6
nc 8
nop 2
crap 6.1308
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $keepBrokenHtml;
77
78
    /**
     * Create a parser, optionally seeded with an existing node or an HTML string.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element
     *                                                    <p>HTML code, a SimpleHtmlDomInterface or a \DOMNode;
     *                                                    null leaves the document empty.</p>
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // unwrap our own element wrapper to the underlying node
        if ($element instanceof SimpleHtmlDomInterface) {
            $element = $element->getNode();
        }

        if (!($element instanceof \DOMNode)) {
            // anything that is neither a node nor null is handed to the HTML loader
            if ($element !== null) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->loadHtml($element);
            }

            return;
        }

        // deep-copy the foreign node into our own document
        $imported = $this->document->importNode($element, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
109
110
    /**
     * Dispatch legacy (case-insensitive) method aliases listed in self::$functionAliases.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);
        $target = self::$functionAliases[$name] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $name);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
126
127
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The first argument is the HTML string / file path (defaults to ''),
     * the second the optional extra libxml options (defaults to null).
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not one of the two supported helpers
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';

        $arguments1 = $arguments[1] ?? null;

        if ($name === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($name === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        // include the offending name, matching the message produced by __call()
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
156
157
    /** @noinspection MagicMethodsValidityInspection */
158
159
    /**
     * Magic read access for the legacy properties (names are case-insensitive).
     *
     * @param string $name
     *
     * @return string|null
     *                     <p>The rendered value, or null for an unknown property name.</p>
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
182
183
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
190
191
    /**
     * No-op, kept only for api-compatibility-reasons.
     *
     * @return bool
     *              <p>Always true.</p>
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // intentionally nothing to clean up
        return true;
    }
202
203
    /**
     * Create DOMDocument from HTML.
     *
     * Records which structural parts ("<", "<html", "<head>", real "</script>") are
     * missing from the input so the output can be cleaned up later, then parses the
     * markup: first via simplexml_load_string(), and if that fails or reports libxml
     * errors, via DOMDocument::loadHTML() with an "<?xml encoding" prefix.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *                                     <p>Extra LIBXML_* flags OR-ed into the default options.</p>
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // only escaped "<\/script>" end tags present, no real "</script>"
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): defined outside this view; presumably rewrites $html by reference — confirm
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only call it on older versions
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // wrap fragments in a helper element (stripped again in fixHtmlOutput())
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: strict XML parsing, accepted only if libxml reported no errors
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // fallback: tolerant HTML parsing
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the stripos() arguments look swapped (haystack/needle); as written the
            // condition is true for virtually every input, so the declaration is almost always
            // prepended. Left unchanged to preserve behaviour.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
338
339
    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string   $selector
     * @param int|null $idx
     *                           <p>null returns every match; otherwise the match at that position,
     *                           negative values counting from the end.</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $collection = new SimpleHtmlDomNode();
        foreach ($matches as $domNode) {
            $collection[] = new SimpleHtmlDom($domNode);
        }

        if ($idx !== null) {
            // a single element was requested: map negative positions, fall back to a blank element
            $position = $idx < 0 ? \count($collection) + $idx : $idx;

            return $collection[$position] ?? new SimpleHtmlDomBlank();
        }

        // all elements were requested: an empty result becomes a blank list
        return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
    }
376
377
    /**
     * Find all nodes matching a CSS selector.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
388
389
    /**
     * Find all nodes matching a CSS selector, or false if nothing matched.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMultiOrFalse(string $selector)
    {
        $allMatches = $this->find($selector, null);

        // find() signals "no match" with a blank list object
        return $allMatches instanceof SimpleHtmlDomNodeBlank ? false : $allMatches;
    }
406
407
    /**
     * Find the first node matching a CSS selector.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
418
419
    /**
     * Find the first node matching a CSS selector, or false if nothing matched.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        // find() signals "no match" with a blank element object
        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
436
437
    /**
     * Strip the wrapper markup that was not part of the original input from
     * serialized output, then decode entities and restore preserved ones.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $htmlWrapperTags = ['<body>', '</body>', '<html>', '</html>'];
            $content = \str_replace($htmlWrapperTags, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $headWrapperTags = ['<head>', '</head>'];
            $content = \str_replace($headWrapperTags, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // only a leading "<p>" is dropped, but every "</p>" is (the second pattern is not anchored)
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextArtifacts = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextArtifacts, '', $content);
        }

        // helper wrapper elements are removed, doubled tags are collapsed
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \str_replace(\array_keys($cleanup), \array_values($cleanup), $content);
        $content = \trim($content);

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
526
527
    /**
     * Return elements by .class.
     *
     * The class name is prefixed with "." and passed to findMulti() as a CSS selector.
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
        return $this->findMulti(".{$class}");
    }
538
539
    /**
     * Return element by #id.
     *
     * The id is prefixed with "#" and passed to findOne() as a CSS selector.
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
        return $this->findOne("#{$id}");
    }
550
551
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     *                                <p>A blank element if no such tag exists.</p>
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
568
569
    /**
     * Returns elements by #id.
     *
     * The id is prefixed with "#" and passed to find() as a CSS selector.
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
        return $this->find("#{$id}", $idx);
    }
581
582
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx
     *                       <p>null returns every match; otherwise the match at that position,
     *                       negative values counting from the end.</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a missing index yields a blank *element* (not a blank list),
        // matching what find() returns for a single-element lookup
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
617
618
    /**
     * Get dom node's outer html.
     *
     * If a static callback is set, it is invoked first with an array containing this parser.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     *                <p>Empty string if the document could not be serialized.</p>
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        // without an "<html" wrapper in the input only the document element is serialized
        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
643
644
    /**
     * Load HTML from string.
     *
     * Clears the static broken-html replacement helper before parsing.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $parsedDocument;

        return $this;
    }
661
662
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" skip the local existence check.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException if the file does not exist or its content could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            // use the optional UTF8 helper from the same namespace when it is available
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original exception as "previous" so the root cause is not lost
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
702
703
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml
     *                                       <p>true runs the result through fixHtmlOutput();
     *                                       false only decodes entities and restores preserved ones.</p>
     * @param bool $removeXmlHeader
     *                                       <p>Strip the "<?xml ... ?>" declaration from the output.</p>
     * @param int  $options
     *                                       <p>Options passed to DOMDocument::saveXML().</p>
     *
     * @return string
     *                <p>Empty string if the document could not be serialized.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on failure; under strict_types passing that on would be a
        // TypeError, so mirror html() and return an empty string instead
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            $return = $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        } else {
            $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            $return = self::putReplacedBackToPreserveHtmlEntities($xml);
        }

        return $return;
    }
735
736
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
746
747
    /**
     * Whether the parsed input contained no "<head>" tag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $withoutHeadWrapper = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $withoutHeadWrapper;
    }
754
755
    /**
     * Whether the parsed input contained no "<" at all (plain text).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

        return $withoutHtml;
    }
762
763
    /**
     * Whether the parsed input contained no "<html" tag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $withoutHtmlWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $withoutHtmlWrapper;
    }
770
771
    /**
     * Whether the parsed input contained markup but did not start with a tag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $withoutWrapper = $this->isDOMDocumentCreatedWithoutWrapper;

        return $withoutWrapper;
    }
778
779
    /**
     * Replace broken markup fragments with hash tokens so they can be restored later.
     *
     * Pass 1 masks every properly paired element with temporary bracket placeholders.
     * Pass 2 swaps what still matches the "broken" pattern for a token, recording the
     * original/token pair in self::$domBrokenReplaceHelper. Finally the placeholders
     * are turned back into real brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // pass 1: repeat until no further balanced pair is found
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($m) {
                    $maskedOpen = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $maskedClose = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $m['start'] . $maskedOpen . $m['value'] . $maskedClose . $m['end'];
                },
                $html
            );
        } while ($before !== $html);

        // pass 2: repeat until no further broken fragment is found
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($m) use ($placeholders, $brackets) {
                    // restore real brackets inside the fragment before it is stored
                    $fragment = \str_replace($placeholders, $brackets, $m['broken']);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($fragment);

                    self::$domBrokenReplaceHelper['orig'][] = $fragment;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $m['start'] . $token . $m['end'];
                },
                $html
            );
        } while ($before !== $html);

        return \str_replace($placeholders, $brackets, $html);
    }
829
830
    /**
     * Protect the content of template-like script tags
     * (type "text/html" or "text/x-custom-template") from the parser.
     *
     * Content containing "+", "<%", "{%" or "{{" is swapped for a hash token and recorded
     * in self::$domBrokenReplaceHelper; otherwise the script tag itself is renamed to the
     * special helper element.
     *
     * @param string $html <p>Modified in place.</p>
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $pattern = '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU';

        $html = (string) \preg_replace_callback(
            $pattern,
            static function ($matches) {
                $inner = $matches['innerContent'];

                $looksLikeTemplate = strpos($inner, '+') !== false
                                     || strpos($inner, '<%') !== false
                                     || strpos($inner, '{%') !== false
                                     || strpos($inner, '{{') !== false;

                if ($looksLikeTemplate) {
                    // remove the html5 fallback
                    $inner = \str_replace('<\/', '</', $inner);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($inner);

                    self::$domBrokenReplaceHelper['orig'][] = $inner;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $matches['start'] . $token . $matches['end'];
                }

                // remove the html5 fallback
                $whole = \str_replace('<\/', '</', $matches[0]);

                // swap the opening "<script" and the closing "</script>" for the helper element
                $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($whole, \strlen('<script'));

                return \substr($renamed, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );
    }
869
870
    /**
     * Enable or disable the broken-html preservation used when creating the document.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser
     *                       <p>This instance, for chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
881
}
882