Completed
Push — master ( 236b50...7a7214 )
by Lars
02:07
created

getIsDOMDocumentCreatedWithoutBodyWrapper()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 2
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
77
78
    /**
79
     * @var bool
80
     */
81
    protected $keepBrokenHtml;
82
83
    /**
     * Set up an empty DOMDocument and optionally fill it from the given source.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code, a SimpleHtmlDomInterface
     *                                                             or a \DOMNode; null leaves the
     *                                                             document empty
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // a SimpleHtmlDomInterface is replaced by whatever its getNode() returns
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            // anything that is not a node is handed to the HTML loader
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep-import the node into this document before attaching it
        $imported = $this->document->importNode($source, true);

        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
114
115
    /**
     * Forward calls to legacy method names listed in self::$functionAliases.
     *
     * The name is lower-cased before the lookup, so aliases match case-insensitively.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
131
132
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The method name is matched case-insensitively (as PHP does for real methods
     * and as __call() does for the instance aliases).
     *
     * @param string $name
     * @param array  $arguments first entry: HTML string or file path (defaults to ''),
     *                          second entry: extra libxml options (defaults to null)
     *
     * @throws \BadMethodCallException if the name is neither of the two supported methods
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $method = \strtolower($name);

        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        if ($method === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        // include the requested name, matching the message format used by __call()
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
161
162
    /** @noinspection MagicMethodsValidityInspection */
163
164
    /**
     * Magic read access for the legacy properties; the name is compared lower-cased.
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to innerHtml()
     * and "text"/"plaintext" to text().
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
187
188
    /**
     * String conversion yields the result of html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
195
196
    /**
     * No-op kept only for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release
        return true;
    }
207
208
    /**
     * Create DOMDocument from HTML.
     *
     * Records in the isDOMDocumentCreated* flags which wrappers the input lacked
     * (fixHtmlOutput() uses them to strip what the parser adds), then parses the markup:
     * first via simplexml_load_string() and, if that fails or libxml reported errors,
     * via \DOMDocument::loadHTML().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed onto the default options
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // no markup at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // markup present, but the input starts with something else
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (\strpos($html, '<body') === false) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // only an escaped end-tag ("<\/script>") and no real "</script>" in the input
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and would raise
        // E_DEPRECATED on every parse, so it is only called on older versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            // wrap the input in the helper element so it has a single root
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: strict parsing, accepted only if libxml reported no errors
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // fallback: the HTML parser
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): haystack and needle look swapped here ('<?xml' is searched for
            // $html), so the prefix is added for practically every input. Left unchanged
            // because correcting it could alter encoding detection - confirm the intent.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
347
348
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and run against the current document.
     *
     * @param string   $selector
     * @param int|null $idx      null returns every match; otherwise the match at that
     *                           position, where a negative value counts from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $found = new SimpleHtmlDomNode();
        foreach ($matches as $match) {
            $found[] = new SimpleHtmlDom($match);
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($found) + $idx : $idx;

            return $found[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
    }
385
386
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
397
398
    /**
     * Find all nodes matching a CSS selector, or false when find() yields the blank list.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMultiOrFalse(string $selector)
    {
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
415
416
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
427
428
    /**
     * Find the first node matching a CSS selector, or false when find() yields the blank element.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $node = $this->find($selector, 0);

        return $node instanceof SimpleHtmlDomBlank ? false : $node;
    }
445
446
    /**
     * Remove the wrappers the parser added to the serialized output.
     *
     * Which tags are stripped depends on the isDOMDocumentCreated* flags. Afterwards the
     * helper wrapper tags and a few doubled tags are cleaned up, the result is trimmed,
     * passed through decodeHtmlEntity() and the preserved entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity forwarded to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // [flag, tags to drop] - applied in exactly this order
        /** @noinspection HtmlRequiredLangAttribute */
        /** @noinspection HtmlRequiredTitleElement */
        $stripSteps = [
            [$this->isDOMDocumentCreatedWithoutHtmlWrapper, ['<html>', '</html>']],
            [$this->isDOMDocumentCreatedWithoutHeadWrapper, ['<head>', '</head>']],
            [$this->isDOMDocumentCreatedWithoutBodyWrapper, ['<body>', '</body>']],
            [$this->isDOMDocumentCreatedWithFakeEndScript, ['</script>']],
        ];

        foreach ($stripSteps as list($enabled, $tags)) {
            if ($enabled) {
                $content = \str_replace($tags, '', $content);
            }
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // a leading "<p>" and every "</p>" are removed
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $content = \str_replace(
                [
                    '<p>',
                    '</p>',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                ],
                '',
                $content
            );
        }

        // search => replacement
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];

        $content = \str_replace(\array_keys($cleanup), \array_values($cleanup), $content);
        $content = \trim($content);

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
545
546
    /**
     * Return elements by ".class".
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
557
558
    /**
     * Return element by #id.
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
569
570
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface a SimpleHtmlDomBlank if the document has no such element
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($first);
    }
587
588
    /**
     * Returns elements by "#id".
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx forwarded to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
600
601
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null returns every match; otherwise the match at that
     *                       position, where a negative value counts from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a miss yields the blank *element* (not the blank list),
        // matching find() and getElementByTagName()
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
636
637
    /**
     * Get dom node's outer html.
     *
     * If a callback is set it is invoked first with [$this] as its argument. When the input
     * had no "<html" wrapper only the document element is serialized, otherwise the whole
     * document; the result then goes through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string if saveHTML() fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
662
663
    /**
664
     * Load HTML from string.
665
     *
666
     * @param string   $html
667
     * @param int|null $libXMLExtraOptions
668
     *
669
     * @return HtmlDomParser
670
     */
671 169
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
672
    {
673
        // reset
674 169
        self::$domBrokenReplaceHelper = [];
675
676 169
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);
677
678 169
        return $this;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this; (voku\helper\HtmlDomParser) is incompatible with the return type declared by the interface voku\helper\DomParserInterface::loadHtml of type self.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
679
    }
680
681
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" skip the file_exists() check. The content
     * is read with UTF8::file_get_contents() when that class exists, otherwise with
     * \file_get_contents(), and then passed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException if a local file does not exist or the content cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure reachable via getPrevious()
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
721
722
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml                true: run the result through fixHtmlOutput();
     *                                       false: only decode entities and restore the preserved ones
     * @param bool $removeXmlHeader          strip the "<?xml ... ?>" declaration
     * @param int  $options                  passed to \DOMDocument::saveXML()
     *
     * @return string empty string if saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on error; passing that on to the string-typed helpers
        // below would raise a TypeError under strict_types, so bail out like html() does
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            $return = $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        } else {
            $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            $return = self::putReplacedBackToPreserveHtmlEntities($xml);
        }

        return $return;
    }
754
755
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
765
766
    /**
     * Flag set by createDOMDocument() when the input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
773
774
    /**
     * Flag set by createDOMDocument() when the input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
781
782
    /**
     * Flag set by createDOMDocument() when the input contained no "<body".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
789
790
    /**
     * Flag set by createDOMDocument() when the input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
797
798
    /**
     * Flag set by createDOMDocument() when the input contained "<" but, after
     * left-trimming, did not start with it.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
805
806
    /**
     * Replace markup the parser would drop with tokens so it can be restored later.
     *
     * Two passes, each repeated until the string stops changing:
     *  1. the brackets of an opening tag and its same-named closing tag are swapped
     *     for "°…°" placeholders;
     *  2. runs of closing tags still written with real brackets are stored in
     *     self::$domBrokenReplaceHelper and replaced by a helper-prefix + crc32 token.
     * Finally the remaining placeholders are turned back into real brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // pass 1: mask element pairs
        $maskPair = static function ($matches) {
            $openTag = '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°';
            $closeTag = '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°';

            return $matches['start'] . $openTag . $matches['value'] . $closeTag . $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskPair,
                $html
            );
        } while ($before !== $html);

        // pass 2: stash what is still un-masked and leave a token in its place
        $stashBroken = static function ($matches) use ($placeholders, $brackets) {
            $broken = \str_replace($placeholders, $brackets, $matches['broken']);
            $token = self::$domHtmlBrokenHtmlHelper . \crc32($broken);

            self::$domBrokenReplaceHelper['orig'][] = $broken;
            self::$domBrokenReplaceHelper['tmp'][] = $token;

            return $matches['start'] . $token . $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $stashBroken,
                $html
            );
        } while ($before !== $html);

        // un-mask the pairs from pass 1
        return \str_replace($placeholders, $brackets, $html);
    }
856
857
    /**
     * Protect <script type="text/html"> / "text/x-custom-template" blocks from the parser.
     *
     * If the script content contains none of "+", "<%", "{%" or "{{", the script tag is
     * renamed to the special helper tag. Otherwise the content is stored in
     * self::$domBrokenReplaceHelper and replaced by a token between the original tags.
     *
     * @param string $html modified in place
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $pattern = '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU';

        $protect = static function ($matches) {
            $hasTemplateSyntax = false;
            foreach (['+', '<%', '{%', '{{'] as $marker) {
                if (\strpos($matches['innerContent'], $marker) !== false) {
                    $hasTemplateSyntax = true;

                    break;
                }
            }

            if ($hasTemplateSyntax) {
                // remove the html5 fallback
                $inner = \str_replace('<\/', '</', $matches['innerContent']);
                $token = self::$domHtmlBrokenHtmlHelper . \crc32($inner);

                self::$domBrokenReplaceHelper['orig'][] = $inner;
                self::$domBrokenReplaceHelper['tmp'][] = $token;

                return $matches['start'] . $token . $matches['end'];
            }

            // remove the html5 fallback
            $whole = \str_replace('<\/', '</', $matches[0]);

            // swap "<script" ... "</script>" for the helper tag name
            $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($whole, \strlen('<script'));

            return \substr($renamed, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
        };

        $html = (string) \preg_replace_callback($pattern, $protect, $html);
    }
896
897
    /**
     * Switch the keep-broken-HTML mode that createDOMDocument() checks on or off.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
908
}
909