Completed
Push — master ( fa1e09...061358 )
by Lars
02:27 queued 12s
created

HtmlDomParser::__toString()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
77
78
    /**
79
     * @var bool
80
     */
81
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
82
83
    /**
84
     * @var bool
85
     */
86
    protected $keepBrokenHtml;
87
88
    /**
     * Set up a fresh DOMDocument and optionally seed it with content.
     *
     * A SimpleHtmlDomInterface is first unwrapped via getNode(). A \DOMNode is then
     * deep-imported into the new document; any other non-null value is handed to
     * loadHtml(). With null the document simply stays empty.
     *
     * NOTE(review): a static-analysis pass flagged this constructor as duplicated
     * elsewhere in the project — consider extracting the shared logic.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source instanceof \DOMNode) {
            // Nodes belong to their owner document, so copy it (deep) into ours.
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
119
120
    /**
     * Dispatch legacy instance-method aliases (see self::$functionAliases).
     *
     * The lookup is case-insensitive: the requested name is lower-cased before it
     * is matched against the alias table.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);
        $target = self::$functionAliases[$alias] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
136
137
    /**
     * Dispatch the legacy static factories "str_get_html" and "file_get_html".
     *
     * The first argument (default '') is the HTML string or the file path, the
     * second (default null) is passed on as the extra libxml options. The name is
     * matched case-insensitively, mirroring __call(), because PHP itself treats
     * method names as case-insensitive.
     *
     * NOTE(review): a static-analysis pass flagged this method as duplicated
     * elsewhere in the project — consider extracting the shared logic.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not one of the two factories
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        switch (\strtolower($name)) {
            case 'str_get_html':
                $parser = new static();

                return $parser->loadHtml($source, $libXMLExtraOptions);

            case 'file_get_html':
                $parser = new static();

                return $parser->loadHtmlFile($source, $libXMLExtraOptions);
        }

        throw new \BadMethodCallException('Method does not exist');
    }
166
167
    /** @noinspection MagicMethodsValidityInspection */

    /**
     * Resolve the legacy read-only pseudo properties (case-insensitive).
     *
     * - "outerhtml" / "outertext" map to html()
     * - "innerhtml" / "innertext" map to innerHtml()
     * - "text" / "plaintext" map to text()
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
192
193
    /**
     * String conversion yields the same markup as html() with default arguments.
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
200
201
    /**
     * No-op kept only so code written against the old API keeps working.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // Nothing to release here; report success unconditionally.
        return true;
    }
212
213
    /**
     * Create DOMDocument from HTML.
     *
     * Processing order:
     *  1. Optionally pre-process broken HTML (when $this->keepBrokenHtml is truthy).
     *  2. Record which structural parts (<html>, <head>, <body>, <p>, any tag at
     *     all, a leading tag, a real "</script>") the input lacks, so that
     *     fixHtmlOutput() can strip whatever the parser adds on its own.
     *     These flags are only ever switched to true in this method.
     *  3. Give the script-tag helpers a chance to rewrite $html.
     *  4. Parse: first via simplexml_load_string() (used only if libxml reported
     *     no errors), otherwise via DOMDocument::loadHTML() with a temporary
     *     "<?xml encoding=...?>" prefix that is removed again afterwards.
     *  5. Restore the libxml error / entity-loader settings that were changed.
     *
     * NOTE(review): a static-analysis pass flagged the SimpleXML import and the
     * loadHTML fallback below as duplicated elsewhere in the project.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions additional LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // no tag at all -> plain text input
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // text before the first tag -> needs an artificial wrapper element
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (\strpos($html, '<html ') === false && \strpos($html, '<html>') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (\strpos($html, '<body ') === false && \strpos($html, '<body>') === false) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head ') === false && \strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (\strpos($html, '<p ') === false && \strpos($html, '<p>') === false) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped closing script tag ("<\/script>") is present, no real one.
        if (\strpos($html, '</script>') === false && \strpos($html, '<\/script>') !== false) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and raises
        // E_DEPRECATED there, so it is only toggled on older runtimes. Note that
        // LIBXML_NOENT is not part of the default options below and LIBXML_NONET is.
        $toggleEntityLoader = \PHP_VERSION_ID < 80000;
        $disableEntityLoader = false;
        if ($toggleEntityLoader) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper || $this->keepBrokenHtml) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict XML parsing, accepted only when libxml logged nothing.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;

            // NOTE(review): stripos() takes (haystack, needle), so this looks for
            // $html inside '<?xml' rather than the other way round and is therefore
            // true for practically every input. Left untouched on purpose: swapping
            // the arguments would stop forcing the encoding for input that brings
            // its own XML declaration — confirm the intended behaviour first.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($toggleEntityLoader) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
374
375
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the current
     * document. Without $idx the whole result set is returned (or a
     * SimpleHtmlDomNodeBlank when nothing matched). With $idx a single element is
     * returned (or a SimpleHtmlDomBlank when that position does not exist);
     * negative values count from the end of the result set.
     *
     * NOTE(review): a static-analysis pass flagged this method as duplicated
     * elsewhere in the project — consider extracting the shared logic.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);

        $matches = new SimpleHtmlDomNode();
        foreach ($nodesList as $domNode) {
            $matches[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
        }

        // handle negative values
        $position = $idx < 0 ? \count($matches) + $idx : $idx;

        // return one element
        return $matches[$position] ?? new SimpleHtmlDomBlank();
    }
412
413
    /**
     * Find nodes with a CSS selector.
     *
     * Thin wrapper around find() without an index, i.e. the full result set.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $nodes = $this->find($selector, null);

        return $nodes;
    }
424
425
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMultiOrFalse(string $selector)
    {
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
442
443
    /**
     * Find one node with a CSS selector.
     *
     * Thin wrapper around find() asking for the first match (index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
454
455
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
472
473
    /**
     * Undo the wrappers that were added around the original input while parsing.
     *
     * Depending on the flags recorded when the document was created, this strips
     * <html>, <head>, <body>, <p>, a closing </script> and the HTML 4.0
     * transitional doctype again, removes the internal <simpleHtmlDomP> helper
     * tag, collapses a few doubled tags, trims the result, decodes HTML entities
     * and finally restores the entities that were protected before parsing.
     * The order of the replacements is significant and must be kept.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // only a leading <p> is dropped, but every closing </p>
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $search = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
        $replace = ['', '', '<head>', '</head>', '<br>'];
        $content = \trim(\str_replace($search, $replace, $content));

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
582
583
    /**
     * Return elements by ".class".
     *
     * Builds the CSS selector ".<class>" and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated in PHP 8.2.
        return $this->findMulti('.' . $class);
    }
594
595
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#<id>" and delegates to findOne().
     *
     * @param string $id id value without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated in PHP 8.2.
        return $this->findOne('#' . $id);
    }
606
607
    /**
     * Return element by tag name.
     *
     * Only the first element with that tag name is considered; when the document
     * has none, a SimpleHtmlDomBlank is returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
624
625
    /**
     * Returns elements by "#id".
     *
     * Builds the CSS selector "#<id>" and delegates to find(), passing $idx through.
     *
     * @param string   $id  id value without the leading hash
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // Plain concatenation: the "${var}" interpolation form is deprecated in PHP 8.2.
        return $this->find('#' . $id, $idx);
    }
637
638
    /**
     * Returns elements by tag name.
     *
     * Without $idx the whole result set is returned (or a SimpleHtmlDomNodeBlank
     * when the document contains no such tag). With $idx a single element is
     * returned; negative values count from the end of the result set. When the
     * requested position does not exist, a SimpleHtmlDomBlank (the single-element
     * blank, same as find() uses for this case) is returned.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();
        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a miss yields the element blank, not the list blank
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
673
674
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked first, receiving an array
     * that contains this parser. The document is then serialised — only the
     * document element when the input had no <html> wrapper, the whole document
     * otherwise — and cleaned up via fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string if serialisation fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
699
700
    /**
     * Load HTML from string.
     *
     * Clears the shared broken-HTML replacement table, rebuilds the internal
     * document from $html and returns the parser itself for chaining.
     *
     * NOTE(review): a static-analysis pass reported that returning $this
     * (HtmlDomParser) does not match the "self" return type declared by
     * DomParserInterface::loadHtml — verify against the interface.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
717
718
    /**
     * Load HTML from a file path or an http(s) URL and parse it.
     *
     * A path that does not start with "http://" or "https://" must exist on
     * the local filesystem. The content is read with UTF8::file_get_contents()
     * when the class "\voku\helper\UTF8" is available, otherwise with the
     * native file_get_contents(), and is then passed on to loadHtml().
     *
     * NOTE(review): static analysis reports this method as duplicated
     * elsewhere in the project.
     *
     * @param string   $filePath           <p>Local path or http(s) URL.</p>
     * @param int|null $libXMLExtraOptions <p>Handed over unchanged to loadHtml().</p>
     *
     * @throws \RuntimeException if a local file does not exist or the content could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2, the message is unchanged
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // chain the original exception so the root cause stays visible to the caller
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // the readers signal failure with false instead of throwing
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
758
759
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialized with saveXML(). Depending on the flags the
     * XML declaration is stripped and the result is either run through
     * fixHtmlOutput() or only entity-decoded with the preserved entities put back.
     *
     * NOTE(review): static analysis reports this method as duplicated
     * elsewhere in the project.
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Handed over to fixHtmlOutput() / decodeHtmlEntity().</p>
     * @param bool $htmlToXml                <p>true: post-process via fixHtmlOutput(); false: decode entities only.</p>
     * @param bool $removeXmlHeader          <p>Strip the "&lt;?xml ... ?&gt;" declaration from the output.</p>
     * @param int  $options                  <p>libxml options passed to saveXML().</p>
     *
     * @return string <p>The serialized document, or an empty string if serialization failed.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on failure; mirror the saveHTML() failure handling
        // used earlier in this class instead of handing false to the string helpers
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            $return = $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        } else {
            $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            $return = self::putReplacedBackToPreserveHtmlEntities($xml);
        }

        return $return;
    }
791
792
    /**
     * Make the parser object callable: the call is forwarded to find().
     *
     * @param string   $selector <p>Selector handed over to find().</p>
     * @param int|null $idx      <p>Optional index handed over to find().</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
802
803
    /**
     * Read accessor for the "created without head wrapper" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHeadWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
810
811
    /**
     * Read accessor for the "created without p-tag wrapper" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutPTagWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
818
819
    /**
     * Read accessor for the "created without html" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHtml.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
826
827
    /**
     * Read accessor for the "created without body wrapper" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutBodyWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
834
835
    /**
     * Read accessor for the "created without html wrapper" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHtmlWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
842
843
    /**
     * Read accessor for the "created without wrapper" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
850
851
    /**
     * Read accessor for the "created with fake end script" flag of this parser.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithFakeEndScript.</p>
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
858
859
    /**
     * Swap stray closing tags in the given markup for placeholder tokens.
     *
     * Works in three steps:
     * 1. Every opening tag that has a closing tag of the same name gets its angle
     *    brackets replaced by marker strings, repeated until nothing changes.
     * 2. The closing tags still written with real brackets are replaced by a token
     *    built from self::$domHtmlBrokenHtmlHelper and a crc32 checksum; original
     *    and token are recorded in self::$domBrokenReplaceHelper.
     * 3. The marker strings from step 1 are turned back into real brackets.
     *
     * @param string $html <p>The markup to process.</p>
     *
     * @return string <p>The markup with the tokens in place.</p>
     */
    protected function keepBrokenHtml(string $html): string
    {
        $bracketMarkers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realBrackets = ['</', '<', '>'];

        // step 1: hide the brackets of properly paired tags, one pair per round
        while (true) {
            $beforeRound = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($m) {
                    $maskedOpen = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $maskedClose = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $m['start'] . $maskedOpen . $m['value'] . $maskedClose . $m['end'];
                },
                $html
            );

            if ($beforeRound === $html) {
                break;
            }
        }

        // step 2: whatever closing tag still uses real brackets is swapped for a token
        while (true) {
            $beforeRound = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($m) use ($bracketMarkers, $realBrackets) {
                    // the recorded original must carry real brackets, not the markers from step 1
                    $brokenPart = \str_replace($bracketMarkers, $realBrackets, $m['broken']);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($brokenPart);

                    self::$domBrokenReplaceHelper['orig'][] = $brokenPart;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $m['start'] . $token . $m['end'];
                },
                $html
            );

            if ($beforeRound === $html) {
                break;
            }
        }

        // step 3: give the paired tags their real brackets back
        return \str_replace($bracketMarkers, $realBrackets, $html);
    }
909
910
    /**
     * Rewrite script tags of type "text/html" or "text/x-custom-template" in place.
     *
     * If the script content contains one of the markers "+", "<%", "{%" or "{{",
     * the content is swapped for a token (self::$domHtmlBrokenHtmlHelper plus a
     * crc32 checksum) and recorded in self::$domBrokenReplaceHelper. Otherwise the
     * "script" tag name itself is exchanged for self::$domHtmlSpecialScriptHelper.
     * In both cases escaped closing tags ("<\/") are turned into "</".
     *
     * @param string $html <p>The markup to process, modified by reference.</p>
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $html = (string) \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            static function ($m) {
                $hasTemplateMarker = false;
                foreach (['+', '<%', '{%', '{{'] as $marker) {
                    if (\strpos($m['innerContent'], $marker) !== false) {
                        $hasTemplateMarker = true;

                        break;
                    }
                }

                if ($hasTemplateMarker) {
                    // remove the html5 fallback
                    $inner = \str_replace('<\/', '</', $m['innerContent']);
                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($inner);

                    self::$domBrokenReplaceHelper['orig'][] = $inner;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $m['start'] . $token . $m['end'];
                }

                // remove the html5 fallback
                $wholeTag = \str_replace('<\/', '</', $m[0]);

                // exchange the leading "<script" and the trailing "</script>" for the helper tag name
                $renamedOpen = '<' . self::$domHtmlSpecialScriptHelper . \substr($wholeTag, \strlen('<script'));

                return \substr($renamedOpen, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );
    }
947
948
    /**
     * Store the "keep broken html" switch on this parser (fluent setter).
     *
     * @param bool $keepBrokenHtml <p>New value for $this->keepBrokenHtml.</p>
     *
     * @return HtmlDomParser <p>This instance, to allow method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
959
}
960