Completed
Push — master ( e413a0...5eb7b7 )
by Lars
15:31
created

HtmlDomParser::findOneOrFalse()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 5
cts 5
cp 1
rs 9.9332
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
77
78
    /**
79
     * @var bool
80
     */
81
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
82
83
    /**
84
     * @var bool
85
     */
86
    protected $keepBrokenHtml;
87
88
    /**
     * Set up the internal DOMDocument and optionally fill it from $element.
     *
     * - A SimpleHtmlDomInterface is unwrapped to its node via getNode().
     * - A \DOMNode is deep-imported into the new document and appended.
     * - Any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $dom = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $dom->preserveWhiteSpace = true;
        $dom->formatOutput = true;

        $this->document = $dom;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source instanceof \DOMNode) {
            // deep copy, so the node belongs to this parser's own document
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
119
120
    /**
     * Forward a legacy method name (see self::$functionAliases) to the method it aliases.
     *
     * The lookup is done on the lower-cased name.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);
        $target = self::$functionAliases[$name] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $name);
        }

        return $this->{$target}(...$arguments);
    }
136
137
    /**
     * Static factory aliases: "str_get_html" and "file_get_html".
     *
     * The first argument is the HTML string / file path (default: ''), the
     * second one the optional extra libxml options (default: null).
     *
     * The name is matched case-insensitively, like in __call().
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not one of the two aliases
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        switch (\strtolower($name)) {
            case 'str_get_html':
                return (new static())->loadHtml($source, $libXMLExtraOptions);
            case 'file_get_html':
                return (new static())->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // report the offending name, as __call() does
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
166
167
    /** @noinspection MagicMethodsValidityInspection */

    /**
     * Magic read access for the legacy properties, matched on the lower-cased name:
     * "outerhtml" / "outertext" => html(), "innerhtml" / "innertext" => innerHtml(),
     * "text" / "plaintext" => text().
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
192
193
    /**
     * String conversion: same result as html() called without arguments.
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
200
201
    /**
     * No-op, kept only for api-compatibility-reasons.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
212
213
    /**
     * Create DOMDocument from HTML.
     *
     * Steps:
     * 1. Record which wrappers (html / head / body / p, ...) are missing in the
     *    input by setting the "isDOMDocumentCreated..." flags; fixHtmlOutput()
     *    reads them to strip the corresponding tags from the output again.
     * 2. Run the script-tag helpers if the input contains "<script".
     * 3. Try to parse the input via simplexml_load_string(); if that fails or
     *    reports libxml errors, fall back to DOMDocument::loadHTML() with a
     *    temporary "<?xml encoding=...?>" prefix.
     * 4. Restore the libxml error settings that were active before the call.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions additional libxml option flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only escaped end-tags ("<\/script>") in the input, no real "</script>"
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0,
        // so it is only touched on older versions
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: parse via SimpleXML, accepted only without any libxml error
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // second attempt: the HTML parser of DOMDocument
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // haystack first, needle second: prepend the declaration unless
            // the input already starts with "<?xml"
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
374
375
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath via SelectorConverter::toXPath() and
     * evaluated against the current document.
     *
     * @param string   $selector
     * @param int|null $idx null => all matches; otherwise the position of the
     *                      single match to return (negative values count from the end)
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     *                      <p>For $idx === null: the matches, or a SimpleHtmlDomNodeBlank
     *                      if there are none. Otherwise: the match at that position, or a
     *                      SimpleHtmlDomBlank if there is none.</p>
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for an expression it cannot evaluate;
        // treat that like "no match" instead of iterating over false
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
412
413
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector);

        return $matches;
    }
424
425
    /**
     * Find all nodes matching a CSS selector; false instead of the blank
     * node list if nothing is found.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
442
443
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
454
455
    /**
     * Find the first node matching a CSS selector; false instead of the blank
     * element if nothing is found.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
472
473
    /**
     * Clean up serialized HTML: strip the tags that match the
     * "isDOMDocumentCreated..." flags, remove the internal helper tags,
     * then decode the html-entities and put the preserved ones back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        $output = $content;

        // INFO: DOMDocument will encapsulate plaintext into e.g. a paragraph tag (<p>),
        //          so the tags flagged as "not in the input" are removed again here ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $output = \str_replace(['<html>', '</html>'], '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $output = \str_replace(['<head>', '</head>'], '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $output = \str_replace(['<body>', '</body>'], '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $output = \str_replace('</script>', '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // only a leading "<p>" is removed, but every "</p>"
            $output = (string) \preg_replace('/^<p>/', '', $output);
            $output = (string) \preg_replace('/<\/p>/', '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $output = \str_replace(['<p>', '</p>'], '', $output);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $output = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $output
            );
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $output = \trim(
            \str_replace(\array_keys($cleanup), \array_values($cleanup), $output)
        );

        $output = $this->decodeHtmlEntity($output, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($output);
    }
581
582
    /**
     * Return elements by ".class".
     *
     * Builds the selector "." + $class and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
593
594
    /**
     * Return element by #id.
     *
     * Builds the selector "#" + $id and delegates to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
605
606
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface a SimpleHtmlDomBlank if the document has no such element
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstMatch);
    }
623
624
    /**
     * Returns elements by "#id".
     *
     * Builds the selector "#" + $id and delegates to find().
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx passed through to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
636
637
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx null => all matches; otherwise the position of the
     *                      single match to return (negative values count from the end)
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     *                      <p>For $idx === null: the matches, or a SimpleHtmlDomNodeBlank
     *                      if there are none. Otherwise: the match at that position, or a
     *                      SimpleHtmlDomBlank if there is none.</p>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: a miss yields the blank *element* (as in find()),
        // not the blank node list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
672
673
    /**
     * Get dom node's outer html.
     *
     * If a static callback is set, it is invoked first. The document is then
     * serialized (only the document element if the input had no html-wrapper)
     * and the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string if the serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            // NOTE(review): the callback receives ONE argument, an array holding this parser
            //               - confirm that this is intended (vs. passing the parser itself)
            \call_user_func(static::$callback, [$this]);
        }

        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        if ($markup === false) {
            return '';
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
698
699
    /**
700
     * Load HTML from string.
701
     *
702
     * @param string   $html
703
     * @param int|null $libXMLExtraOptions
704
     *
705
     * @return HtmlDomParser
706
     */
707
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
708
    {
709
        // reset
710 171
        self::$domBrokenReplaceHelper = [];
711
712 171
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);
713
714 171
        return $this;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this; (voku\helper\HtmlDomParser) is incompatible with the return type declared by the interface voku\helper\DomParserInterface::loadHtml of type self.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or an abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
715
    }
716
717
    /**
     * Load HTML from a file path or an http(s) URL and parse it.
     *
     * The existence check is skipped for inputs that start with "http://" or
     * "https://"; every other input must exist according to file_exists().
     * When the class \voku\helper\UTF8 is available its file_get_contents()
     * is used, otherwise the native file_get_contents() is used.
     *
     * NOTE(review): static analysis flags this method as duplicated elsewhere
     * in the project; consider extracting the shared file-loading logic.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter form is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Keep the original exception reachable via getPrevious().
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
757
758
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialized with DOMDocument::saveXML(). If that call
     * fails (returns false) an empty string is returned, mirroring how a
     * failed saveHTML() is handled elsewhere in this class.
     *
     * NOTE(review): static analysis flags this method as duplicated elsewhere
     * in the project; consider extracting the shared logic.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process via fixHtmlOutput();
     *                                       false: only decode entities and restore replaced content
     * @param bool $removeXmlHeader          strip the "<?xml ... ?>" declaration and leading whitespace
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on failure; do not pass that on to string-only helpers.
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($xml);
    }
790
791
    /**
     * Allow the parser object to be called like a function.
     *
     * Simply forwards both arguments to find() and hands back its result.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
801
802
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input without a head element - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
809
810
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input without a wrapping p element - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
817
818
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input that contains no HTML markup - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
825
826
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input without a body element - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
833
834
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input without an html root element - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
841
842
    /**
     * Expose the internal "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input without any wrapping element - confirm where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
849
850
    /**
     * Expose the internal "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * NOTE(review): presumably set while the DOM document is created from
     * input whose closing script tag had to be handled specially - confirm
     * where the property is assigned.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
857
858
    /**
     * Protect "broken" closing-tag fragments from being altered by the DOM parser.
     *
     * Works in three steps:
     *  1. Repeatedly rewrite matching open/close tag pairs so that their angle
     *     brackets become temporary placeholders, until nothing changes anymore.
     *  2. Repeatedly look for remaining closing-tag-like fragments, remember
     *     each one in self::$domBrokenReplaceHelper ('orig' => fragment,
     *     'tmp' => token) and replace it with that token, which is built from
     *     self::$domHtmlBrokenHtmlHelper and the crc32 of the fragment.
     *  3. Turn the remaining placeholders back into real angle brackets.
     *
     * If PCRE fails (preg_replace_callback() returns null, e.g. when a
     * backtrack limit is hit) the last good string is kept instead of
     * collapsing the whole document to an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // step 1: mask well-formed tag pairs
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error: masking is incomplete, so step 2 would misjudge regular
                // closing tags. Undo the placeholders and return the markup unprotected.
                return \str_replace($placeholders, $brackets, $html);
            }

            $html = $masked;
        } while ($original !== $html);

        // step 2: swap the remaining (broken) fragments for tokens
        do {
            $original = $html;

            $tokenized = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $brackets) {
                    $matches['broken'] = \str_replace($placeholders, $brackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($tokenized === null) {
                // PCRE error: keep what was processed so far.
                break;
            }

            $html = $tokenized;
        } while ($original !== $html);

        // step 3: restore the real angle brackets
        return \str_replace($placeholders, $brackets, $html);
    }
908
909
    /**
     * Protect the content of template-like script tags, in place.
     *
     * Targets script elements whose type attribute is "text/html" or
     * "text/x-custom-template", e.g.
     * [<script id="elements-image-1" type="text/html">...</script>].
     *
     * - If the inner content contains none of "+", "<%", "{%" or "{{", the
     *   script element is renamed to the tag stored in
     *   self::$domHtmlSpecialScriptHelper.
     * - Otherwise the inner content is remembered in
     *   self::$domBrokenReplaceHelper and replaced by a token built from
     *   self::$domHtmlBrokenHtmlHelper and the crc32 of the content.
     *
     * In both cases the escaped closing sequence "<\/" is turned back into "</".
     *
     * If PCRE fails (preg_replace_callback() returns null) the input is left
     * untouched instead of being replaced by an empty string.
     *
     * @param string $html passed by reference and modified in place
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $result = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            static function ($matches) {
                $hasTemplateSyntax = false;
                foreach (['+', '<%', '{%', '{{'] as $needle) {
                    if (\strpos($matches['innerContent'], $needle) !== false) {
                        $hasTemplateSyntax = true;

                        break;
                    }
                }

                if (!$hasTemplateSyntax) {
                    // remove the html5 fallback
                    $matches[0] = \str_replace('<\/', '</', $matches[0]);

                    // swap the opening "<script" and the closing "</script>" for the helper tag
                    $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                    return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
                }

                // remove the html5 fallback
                $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '' . self::$domHtmlBrokenHtmlHelper . '' . \crc32($matches['innerContent']);

                return $matches['start'] . $matchesHash . $matches['end'];
            },
            $html
        );

        // null signals a PCRE error; keep the caller's markup in that case.
        if ($result !== null) {
            $html = $result;
        }
    }
946
947
    /**
     * Store the "keep broken HTML" switch on this parser.
     *
     * Only assigns the given value to $this->keepBrokenHtml; returns the
     * parser itself so that calls can be chained.
     *
     * @param bool $keepBrokenHtml new value of the switch
     *
     * @return HtmlDomParser
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
958
}
959