Completed
Push — master ( 4c12d1...58ce77 )
by Lars
02:39 queued 13s
created

HtmlDomParser::useKeepBrokenHtml()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
77
78
    /**
79
     * @var bool
80
     */
81
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
82
83
    /**
84
     * @var bool
85
     */
86
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
87
88
    /**
89
     * @var bool
90
     */
91
    protected $keepBrokenHtml;
92
93
    /**
94
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
95
     */
96 208 View Code Duplication
    public function __construct($element = null)
    {
        // Always start with a fresh, empty document in the parser's encoding.
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // A wrapped element is reduced to its underlying node first, so that
        // the \DOMNode branch below handles both input kinds.
        if ($element instanceof SimpleHtmlDomInterface) {
            $element = $element->getNode();
        }

        if ($element instanceof \DOMNode) {
            // Deep-import the node into our own document before attaching it.
            $importedNode = $this->document->importNode($element, true);

            if ($importedNode instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($importedNode);
            }

            return;
        }

        // Nothing to load: keep the empty document.
        if ($element === null) {
            return;
        }

        // Anything else is handed to loadHtml() as markup.
        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($element);
    }
124
125
    /**
126
     * @param string $name
127
     * @param array  $arguments
128
     *
129
     * @return bool|mixed
130
     */
131 75
    public function __call($name, $arguments)
    {
        // Alias lookup is case-insensitive: the alias table uses lower-case keys.
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        // Forward the call to the real method registered for this alias.
        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
141
142
    /**
143
     * @param string $name
144
     * @param array  $arguments
145
     *
146
     * @throws \BadMethodCallException
147
     * @throws \RuntimeException
148
     *
149
     * @return HtmlDomParser
150
     */
151 24 View Code Duplication
    public static function __callStatic($name, $arguments)
    {
        // First argument: the markup or the file path (defaults to an empty string).
        $source = $arguments[0] ?? '';

        // Second argument: optional extra libxml options.
        $libXMLExtraOptions = $arguments[1] ?? null;

        switch ($name) {
            case 'str_get_html':
                $parser = new static();

                return $parser->loadHtml($source, $libXMLExtraOptions);

            case 'file_get_html':
                $parser = new static();

                return $parser->loadHtmlFile($source, $libXMLExtraOptions);
        }

        // Only the two static aliases above are supported.
        throw new \BadMethodCallException('Method does not exist');
    }
171
172
    /** @noinspection MagicMethodsValidityInspection */
173
174
    /**
175
     * @param string $name
176
     *
177
     * @return string|null
178
     */
179 15
    public function __get($name)
    {
        // Property names are matched case-insensitively.
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        // Unknown magic properties yield null instead of raising an error.
        return null;
    }
197
198
    /**
199
     * @return string
200
     */
201 19
    public function __toString()
    {
        // The string form of the parser is its outer html.
        $outerHtml = $this->html();

        return $outerHtml;
    }
205
206
    /**
207
     * does nothing (only for api-compatibility-reasons)
208
     *
209
     * @return bool
210
     *
211
     * @deprecated
212
     */
213 6
    public function clear(): bool
    {
        // Intentionally a no-op: kept only for api-compatibility, always reports success.
        return true;
    }
217
218
    /**
219
     * Create DOMDocument from HTML.
220
     *
221
     * @param string   $html
222
     * @param int|null $libXMLExtraOptions
223
     *
224
     * @return \DOMDocument
225
     */
226 193
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Optional pre-processing so that broken markup survives the parser.
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // ------------------------------------------------------------------
        // Remember what the *input* looked like (set-only flags on $this).
        // ------------------------------------------------------------------

        $leftTrimmedHtml = \ltrim($html);

        if (\strpos($html, '<') === false) {
            // no tag at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos($leftTrimmedHtml, '<') !== 0) {
            // there is markup, but the input does not start with a tag
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($leftTrimmedHtml, '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        // True when the input contains "<tag " or "<tag>" (case-sensitive).
        $containsTag = static function (string $tagName) use ($html): bool {
            return \strpos($html, '<' . $tagName . ' ') !== false
                   ||
                   \strpos($html, '<' . $tagName . '>') !== false;
        };

        if (!$containsTag('html')) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (!$containsTag('body')) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        if (!$containsTag('head')) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (!$containsTag('p')) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped closing script tag ("<\/script>") is present.
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // ------------------------------------------------------------------
        // Special handling for <script> content.
        // ------------------------------------------------------------------

        if (\strpos($html, '<script') !== false) {
            // Return value is unused; presumably $html is taken by reference
            // and rewritten in place -- TODO confirm against the helper.
            $this->html5FallbackForScriptTags($html);

            $templateTypeMarkers = [
                'type="text/html"',
                'type=\'text/html\'',
                'type=text/html',
                'type="text/x-custom-template"',
                'type=\'text/x-custom-template\'',
                'type=text/x-custom-template',
            ];

            // One matching marker is enough to trigger the special handling.
            foreach ($templateTypeMarkers as $marker) {
                if (\strpos($html, $marker) !== false) {
                    $this->keepSpecialScriptTags($html);

                    break;
                }
            }
        }

        // ------------------------------------------------------------------
        // libxml setup (previous settings are restored before returning).
        // ------------------------------------------------------------------

        $internalErrors = \libxml_use_internal_errors(true);
        $disableEntityLoader = \libxml_disable_entity_loader(true);
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        // These flags are only added when the running libxml build defines them.
        foreach (['LIBXML_BIGLINES', 'LIBXML_COMPACT', 'LIBXML_HTML_NODEFDTD'] as $optionalFlag) {
            if (\defined($optionalFlag)) {
                $optionsXml |= \constant($optionalFlag);
            }
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Wrap the input into the helper element when it has no usable root.
        $needsHelperWrapper = $this->isDOMDocumentCreatedWithoutWrapper
                              ||
                              $this->isDOMDocumentCreatedWithCommentWrapper
                              ||
                              $this->keepBrokenHtml;
        if ($needsHelperWrapper) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // ------------------------------------------------------------------
        // 1st try: strict XML parsing -- only accepted when libxml reported
        //          no error at all.
        // ------------------------------------------------------------------

        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // ------------------------------------------------------------------
        // 2nd try: tolerant HTML parsing.
        // ------------------------------------------------------------------

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;

            // NOTE(review): haystack and needle look swapped here ('<?xml' is
            // searched for $html), so the hack is applied for practically every
            // input. Kept unchanged to preserve behavior -- confirm the intent.
            /** @noinspection StringFragmentMisplacedInspection */
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack (first processing instruction only)
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        \libxml_disable_entity_loader($disableEntityLoader);

        return $this->document;
    }
385
386
    /**
387
     * Find list of nodes with a CSS selector.
388
     *
389
     * @param string   $selector
390
     * @param int|null $idx
391
     *
392
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
393
     */
394 142 View Code Duplication
    public function find(string $selector, $idx = null)
    {
        // Translate the CSS selector and run it as an XPath query on the document.
        $xPathQuery = SelectorConverter::toXPath($selector);
        $nodesList = (new \DOMXPath($this->document))->query($xPathQuery);

        // Wrap every matched node; a failed query (false) leaves the list empty.
        $elements = new SimpleHtmlDomNode();
        if ($nodesList) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // No index given: return all elements (or the blank list if nothing matched).
        if ($idx === null) {
            return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
        }

        // A negative index counts from the end of the list.
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // Return one element, or the blank element when the index does not exist.
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
425
426
    /**
427
     * Find nodes with a CSS selector.
428
     *
429
     * @param string $selector
430
     *
431
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
432
     */
433 12
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        // A null index makes find() return the whole list of matches.
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
437
438
    /**
439
     * Find nodes with a CSS selector or false, if no element is found.
440
     *
441
     * @param string $selector
442
     *
443
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type false|SimpleHtmlDomInter...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 57. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
444
     */
445 3
    public function findMultiOrFalse(string $selector)
    {
        $allMatches = $this->find($selector, null);

        // find() signals "nothing matched" with the blank list; report that as false.
        return $allMatches instanceof SimpleHtmlDomNodeBlank ? false : $allMatches;
    }
455
456
    /**
457
     * Find one node with a CSS selector.
458
     *
459
     * @param string $selector
460
     *
461
     * @return SimpleHtmlDomInterface
462
     */
463 32
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        // Index 0 makes find() return only the first match.
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
467
468
    /**
469
     * Find one node with a CSS selector or false, if no element is found.
470
     *
471
     * @param string $selector
472
     *
473
     * @return false|SimpleHtmlDomInterface
474
     */
475 5
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        // find() signals "no such element" with the blank element; report that as false.
        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
485
486
    /**
487
     * @param string $content
488
     * @param bool   $multiDecodeNewHtmlEntity
489
     *
490
     * @return string
491
     */
492 119
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...
        //
        // The steps below run in a fixed order; each one strips markup depending
        // on one of the "created without ..." indicators of this parser.

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Only a leading "<p>" is removed here, but every "</p>".
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // Drop the internal helper tags and collapse doubled head / br markup.
        // The map is applied in the listed order.
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];
        $content = \trim(
            \str_replace(
                \array_keys($cleanupMap),
                \array_values($cleanupMap),
                $content
            )
        );

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
598
599
    /**
600
     * Return elements by ".class".
601
     *
602
     * @param string $class
603
     *
604
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
605
     */
606
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // Build the ".class" selector by concatenation: the "${var}" string
        // interpolation form is deprecated as of PHP 8.2.
        return $this->findMulti('.' . $class);
    }
610
611
    /**
612
     * Return element by #id.
613
     *
614
     * @param string $id
615
     *
616
     * @return SimpleHtmlDomInterface
617
     */
618 3
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // Build the "#id" selector by concatenation: the "${var}" string
        // interpolation form is deprecated as of PHP 8.2.
        return $this->findOne('#' . $id);
    }
622
623
    /**
624
     * Return element by tag name.
625
     *
626
     * @param string $name
627
     *
628
     * @return SimpleHtmlDomInterface
629
     */
630 1
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        // Only the first element with this tag name is of interest.
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        // No such element: answer with the blank element.
        return $firstMatch === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstMatch);
    }
640
641
    /**
642
     * Returns elements by "#id".
643
     *
644
     * @param string   $id
645
     * @param int|null $idx
646
     *
647
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
648
     */
649
    public function getElementsById(string $id, $idx = null)
    {
        // Build the "#id" selector by concatenation: the "${var}" string
        // interpolation form is deprecated as of PHP 8.2.
        return $this->find('#' . $id, $idx);
    }
653
654
    /**
655
     * Returns elements by tag name.
656
     *
657
     * @param string   $name
658
     * @param int|null $idx
659
     *
660
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
661
     */
662 6
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        // Wrap every matched node.
        $elements = new SimpleHtmlDomNode();
        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements (or the blank list if nothing matched)
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values: count from the end of the list
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        //
        // A single element was requested, so a miss is answered with the blank
        // *element* (SimpleHtmlDomBlank), as find() and getElementByTagName() do,
        // instead of the blank *list* (SimpleHtmlDomNodeBlank).
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
689
690
    /**
691
     * Get dom node's outer html.
692
     *
693
     * @param bool $multiDecodeNewHtmlEntity
694
     *
695
     * @return string
696
     */
697 86
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        // Run the registered callback, if any, before serializing.
        // Note: it receives one argument, an array containing this parser.
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // Without an <html> wrapper in the input only the document element is
        // serialized; otherwise the whole document is.
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        // saveHTML() reports failure with false; answer with an empty string then.
        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
715
716
    /**
717
     * Load HTML from string.
718
     *
719
     * @param string   $html
720
     * @param int|null $libXMLExtraOptions
721
     *
722
     * @return HtmlDomParser
723
     */
724 193
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset the (static) helper list before a new document is built
        self::$domBrokenReplaceHelper = [];

        // Build the document from the markup and keep it as the current one.
        $newDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $newDocument;

        // Fluent interface.
        return $this;
    }
733
734
    /**
735
     * Load HTML from file.
736
     *
737
     * @param string   $filePath
738
     * @param int|null $libXMLExtraOptions
739
     *
740
     * @throws \RuntimeException
741
     *
742
     * @return HtmlDomParser
743
     */
744 11 View Code Duplication
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // Reset the static replace-helper before a new document is loaded.
        self::$domBrokenReplaceHelper = [];

        // Anything that does not start with "http://" or "https://" must exist
        // as a file; http(s) locations are not checked up front.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            // Use the UTF8 helper class for reading when it is available,
            // otherwise fall back to the native function.
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Attach the caught exception as "previous" so the root cause stays visible.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // The read result is false when the content could not be loaded.
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
774
775
    /**
776
     * Get the HTML as XML or plain XML if needed.
777
     *
778
     * @param bool $multiDecodeNewHtmlEntity
779
     * @param bool $htmlToXml
780
     * @param bool $removeXmlHeader
781
     * @param int  $options
782
     *
783
     * @return string
784
     */
785 2 View Code Duplication
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        // Serialize the whole document; a failed serialization yields an empty string.
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // Strip "<?xml ... ?>" and any whitespace left at the beginning.
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if (!$htmlToXml) {
            // Plain XML: only decode entities and put the preserved replacements back.
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        // HTML as XML: run the serialized string through the regular HTML output fix-up.
        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
810
811
    /**
812
     * @param string $selector
813
     * @param int    $idx
814
     *
815
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
816
     */
817 3
    public function __invoke($selector, $idx = null)
    {
        // Calling the parser object like a function forwards to find().
        $found = $this->find($selector, $idx);

        return $found;
    }
821
822
    /**
823
     * @return bool
824
     */
825 119
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $flag;
    }
829
830
    /**
831
     * @return bool
832
     */
833 119
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutPTagWrapper;

        return $flag;
    }
837
838
    /**
839
     * @return bool
840
     */
841 119
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutHtml;

        return $flag;
    }
845
846
    /**
847
     * @return bool
848
     */
849 119
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutBodyWrapper;

        return $flag;
    }
853
854
    /**
855
     * @return bool
856
     */
857 119
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $flag;
    }
861
862
    /**
863
     * @return bool
864
     */
865 119
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithoutWrapper;

        return $flag;
    }
869
870
    /**
871
     * @return bool
872
     */
873 119
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        // Read-only access to the instance flag of the same name.
        $flag = $this->isDOMDocumentCreatedWithFakeEndScript;

        return $flag;
    }
877
878
    /**
879
     * @param string $html
880
     *
881
     * @return string
882
     */
883 3
    protected function keepBrokenHtml(string $html): string
    {
        // Step 1: mask every element that has an opening tag and a closing tag
        // of the same name (back-reference \2) by swapping its "<", "</" and ">"
        // for placeholder strings. Repeated until a pass changes nothing.
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            if ($replaced === null) {
                // preg_replace_callback() returns null on a PCRE error; casting that
                // to string would wipe the whole document, so keep the current markup.
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // Step 2: closing tags that are still written with real angle brackets were
        // not masked in step 1. Each such run is stored in the static replace-helper
        // ("orig") and swapped for a token ("tmp") built from the helper prefix and
        // a crc32 of the stored fragment.
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) {
                    // The stored fragment must contain real markup, not placeholders.
                    $matches['broken'] = \str_replace(
                        ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
                        ['</', '<', '>'],
                        $matches['broken']
                    );

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($replaced === null) {
                // Same guard as above: never replace the markup with '' on a PCRE error.
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // Step 3: turn the placeholders from step 1 back into real angle brackets.
        return \str_replace(
            ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
            ['</', '<', '>'],
            $html
        );
    }
928
929
    /**
930
     * @param string $html
931
     *
932
     * @return void
933
     */
934 2
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $replaced = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            static function ($matches) {
                if (
                    \strpos($matches['innerContent'], '+') === false
                    &&
                    \strpos($matches['innerContent'], '<%') === false
                    &&
                    \strpos($matches['innerContent'], '{%') === false
                    &&
                    \strpos($matches['innerContent'], '{{') === false
                ) {
                    // None of the markers "+", "<%", "{%" or "{{" occurs in the content:
                    // rename the <script> element to the special helper tag name.

                    // remove the html5 fallback
                    $matches[0] = \str_replace('<\/', '</', $matches[0]);

                    $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                    return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
                }

                // Otherwise the inner content is stored in the static replace-helper
                // ("orig") and a token ("tmp") takes its place between the script tags.

                // remove the html5 fallback
                $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                return $matches['start'] . $matchesHash . $matches['end'];
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error; in that case the
        // caller's markup is left untouched instead of being overwritten with ''.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
968
969
    /**
970
     * @param bool $keepBrokenHtml
971
     *
972
     * @return HtmlDomParser
973
     */
974 3
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        // Store the requested setting on the instance.
        $this->keepBrokenHtml = $keepBrokenHtml;

        // Return the parser itself so further calls can be chained.
        return $this;
    }
980
}
981