Completed
Pull Request — master (#55)
by Volodymyr
01:53
created

HtmlDomParser::xml()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 25

Duplication

Lines 25
Ratio 100 %

Code Coverage

Tests 8
CRAP Score 4.3244

Importance

Changes 0
Metric Value
cc 4
nc 5
nop 4
dl 25
loc 25
ccs 8
cts 11
cp 0.7272
crap 4.3244
rs 9.52
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * The properties specified for each special script tag is an array of the following format:
60
     * string script tag => regex script tag
61
     *
62
     * ```php
63
     * protected $specialScriptTags = [
64
     *     'text/html' => 'text\/html',
65
     *     'text/x-custom-template' => 'text\/x-custom-template',
66
     *     'text/x-handlebars-template' => 'text\/x-handlebars-template'
67
     * ]
68
     * ```
69
     *
70
     * @var string[]
71
     */
72
    protected $specialScriptTags = [
73
        'text/html' => 'text\/html',
74
        'text/x-custom-template' => 'text\/x-custom-template',
75
        'text/x-handlebars-template' => 'text\/x-handlebars-template'
76
    ];
77
78
    /**
79
     * @var bool
80
     */
81
    protected $isDOMDocumentCreatedWithoutHtml = false;
82
83
    /**
84
     * @var bool
85
     */
86
    protected $isDOMDocumentCreatedWithoutWrapper = false;
87
88
    /**
89
     * @var bool
90
     */
91
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
92
93
    /**
94
     * @var bool
95
     */
96
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
97
98
    /**
99
     * @var bool
100
     */
101
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
102
103
    /**
104
     * @var bool
105
     */
106
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
107
108
    /**
109
     * @var bool
110
     */
111
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
112
113
    /**
114
     * @var bool
115
     */
116
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
117
118
    /**
119
     * @var bool
120
     */
121
    protected $keepBrokenHtml;
122
123
    /**
     * Build a parser around a fresh DOMDocument and optionally fill it.
     *
     * - A SimpleHtmlDomInterface is first reduced to its underlying node via getNode().
     * - A \DOMNode is deep-imported into the new document and appended to it.
     * - Any other non-null value is handed to loadHtml().
     * - null leaves the document empty.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;

        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep copy, so the new document owns its own node tree
        $imported = $document->importNode($source, true);

        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $document->appendChild($imported);
        }
    }
154
155
    /**
     * Dispatch a legacy instance-method alias (see self::$functionAliases).
     *
     * The lookup is case-insensitive: the requested name is lower-cased first.
     *
     * @param string $name      Called method name
     * @param array  $arguments Arguments forwarded to the aliased method
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
171
172
    /**
     * Dispatch the legacy static factories "str_get_html" and "file_get_html".
     *
     * The first argument (default: empty string) is the HTML string or the file path,
     * the second (default: null) is passed on as extra libxml options.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if $name is neither of the two factories
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXmlOptions = $arguments[1] ?? null;

        // static factory name => instance method doing the work
        $loaders = [
            'str_get_html'  => 'loadHtml',
            'file_get_html' => 'loadHtmlFile',
        ];

        // is_string() keeps the match strict (no array-key casting of non-string names)
        if (!\is_string($name) || !isset($loaders[$name])) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $parser = new static();

        return $parser->{$loaders[$name]}($source, $libXmlOptions);
    }
201
202
    /** @noinspection MagicMethodsValidityInspection */
203
204
    /**
     * Resolve the legacy read-only properties (matched case-insensitively).
     *
     * - "outerhtml" / "outertext" => html()
     * - "innerhtml" / "innertext" => innerHtml()
     * - "text" / "plaintext"      => text()
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if (\in_array($property, ['outerhtml', 'outertext'], true)) {
            return $this->html();
        }

        if (\in_array($property, ['innerhtml', 'innertext'], true)) {
            return $this->innerHtml();
        }

        if (\in_array($property, ['text', 'plaintext'], true)) {
            return $this->text();
        }

        return null;
    }
227
228
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
235
236
    /**
     * No-op that always reports success; kept only for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
247
248
    /**
     * Create DOMDocument from HTML.
     *
     * Steps:
     * 1. Optionally pre-process broken HTML (when $this->keepBrokenHtml is set).
     * 2. Set the isDOMDocumentCreated* flags that describe which wrapper elements
     *    (<html>, <head>, <body>, <p>, ...) are missing from the input.
     * 3. Run the script-tag helpers when the input contains "<script".
     * 4. Parse: first as XML via SimpleXML; when that fails or libxml reports errors,
     *    fall back to \DOMDocument::loadHTML() with an XML-prolog encoding hint.
     *
     * The libxml error/entity-loader settings changed here are restored before
     * returning, also when parsing throws.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions Extra LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        $trimmedHtml = \ltrim($html);

        if (\strpos($html, '<') === false) {
            // no markup at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos($trimmedHtml, '<') !== 0) {
            // markup present, but the input starts with plain text
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($trimmedHtml, '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        // true when neither "<tag " nor "<tag>" occurs in the input
        $lacksTag = static function (string $tag) use ($html): bool {
            return \strpos($html, '<' . $tag . ' ') === false
                   &&
                   \strpos($html, '<' . $tag . '>') === false;
        };

        if ($lacksTag('html')) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if ($lacksTag('body')) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        if ($lacksTag('head')) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if ($lacksTag('p')) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only escaped end tags ("<\/script>") and no real "</script>"
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            // NOTE(review): the return values are ignored, so these helpers presumably
            // modify $html by reference - confirm against their definitions.
            $this->html5FallbackForScriptTags($html);

            foreach (\array_keys($this->specialScriptTags) as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
        // toggled (and later restored) on older versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->isDOMDocumentCreatedWithCommentWrapper
                ||
                $this->keepBrokenHtml
            ) {
                // wrap in a helper element, so the parser gets a single root
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            // first attempt: strict XML parsing
            $documentFound = false;
            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $domElementTmp = \dom_import_simplexml($sxe);
                if ($domElementTmp && $domElementTmp->ownerDocument instanceof \DOMDocument) {
                    $documentFound = true;
                    $this->document = $domElementTmp->ownerDocument;
                }
            }

            // second attempt: lenient HTML parsing
            if ($documentFound === false) {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                //
                // Only prepend the prolog when the input does not already start with
                // one. The previous check had haystack and needle swapped, so it was
                // true for practically every input.
                $xmlHackUsed = false;
                if (\strncasecmp($html, '<?xml', 5) !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            /** @noinspection UnusedFunctionResultInspection */
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);

            if ($disableEntityLoader !== null) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
405
406
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the current document.
     *
     * @param string   $selector
     * @param int|null $idx      null => all matches; otherwise the 0-based position of
     *                           one match, negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *                           For $idx === null: the matches, or a SimpleHtmlDomNodeBlank when there
     *                           are none. Otherwise: the match at $idx, or a SimpleHtmlDomBlank.
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $found = new SimpleHtmlDomNode();

        // query() may yield a falsy result instead of a node list
        if ($matches) {
            foreach ($matches as $match) {
                $found[] = new SimpleHtmlDom($match);
            }
        }

        $total = \count($found);

        // return all elements
        if ($idx === null) {
            return $total === 0 ? new SimpleHtmlDomNodeBlank() : $found;
        }

        // handle negative values
        $position = $idx < 0 ? $total + $idx : $idx;

        // return one element
        return $found[$position] ?? new SimpleHtmlDomBlank();
    }
445
446
    /**
     * Find nodes with a CSS selector.
     *
     * Same as find() with a null index, i.e. all matches are returned.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $nodes = $this->find($selector, null);

        return $nodes;
    }
457
458
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomNodeBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
475
476
    /**
     * Find one node with a CSS selector.
     *
     * Same as find() with index 0, i.e. the first match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $first = $this->find($selector, 0);

        return $first;
    }
487
488
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * "Not found" is detected by find() handing back a SimpleHtmlDomBlank.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $first = $this->find($selector, 0);

        return $first instanceof SimpleHtmlDomBlank ? false : $first;
    }
505
506
    /**
     * Clean up serialized markup.
     *
     * Strips the wrapper tags / doctype for which the matching
     * getIsDOMDocumentCreated*() check is true, removes the internal helper tags,
     * collapses doubled <head> and "<br></br>", trims the result, decodes html
     * entities and finally restores the entities that were replaced before parsing.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        $wrapperTags = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $wrapperTags[] = '<html>';
            $wrapperTags[] = '</html>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            $wrapperTags[] = '<head>';
            $wrapperTags[] = '</head>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $wrapperTags[] = '<body>';
            $wrapperTags[] = '</body>';
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $wrapperTags[] = '</script>';
        }

        // str_replace() applies the search entries one after another, in list order
        $content = \str_replace($wrapperTags, '', $content);

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // leading "<p>" only, but every "</p>"
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // search => replacement
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanups = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];

        $content = \trim(
            \str_replace(\array_keys($cleanups), \array_values($cleanups), $content)
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
618
619
    /**
     * Return elements by ".class".
     *
     * Delegates to findMulti() with the selector "." followed by $class.
     *
     * @param string $class Class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
630
631
    /**
     * Return element by #id.
     *
     * Delegates to findOne() with the selector "#" followed by $id.
     *
     * @param string $id Id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
642
643
    /**
     * Return element by tag name.
     *
     * Uses the document's own getElementsByTagName() and wraps the first hit.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface a SimpleHtmlDomBlank when no such element exists
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($first);
    }
660
661
    /**
     * Returns elements by "#id".
     *
     * Delegates to find() with the selector "#" followed by $id.
     *
     * @param string   $id  Id without the leading hash
     * @param int|null $idx Passed through to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
673
674
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null => all matches; otherwise the 0-based position of
     *                       one match, negative values count from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *                       For $idx === null: the matches, or a SimpleHtmlDomNodeBlank when there
     *                       are none. Otherwise: the match at $idx, or a SimpleHtmlDomBlank.
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        //
        // A missing single element is represented by the element placeholder, as in
        // find(); previously the list placeholder (SimpleHtmlDomNodeBlank) was returned.
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
709
710
    /**
     * Get dom node's outer html.
     *
     * Invokes static::$callback (when set) with [$this] as its single argument,
     * serializes the document and passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string when serializing fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // without an <html> wrapper in the input only the root element is serialized
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
735
736
    /**
     * Load HTML from string.
     *
     * Clears self::$domBrokenReplaceHelper, then replaces the current document with
     * the one built by createDOMDocument().
     *
     * NOTE(review): static analysis reported that returning $this (HtmlDomParser) is
     * incompatible with the return type "self" declared by
     * DomParserInterface::loadHtml() - confirm against the interface.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
753
754
    /**
     * Load HTML from a local file or from an http(s) URL.
     *
     * Resets the shared broken-HTML replacement helper, reads the complete
     * resource into a string and passes that string on to loadHtml().
     *
     * @param string   $filePath           local path or http(s) URL
     * @param int|null $libXMLExtraOptions passed through unchanged to loadHtml()
     *
     * @throws \RuntimeException if a local file does not exist or the content cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        // Only non-URL paths are checked for existence up front.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": same output, but the latter form is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible to callers.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // Both readers are treated as signalling failure with a false return value.
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
794
795
    /**
     * Serialize the current document with saveXML() and post-process the result.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: run the output through fixHtmlOutput();
     *                                       false: only decode html entities and put the
     *                                       preserved replacements back
     * @param bool $removeXmlHeader          true: strip every "<?xml ... ?>" match and left-trim the result
     * @param int  $options                  option bitmask handed to saveXML()
     *
     * @return string the processed XML, or an empty string when saveXML() returns false
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
830
831
    /**
832
     * @param string $selector
833
     * @param int    $idx
834
     *
835
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
836
     */
837 3
    public function __invoke($selector, $idx = null)
    {
        // Calling the parser object like a function simply delegates to find().
        $found = $this->find($selector, $idx);

        return $found;
    }
841
842
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
849
850
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
857
858
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
865
866
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
873
874
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
881
882
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
889
890
    /**
     * Read accessor for the "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * NOTE(review): the flag is assigned outside this accessor; presumably while
     * the DOMDocument is created - confirm against createDOMDocument().
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
897
898
    /**
     * Replace closing-tag fragments that have no matching opening tag by tokens.
     *
     * Works in three steps:
     * 1. Every element that has an opening tag and a same-named closing tag gets
     *    its angle brackets swapped for placeholder strings (one pair per pass,
     *    repeated until the string no longer changes).
     * 2. Every closing-tag-like fragment ("</name ...>") that still carries real
     *    angle brackets is replaced by a token (helper prefix + CRC32 of the
     *    fragment); fragment and token are appended to the 'orig' / 'tmp' lists
     *    of self::$domBrokenReplaceHelper.
     * 3. The placeholders from step 1 are turned back into angle brackets.
     *
     * NOTE(review): the recorded 'orig' / 'tmp' pairs are presumably swapped back
     * when the document is rendered - that code is outside this method, confirm.
     *
     * @param string $html
     *
     * @return string the html with tokens in place of the recorded fragments
     */
    protected function keepBrokenHtml(string $html): string
    {
        $closePlaceholder = '°lt/_simple_html_dom__voku_°';
        $openPlaceholder = '°lt_simple_html_dom__voku_°';
        $endPlaceholder = '°gt_simple_html_dom__voku_°';

        $placeholders = [$closePlaceholder, $openPlaceholder, $endPlaceholder];
        $brackets = ['</', '<', '>'];

        // Step 1: mask well-formed "<name ...>value</name>" pairs.
        $pairPattern = '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui';
        $maskPair = static function ($matches) use ($closePlaceholder, $openPlaceholder, $endPlaceholder) {
            $opening = $openPlaceholder . $matches['element_start'] . $matches['element_start_addon'] . $endPlaceholder;
            $closing = $closePlaceholder . $matches['element_end'] . $endPlaceholder;

            return $matches['start'] . $opening . $matches['value'] . $closing . $matches['end'];
        };

        while (true) {
            $masked = (string) \preg_replace_callback($pairPattern, $maskPair, $html);
            if ($masked === $html) {
                break;
            }

            $html = $masked;
        }

        // Step 2: tokenize what is left and remember the original fragment.
        $brokenPattern = '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u';
        $stashBroken = static function ($matches) use ($placeholders, $brackets) {
            // The fragment may contain placeholders from step 1; store it with real brackets.
            $fragment = \str_replace($placeholders, $brackets, $matches['broken']);
            $token = self::$domHtmlBrokenHtmlHelper . \crc32($fragment);

            self::$domBrokenReplaceHelper['orig'][] = $fragment;
            self::$domBrokenReplaceHelper['tmp'][] = $token;

            return $matches['start'] . $token . $matches['end'];
        };

        while (true) {
            $tokenized = (string) \preg_replace_callback($brokenPattern, $stashBroken, $html);
            if ($tokenized === $html) {
                break;
            }

            $html = $tokenized;
        }

        // Step 3: unmask the pairs hidden in step 1.
        return \str_replace($placeholders, $brackets, $html);
    }
948
949
    /**
     * Rewrite "<script type=...>" blocks whose type is listed in $this->specialScriptTags.
     *
     * For each matching script element one of two things happens:
     * - if its content contains one of $this->templateLogicSyntaxInSpecialScriptTags,
     *   the content is replaced by a token (helper prefix + CRC32 of the content) and
     *   content / token are appended to the 'orig' / 'tmp' lists of
     *   self::$domBrokenReplaceHelper;
     * - otherwise the "script" tag name is exchanged for self::$domHtmlSpecialScriptHelper
     *   in both the opening and the closing tag.
     * In both cases the escaped closing sequence "<\/" is turned into "</" first.
     *
     * NOTE(review): the entries of $this->specialScriptTags are inserted into the
     * regular expression verbatim, so they have to be valid (already escaped)
     * pattern fragments - confirm against the property's default values.
     *
     * @param string $html modified in place (passed by reference)
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $typeAlternatives = \implode('|', $this->specialScriptTags);
        $scriptPattern = '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $typeAlternatives . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU';

        $rewriteScript = function ($matches) {
            // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
            // because often this looks like non valid html in the template itself.
            foreach ($this->templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
                if (\strpos($matches['innerContent'], $syntaxMarker) === false) {
                    continue;
                }

                // remove the html5 fallback
                $inner = \str_replace('<\/', '</', $matches['innerContent']);
                $token = self::$domHtmlBrokenHtmlHelper . \crc32($inner);

                self::$domBrokenReplaceHelper['orig'][] = $inner;
                self::$domBrokenReplaceHelper['tmp'][] = $token;

                return $matches['start'] . $token . $matches['end'];
            }

            // remove the html5 fallback
            $wholeElement = \str_replace('<\/', '</', $matches[0]);

            // Swap the tag name: cut "<script" off the front and "</script>" off the end.
            $renamedOpening = '<' . self::$domHtmlSpecialScriptHelper . \substr($wholeElement, \strlen('<script'));
            $withoutClosing = \substr($renamedOpening, 0, -\strlen('</script>'));

            return $withoutClosing . '</' . self::$domHtmlSpecialScriptHelper . '>';
        };

        $html = (string) \preg_replace_callback($scriptPattern, $rewriteScript, $html);
    }
985
986
    /**
     * Store the "keep broken html" switch on this parser (fluent setter).
     *
     * NOTE(review): the switch presumably decides whether keepBrokenHtml() runs
     * while a document is created - the reading side is outside this method, confirm.
     *
     * @param bool $keepBrokenHtml new value of the switch
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
997
998
    /**
     * Replace the list of template-logic markers looked for inside special script tags.
     *
     * The given list replaces the current one completely; nothing is stored
     * when validation fails.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        $nonStrings = \array_filter(
            $templateLogicSyntaxInSpecialScriptTags,
            static function ($candidate): bool {
                return !\is_string($candidate);
            }
        );

        if ($nonStrings !== []) {
            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1015
1016
    /**
     * Replace the list of script "type" values that are treated as special script tags.
     *
     * Only the values are validated: keepSpecialScriptTags() joins the values
     * with implode() and never looks at the keys, so plain lists (integer keys)
     * are accepted as well as string-keyed arrays.
     *
     * NOTE(review): the values end up verbatim inside a regular expression in
     * keepSpecialScriptTags(), so they have to be valid pattern fragments - confirm
     * the expected escaping against the property's default values.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any element is not a string
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        foreach ($specialScriptTags as $tag) {
            if (!\is_string($tag)) {
                throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
            }
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1033
}
1034