Completed
Push — master ( d1f2b9...89c29c )
by Lars
01:36
created

HtmlDomParser::getElementsById()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 1
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
     * Lower-cased legacy method names accepted by __call(), mapped to the real method that is invoked.
     *
     * @var string[]
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Template-syntax markers.
     *
     * NOTE(review): not referenced in the visible part of this class; presumably consulted
     * while preserving special script tags (see keepSpecialScriptTags()) - confirm.
     *
     * @var string[]
     */
    protected $templateLogicSyntaxInSpecialScriptTags = [
        '+',
        '<%',
        '{%',
        '{{',
    ];

    /**
     * Set by createDOMDocument() when the input contains no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by createDOMDocument() when the input contains "<" but does not start with it
     * (after trimming leading whitespace).
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by createDOMDocument() when the left-trimmed input starts with "<!--".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithCommentWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<head " nor "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<p " nor "<p>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<html " nor "<html>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Set by createDOMDocument() when the input contains neither "<body " nor "<body>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;

    /**
     * Set by createDOMDocument() when the input has no "</script>" but does contain
     * the escaped form "<\/script>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * When truthy, createDOMDocument() pre-processes the input via keepBrokenHtml()
     * and wraps it in the helper element.
     *
     * Initialised to false so the property really holds a bool, like the other flags.
     *
     * @var bool
     */
    protected $keepBrokenHtml = false;
102
103
    /**
     * Create a parser with a fresh DOMDocument and optionally seed it.
     *
     * A SimpleHtmlDomInterface is unwrapped to its node first; a \DOMNode is deep-imported
     * and appended to the new document; any other non-null value is passed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code, a SimpleHtmlDomInterface or a \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
134
135
    /**
     * Dispatch legacy instance-method names (case-insensitive) to their real implementation.
     *
     * @param string $name      called method name
     * @param array  $arguments arguments forwarded unchanged to the aliased method
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return \call_user_func_array([$this, $target], $arguments);
    }
151
152
    /**
     * Static factory shortcuts: "str_get_html" loads markup from a string,
     * "file_get_html" loads it from a file, each on a new parser instance.
     *
     * @param string $name      called static method name (matched exactly)
     * @param array  $arguments [0] => html string or file path (default ''), [1] => extra libxml options (default null)
     *
     * @throws \BadMethodCallException for any other method name
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $loaders = [
            'str_get_html'  => 'loadHtml',
            'file_get_html' => 'loadHtmlFile',
        ];

        $source = $arguments[0] ?? '';
        $libXmlOptions = $arguments[1] ?? null;

        if (!isset($loaders[$name])) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $parser = new static();
        $loader = $loaders[$name];

        return $parser->{$loader}($source, $libXmlOptions);
    }
181
182
    /** @noinspection MagicMethodsValidityInspection */
183
184
    /**
     * Resolve the magic read-only properties (case-insensitive).
     *
     * "outerhtml"/"outertext" yield html(), "innerhtml"/"innertext" yield innerHtml(),
     * "text"/"plaintext" yield text(); anything else yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
207
208
    /**
     * String conversion yields the same markup as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
215
216
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
227
228
    /**
     * Create DOMDocument from HTML.
     *
     * Steps:
     *  1. optionally pre-process broken markup (keepBrokenHtml),
     *  2. record which wrappers (html/head/body/p/comment/...) the input lacks, so the
     *     output can be cleaned up again later,
     *  3. pre-process script tags,
     *  4. try a strict XML parse first and fall back to DOMDocument::loadHTML(),
     *  5. restore the previous libxml settings.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the default options
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only an escaped closing script tag is present
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only
        // call it on older versions to avoid the deprecation notice.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // wrap fragments in a helper element so the parser sees a single root
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: strict XML parsing, accepted only if it produced no libxml errors
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // fallback: lenient HTML parsing
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // haystack first, needle second: prepend the encoding declaration unless
            // the markup already starts with an XML declaration
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
395
396
    /**
     * Find list of nodes with a CSS selector.
     *
     * Without an index the whole match list is returned (a SimpleHtmlDomNodeBlank when
     * nothing matched). With an index a single element is returned (a SimpleHtmlDomBlank
     * when that position does not exist); negative indexes count from the end.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $matches = $xPath->query($xPathQuery);

        $collection = new SimpleHtmlDomNode();
        if ($matches) {
            foreach ($matches as $match) {
                $collection[] = new SimpleHtmlDom($match);
            }
        }

        // a single element was requested
        if ($idx !== null) {
            // negative positions are counted from the end of the list
            $position = $idx < 0 ? \count($collection) + $idx : $idx;

            return $collection[$position] ?? new SimpleHtmlDomBlank();
        }

        // the whole list was requested
        return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
    }
435
436
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
447
448
    /**
     * Find all nodes matching a CSS selector, or false if find() reports a blank list.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
465
466
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
477
478
    /**
     * Find the first node matching a CSS selector, or false if find() reports a blank element.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $match = $this->find($selector, 0);

        return $match instanceof SimpleHtmlDomBlank ? false : $match;
    }
495
496
    /**
     * Clean up serialized markup: strip wrapper tags the input did not contain,
     * remove the internal helper elements, then decode entities and put the
     * preserved replacements back.
     *
     * @param string $content                  serialized markup
     * @param bool   $multiDecodeNewHtmlEntity forwarded to decodeHtmlEntity()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // literal tags to strip before the paragraph handling, in application order
        $stripFirst = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $stripFirst[] = '<html>';
            $stripFirst[] = '</html>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $stripFirst[] = '<head>';
            $stripFirst[] = '</head>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $stripFirst[] = '<body>';
            $stripFirst[] = '</body>';
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $stripFirst[] = '</script>';
        }

        if ($stripFirst !== []) {
            // str_replace() works through the search list in order, like consecutive calls
            $content = \str_replace($stripFirst, '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        // literal strings to strip after the paragraph handling, in application order
        $stripLast = [];

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $stripLast[] = '<p>';
            $stripLast[] = '</p>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $stripLast[] = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">';
        }

        if ($stripLast !== []) {
            $content = \str_replace($stripLast, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];

        $content = \trim(
            \str_replace(\array_keys($cleanupMap), \array_values($cleanupMap), $content)
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
608
609
    /**
     * Return elements by ".class".
     *
     * Builds the CSS selector "." + $class and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
620
621
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#" + $id and delegates to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
632
633
    /**
     * Return the first element with the given tag name, or a blank element if there is none.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
650
651
    /**
     * Returns elements by "#id".
     *
     * Builds the CSS selector "#" + $id and delegates to find(), forwarding $idx unchanged.
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx null for the whole list, otherwise the position of one element
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
663
664
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole list is returned (a SimpleHtmlDomNodeBlank when empty).
     * With an index a single entry is returned; negative indexes count from the end.
     *
     * NOTE(review): for a missing single index this returns SimpleHtmlDomNodeBlank, whereas
     * find() returns SimpleHtmlDomBlank in the same situation - confirm which is intended.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $collection = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $collection[] = new SimpleHtmlDom($domNode);
        }

        // a single element was requested
        if ($idx !== null) {
            // negative positions are counted from the end of the list
            $position = $idx < 0 ? \count($collection) + $idx : $idx;

            return $collection[$position] ?? new SimpleHtmlDomNodeBlank();
        }

        // the whole list was requested
        return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
    }
699
700
    /**
     * Get dom node's outer html.
     *
     * Invokes the registered static callback (if any) with [$this] first, serializes either
     * the document element (when the input had no html wrapper) or the whole document, and
     * passes the result through fixHtmlOutput(). A failed serialization yields ''.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $markup === false
            ? ''
            : $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
725
726
    /**
     * Load HTML from string.
     *
     * Clears the static broken-html replacement helper, then rebuilds the internal
     * document from the given markup via createDOMDocument().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, forwarded to createDOMDocument()
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // start from a clean replacement map
        self::$domBrokenReplaceHelper = [];

        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
743
744
    /**
     * Load HTML from a file path or an http(s) URL.
     *
     * Paths that do not start with "http://" or "https://" must exist on disk.
     * The content is read via UTF8::file_get_contents() when the
     * \voku\helper\UTF8 class is available, otherwise via the native
     * file_get_contents(), and is then passed to loadHtml().
     *
     * @param string   $filePath           <p>Local path or http(s) URL.</p>
     * @param int|null $libXMLExtraOptions <p>Handed through unchanged to loadHtml().</p>
     *
     * @throws \RuntimeException if a non-http(s) path does not exist or the content could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // reset
        self::$domBrokenReplaceHelper = [];

        // preg_match() yields 0 or false for "no match / error"; both count as "not a URL".
        $isHttpUrl = \preg_match("/^https?:\/\//i", $filePath) === 1;

        if (!$isHttpUrl && !\file_exists($filePath)) {
            // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause is not lost.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
784
785
    /**
     * Serialize the current document via saveXML() and post-process the result.
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput() or decodeHtmlEntity().</p>
     * @param bool $htmlToXml                <p>
     *                                       true: the serialized string is run through fixHtmlOutput();
     *                                       false: it is run through decodeHtmlEntity() and then
     *                                       putReplacedBackToPreserveHtmlEntities().
     *                                       </p>
     * @param bool $removeXmlHeader          <p>Strip "&lt;?xml ... ?&gt;" declarations and left-trim the result.</p>
     * @param int  $options                  <p>Options passed to the document's saveXML() call.</p>
     *
     * @return string <p>The processed XML, or an empty string if saveXML() failed.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
820
821
    /**
     * Make the parser callable like a function; the call is delegated to find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
831
832
    /**
     * Read accessor for the "created without head wrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHeadWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
839
840
    /**
     * Read accessor for the "created without p-tag wrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutPTagWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
847
848
    /**
     * Read accessor for the "created without html" flag.
     *
     * NOTE(review): distinct from the "...WithoutHtmlWrapper" flag; how the two
     * differ is decided where they are assigned, outside this method — confirm.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHtml.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
855
856
    /**
     * Read accessor for the "created without body wrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutBodyWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
863
864
    /**
     * Read accessor for the "created without html wrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutHtmlWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
871
872
    /**
     * Read accessor for the "created without wrapper" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithoutWrapper.</p>
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
879
880
    /**
     * Read accessor for the "created with fake end script" flag.
     *
     * NOTE(review): the flag is assigned outside this method; presumably it is
     * set while the DOM document is created — confirm where.
     *
     * @return bool <p>Current value of $isDOMDocumentCreatedWithFakeEndScript.</p>
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
887
888
    /**
     * Replace closing-tag fragments that have no masked partner with tokens,
     * recording the originals in self::$domBrokenReplaceHelper.
     *
     * Works in three steps:
     *  1. Repeatedly mask every "<name ...>...</name>" pair (same tag name,
     *     case-insensitive) by swapping its angle brackets for placeholder strings.
     *  2. Repeatedly match the closing-tag fragments that still carry real angle
     *     brackets, store each fragment under ['orig'] and a generated token
     *     (self::$domHtmlBrokenHtmlHelper + crc32 of the fragment) under ['tmp'],
     *     and put the token in the fragment's place.
     *  3. Turn the remaining bracket placeholders back into real angle brackets.
     *
     * If a regex call fails (preg_replace_callback() returns null), the affected
     * step stops and the last successfully processed HTML is kept, instead of the
     * input being collapsed to an empty string.
     *
     * @param string $html
     *
     * @return string <p>The HTML with the recorded fragments replaced by their tokens.</p>
     */
    protected function keepBrokenHtml(string $html): string
    {
        // Placeholder strings and the real bracket sequences they stand for (same order).
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Step 1: mask matching open/close tag pairs until nothing changes anymore.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error (e.g. backtrack limit): keep the last good state.
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // Step 2: swap the still-unmasked closing-tag fragments for tokens.
        do {
            $original = $html;

            $tokenized = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $brackets) {
                    // The fragment may contain masked pairs; store it with real brackets.
                    $matches['broken'] = \str_replace($placeholders, $brackets, $matches['broken']);

                    $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash;

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($tokenized === null) {
                // PCRE error: keep the last good state.
                break;
            }

            $html = $tokenized;
        } while ($original !== $html);

        // Step 3: restore the real angle brackets of everything that was only masked.
        return \str_replace($placeholders, $brackets, $html);
    }
938
939
    /**
     * Rewrite script tags of type "text/html" or "text/x-custom-template"
     * in place.
     *
     * For every matching script element:
     *  - if its inner content contains one of the strings in
     *    $this->templateLogicSyntaxInSpecialScriptTags, the inner content is
     *    stored in self::$domBrokenReplaceHelper['orig'] and replaced by a token
     *    (self::$domHtmlBrokenHtmlHelper + crc32 of the content), which is stored
     *    under ['tmp'];
     *  - otherwise the "script" tag name is replaced by
     *    self::$domHtmlSpecialScriptHelper in both the opening and closing tag.
     *
     * In both cases the escaped sequence "<\/" is turned back into "</".
     *
     * If the regex call fails (preg_replace_callback() returns null), $html is
     * left untouched instead of being overwritten with an empty string.
     *
     * @param string $html <p>Modified by reference.</p>
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $replaced = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {

                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                        self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                        self::$domBrokenReplaceHelper['tmp'][] = $matchesHash;

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // Swap the leading "<script" for the helper tag name ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                // ... and the trailing "</script>" for the matching helper end tag.
                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // null signals a PCRE error; keep the caller's HTML in that case.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
975
976
    /**
     * Store the "keep broken html" switch on the parser.
     *
     * @param bool $keepBrokenHtml <p>New value for $this->keepBrokenHtml.</p>
     *
     * @return HtmlDomParser <p>The same instance, so calls can be chained.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
987
988
    /**
     * Replace the list of template-logic markers stored in
     * $this->templateLogicSyntaxInSpecialScriptTags.
     *
     * Every entry is validated before anything is stored, so the existing list
     * stays unchanged when the input is rejected.
     *
     * NOTE(review): the exception message names "setTemplateLogicSyntaxInSpecialScriptTags",
     * not this method — confirm whether that is intentional before changing it.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return HtmlDomParser <p>The same instance, so calls can be chained.</p>
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1005
}
1006