Completed
Push — master ( 17e133...01a57c )
by Lars
03:36 queued 01:26
created

HtmlDomParser::createDOMDocument()   F

Complexity

Conditions 34
Paths > 20000

Size

Total Lines 153

Duplication

Lines 30
Ratio 19.61 %

Code Coverage

Tests 70
CRAP Score 34

Importance

Changes 0
Metric Value
cc 34
nc 995328
nop 2
dl 30
loc 153
ccs 70
cts 70
cp 1
crap 34
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[]
50
     */
51
    protected $templateLogicSyntaxInSpecialScriptTags = [
52
        '+',
53
        '<%',
54
        '{%',
55
        '{{',
56
    ];
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHtml = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
77
78
    /**
79
     * @var bool
80
     */
81
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
82
83
    /**
84
     * @var bool
85
     */
86
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
87
88
    /**
89
     * @var bool
90
     */
91
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
92
93
    /**
94
     * @var bool
95
     */
96
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
97
98
    /**
99
     * @var bool
100
     */
101
    protected $keepBrokenHtml;
102
103
    /**
104
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
105
     */
106 210 View Code Duplication
    public function __construct($element = null)
    {
        // Start with an empty document that uses the parser's configured encoding.
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->formatOutput = true;
        $this->document->preserveWhiteSpace = true;

        // Unwrap our own element wrapper down to the underlying DOM node.
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            // Deep-copy the node into this document; only attach it when the import succeeded.
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            // Anything else that is not null is handed to loadHtml() as markup.
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
134
135
    /**
136
     * @param string $name
137
     * @param array  $arguments
138
     *
139
     * @return bool|mixed
140
     */
141 75
    public function __call($name, $arguments)
    {
        // Alias lookup is case-insensitive: the keys of $functionAliases are lower-case.
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        // Forward the call, with the original arguments, to the real method behind the alias.
        $target = self::$functionAliases[$alias];

        return \call_user_func_array([$this, $target], $arguments);
    }
151
152
    /**
153
     * @param string $name
154
     * @param array  $arguments
155
     *
156
     * @throws \BadMethodCallException
157
     * @throws \RuntimeException
158
     *
159
     * @return HtmlDomParser
160
     */
161 24 View Code Duplication
    public static function __callStatic($name, $arguments)
    {
        // First argument: HTML string or file path; second: optional extra libxml options.
        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        // PHP method names are case-insensitive, so match the alias the same way __call() does.
        $method = \strtolower($name);

        if ($method === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        // Include the requested name so the failure is diagnosable (mirrors __call()).
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
181
182
    /** @noinspection MagicMethodsValidityInspection */
183
184
    /**
185
     * @param string $name
186
     *
187
     * @return string|null
188
     */
189 15
    public function __get($name)
    {
        // Property names are matched case-insensitively.
        $property = \strtolower($name);

        // Outer markup of the document.
        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        // Inner markup of the document.
        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        // Plain-text content.
        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        // Unknown magic properties resolve to null instead of raising an error.
        return null;
    }
207
208
    /**
209
     * @return string
210
     */
211 19
    public function __toString()
    {
        // Casting the parser to string yields the same markup as html() with default arguments.
        $markup = $this->html();

        return $markup;
    }
215
216
    /**
217
     * does nothing (only for api-compatibility-reasons)
218
     *
219
     * @return bool
220
     *
221
     * @deprecated
222
     */
223 6
    public function clear(): bool
    {
        // Intentional no-op kept only for API compatibility; it always reports success.
        return true;
    }
227
228
    /**
229
     * Create DOMDocument from HTML.
230
     *
231
     * @param string   $html
232
     * @param int|null $libXMLExtraOptions
233
     *
234
     * @return \DOMDocument
235
     */
236 194
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // Optionally pre-process the (trimmed) markup via keepBrokenHtml() before parsing.
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // Record what the input is missing. These flags are only ever switched on here;
        // NOTE(review): they are not reset, so they persist when the same parser loads again.
        if (\strpos($html, '<') === false) {
            // no markup at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // markup present, but the input starts with plain text
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped closing script tag ("<\/script>") is present, no real "</script>".
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            // Template-style script blocks get extra handling.
            if (
                \strpos($html, 'text/html') !== false
                ||
                \strpos($html, 'text/x-custom-template') !== false
                ||
                \strpos($html, 'text/x-handlebars-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only call it on
        // older versions and remember whether there is a previous state to restore.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        // Add optional libxml flags only when this PHP/libxml build defines them.
        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Wrap the input in the helper element; fixHtmlOutput() removes the helper tags again.
        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First try the XML parser; its result is only used when libxml reported no errors at all.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        // Otherwise fall back to DOMDocument::loadHTML().
        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // Fixed argument order: stripos($haystack, $needle). The previous call searched
            // for $html inside the literal '<?xml', so an existing declaration was never detected.
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
389
390
    /**
391
     * Find list of nodes with a CSS selector.
392
     *
393
     * @param string   $selector
394
     * @param int|null $idx
395
     *
396
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
397
     */
398 142 View Code Duplication
    public function find(string $selector, $idx = null)
    {
        // Convert the CSS selector to XPath and run it against the current document.
        $xPathQuery = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($xPathQuery);

        // Wrap every matched DOM node; a falsy query result leaves the collection empty.
        $elements = new SimpleHtmlDomNode();
        foreach ($matches ?: [] as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // No index requested: return the whole list, or a blank list object when nothing matched.
        if ($idx === null) {
            return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
        }

        // Negative indexes count backwards from the end of the list.
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // Single element requested: fall back to a blank element when the index does not exist.
        return $elements[$position] ?? new SimpleHtmlDomBlank();
    }
429
430
    /**
431
     * Find nodes with a CSS selector.
432
     *
433
     * @param string $selector
434
     *
435
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
436
     */
437 12
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        // A null index asks find() for the complete result list.
        $matches = $this->find($selector, null);

        return $matches;
    }
441
442
    /**
443
     * Find nodes with a CSS selector or false, if no element is found.
444
     *
445
     * @param string $selector
446
     *
447
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type false|SimpleHtmlDomInter...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 57. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
448
     */
449 3
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        // find() signals "nothing matched" with a blank list object; report that as false.
        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
459
460
    /**
461
     * Find one node with a CSS selector.
462
     *
463
     * @param string $selector
464
     *
465
     * @return SimpleHtmlDomInterface
466
     */
467 32
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        // Index 0 asks find() for the first match only.
        $first = $this->find($selector, 0);

        return $first;
    }
471
472
    /**
473
     * Find one node with a CSS selector or false, if no element is found.
474
     *
475
     * @param string $selector
476
     *
477
     * @return false|SimpleHtmlDomInterface
478
     */
479 5
    public function findOneOrFalse(string $selector)
    {
        $first = $this->find($selector, 0);

        // find() signals "no such element" with a blank element object; report that as false.
        return $first instanceof SimpleHtmlDomBlank ? false : $first;
    }
489
490
    /**
491
     * @param string $content
492
     * @param bool   $multiDecodeNewHtmlEntity
493
     *
494
     * @return string
495
     */
496 120
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...
        // The steps below run in a fixed order; each one strips tags according to a "created without ..." flag.

        /** @noinspection HtmlRequiredLangAttribute */
        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        /** @noinspection HtmlRequiredTitleElement */
        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Patterns are applied one after another: a leading "<p>", then every "</p>".
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // Always remove the internal helper tags and collapse doubled <head> / <br> artifacts.
        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
            '<br></br>'            => '<br>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanup), \array_values($cleanup), $content)
        );

        // Decode entities, then put back whatever was swapped out to preserve HTML entities.
        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
602
603
    /**
604
     * Return elements by ".class".
605
     *
606
     * @param string $class
607
     *
608
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface[]...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 51. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
609
     */
610
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // Build the ".class" selector by concatenation: "${var}" interpolation is deprecated since PHP 8.2.
        return $this->findMulti('.' . $class);
    }
614
615
    /**
616
     * Return element by #id.
617
     *
618
     * @param string $id
619
     *
620
     * @return SimpleHtmlDomInterface
621
     */
622
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // Build the "#id" selector by concatenation: "${var}" interpolation is deprecated since PHP 8.2.
        return $this->findOne('#' . $id);
    }
626
627
    /**
628
     * Return element by tag name.
629
     *
630
     * @param string $name
631
     *
632
     * @return SimpleHtmlDomInterface
633
     */
634
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        // Only the first element with this tag name is of interest.
        $first = $this->document->getElementsByTagName($name)->item(0);

        // No such element: hand back a blank element object instead of null.
        return $first === null ? new SimpleHtmlDomBlank() : new SimpleHtmlDom($first);
    }
644
645
    /**
646
     * Returns elements by "#id".
647
     *
648
     * @param string   $id
649
     * @param int|null $idx
650
     *
651
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
652
     */
653
    public function getElementsById(string $id, $idx = null)
    {
        // Build the "#id" selector by concatenation: "${var}" interpolation is deprecated since PHP 8.2.
        return $this->find('#' . $id, $idx);
    }
657
658
    /**
659
     * Returns elements by tag name.
660
     *
661
     * @param string   $name
662
     * @param int|null $idx
663
     *
664
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
665
     */
666
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        // Wrap every matching DOM node.
        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements (a blank list object when nothing matched)
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values (count from the end)
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a missing index yields a blank *element* (not a blank list),
        // matching find() and the documented single-element return type
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
693
694
    /**
695
     * Get dom node's outer html.
696
     *
697
     * @param bool $multiDecodeNewHtmlEntity
698
     *
699
     * @return string
700
     */
701
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        // Invoke the registered callback, if any; it receives a one-element array holding this parser.
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // Without an <html> wrapper in the input, serialise only the document element;
        // otherwise serialise the whole document.
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        // saveHTML() reports failure with false; map that to an empty string.
        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
719
720
    /**
721
     * Load HTML from string.
722
     *
723
     * @param string   $html
724
     * @param int|null $libXMLExtraOptions
725
     *
726
     * @return HtmlDomParser
727
     */
728
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
    {
        // Reset the static broken-HTML replace helper before parsing new input.
        self::$domBrokenReplaceHelper = [];

        // Parse the markup and make the resulting document the parser's current one.
        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        // Return the parser itself.
        return $this;
    }
737
738
    /**
739
     * Load HTML from file.
740
     *
741
     * @param string   $filePath
742
     * @param int|null $libXMLExtraOptions
743
     *
744
     * @throws \RuntimeException
745
     *
746
     * @return HtmlDomParser
747
     */
748 View Code Duplication
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        // Reset the static replacement helper that keepBrokenHtml() and
        // keepSpecialScriptTags() fill while a document is being prepared.
        self::$domBrokenReplaceHelper = [];

        // Only non-http(s) paths are checked for existence up front; URLs are
        // handed directly to the reader below.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": the latter form is deprecated since PHP 8.2.
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            // Prefer the UTF8 helper's reader when that optional class is available.
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays reachable
            // through getPrevious().
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // file_get_contents() reports failure with false rather than an exception.
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
778
779
    /**
780
     * Get the HTML as XML or plain XML if needed.
781
     *
782
     * @param bool $multiDecodeNewHtmlEntity
783
     * @param bool $htmlToXml
784
     * @param bool $removeXmlHeader
785
     * @param int  $options
786
     *
787
     * @return string
788
     */
789 View Code Duplication
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);

        // saveXML() signals failure with false; callers receive an empty string.
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // Remove "<?xml ... ?>" declarations, then trim leading whitespace.
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        // Plain XML: decode the entities, then pass the result through
        // putReplacedBackToPreserveHtmlEntities().
        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
814
815
    /**
816
     * @param string $selector
817
     * @param int    $idx
818
     *
819
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
0 ignored issues
show
Documentation introduced by
The doc-type SimpleHtmlDomInterface|S...SimpleHtmlDomInterface> could not be parsed: Expected "|" or "end of type", but got "<" at position 74. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
820
     */
821
    public function __invoke($selector, $idx = null)
    {
        // Makes the parser callable: $parser($selector, $idx) simply forwards
        // both arguments to find() and returns its result unchanged.
        return $this->find($selector, $idx);
    }
825
826
    /**
827
     * @return bool
828
     */
829
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
833
834
    /**
835
     * @return bool
836
     */
837
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
841
842
    /**
843
     * @return bool
844
     */
845
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
849
850
    /**
851
     * @return bool
852
     */
853
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
857
858
    /**
859
     * @return bool
860
     */
861
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
865
866
    /**
867
     * @return bool
868
     */
869
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
873
874
    /**
875
     * @return bool
876
     */
877
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        // Plain accessor; the flag is assigned outside this excerpt
        // (presumably while the DOMDocument is created - confirm there).
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
881
882
    /**
883
     * @param string $html
884
     *
885
     * @return string
886
     */
887
    protected function keepBrokenHtml(string $html): string
    {
        // Pass 1: mask every balanced "<tag ...>value</tag>" pair by swapping its
        // angle brackets for marker strings. Repeated until nothing changes, so
        // nested pairs are masked too and only unbalanced tags keep literal
        // brackets.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            // preg_replace_callback() returns null on a PCRE error (e.g. backtrack
            // limit). Casting that to string would silently empty the document,
            // so keep the last good markup and stop this pass instead.
            if ($masked === null) {
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // Pass 2: fragments that still start with a literal closing tag are cut
        // out and replaced by a placeholder (helper prefix + crc32 of the
        // fragment). The fragment itself is remembered in the static helper
        // under "orig", the placeholder under "tmp".
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) {
                    // Restore real brackets inside the fragment before storing it.
                    $matches['broken'] = \str_replace(
                        ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
                        ['</', '<', '>'],
                        $matches['broken']
                    );

                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $matches['start'] . $placeholder . $matches['end'];
                },
                $html
            );

            // Same guard as in pass 1: never trade the document for an empty string.
            if ($replaced === null) {
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // Finally turn all remaining markers back into real angle brackets.
        return \str_replace(
            ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
            ['</', '<', '>'],
            $html
        );
    }
932
933
    /**
934
     * @param string $html
935
     *
936
     * @return void
937
     */
938
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $processedHtml = \preg_replace_callback(
            '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:text\/html|text\/x-custom-template|text\/x-handlebars-template)+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) === false) {
                        continue;
                    }

                    // Un-escape "<\/" back to "</" before the content is stored.
                    $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                    // Swap the template body for a placeholder (helper prefix +
                    // crc32); the body is kept under "orig", the placeholder
                    // under "tmp".
                    $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $matches['start'] . $placeholder . $matches['end'];
                }

                // No template logic found: un-escape "<\/" and rename the element
                // by replacing the "script" tag name with the special helper name
                // on both the opening and the closing tag.
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error; casting that to
        // string would wipe the caller's markup, so $html is only overwritten
        // when the replacement actually succeeded.
        if ($processedHtml !== null) {
            $html = $processedHtml;
        }
    }
969
970
    /**
971
     * @param bool $keepBrokenHtml
972
     *
973
     * @return HtmlDomParser
974
     */
975
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        // Store the switch; it is read outside this excerpt
        // (presumably when a document is created - confirm there).
        $this->keepBrokenHtml = $keepBrokenHtml;

        // Fluent interface.
        return $this;
    }
981
982
    /**
983
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
984
     *
985
     * @return HtmlDomParser
986
     */
987
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // Validate every entry before touching the current configuration, so a
        // rejected list leaves the previously configured markers intact.
        foreach ($templateLogicSyntaxInSpecialScriptTags as $tmp) {
            if (!\is_string($tmp)) {
                // __FUNCTION__ keeps the message in sync with the real method name
                // (the old text referred to "setTemplateLogicSyntaxInSpecialScriptTags").
                throw new \InvalidArgumentException(__FUNCTION__ . ' only allows string[]');
            }
        }

        // keepSpecialScriptTags() searches for these markers (via strpos) inside
        // the content of special <script> template tags.
        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        // Fluent interface.
        return $this;
    }
999
}
1000