Completed
Push — master ( d45b97...b40fc2 )
by Lars
01:55
created

HtmlDomParser::loadHtmlFile()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 27

Duplication

Lines 27
Ratio 100 %

Code Coverage

Tests 10
CRAP Score 6.1666

Importance

Changes 0
Metric Value
dl 27
loc 27
ccs 10
cts 12
cp 0.8333
rs 8.8657
c 0
b 0
f 0
cc 6
nc 8
nop 2
crap 6.1666
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * Alias map used by __call(): the key is the lower-cased method name a caller
     * used, the value is the real method on this class that gets invoked.
     *
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * Set by createDOMDocument() when the input contains no "<" at all;
     * fixHtmlOutput() then strips "<p>", "</p>" and the HTML 4.0 transitional doctype.
     *
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * Set by createDOMDocument() when the input contains "<" but does not start
     * with it (after left-trimming); the input is then wrapped in the helper tag
     * before parsing and fixHtmlOutput() removes the leading "<p>" and any "</p>".
     *
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * Set by createDOMDocument() when the input has no "<head>";
     * fixHtmlOutput() then strips "<head>" and "</head>" from the output.
     *
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * Set by createDOMDocument() when the input has no "<html";
     * fixHtmlOutput() then strips "<html>", "</html>", "<body>" and "</body>".
     *
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
67
68
    /**
69
     * Set by createDOMDocument() when the input has no "</script>" but does contain
     * the escaped form "<\/script>"; fixHtmlOutput() then strips "</script>".
     *
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
72
73
    /**
74
     * When truthy, createDOMDocument() runs keepBrokenHtml() on the trimmed input
     * and wraps it in the helper tag. No default is declared here.
     * NOTE(review): where this gets assigned is not visible in this part of the file.
     *
     * @var bool
75
     */
76
    protected $keepBrokenHtml;
77
78
    /**
     * Create the internal DOMDocument and, if something was passed in, seed it.
     *
     * A SimpleHtmlDomInterface is unwrapped to its node first; a \DOMNode is deep-imported
     * and appended; any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // reset (static: shared by every parser instance)
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }

            return;
        }

        if ($source === null) {
            return;
        }

        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);
    }
112
113
    /**
     * Forward calls to aliased method names (see self::$functionAliases).
     *
     * The lookup is done on the lower-cased name.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
129
130
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * Each one builds a fresh parser and loads the first argument as an HTML string
     * or as a file path; the second argument is passed on as extra libxml options.
     * The name is matched case-insensitively, the same way __call() treats its aliases.
     *
     * @param string $name
     * @param array  $arguments [0] => HTML string or file path (default ''), [1] => int|null libxml options
     *
     * @throws \BadMethodCallException if the name is neither of the two supported ones
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        // PHP method names are case-insensitive, so accept any casing here as well.
        $method = \strtolower($name);

        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        if ($method === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist');
    }
159
160
    /** @noinspection MagicMethodsValidityInspection */
161
162
    /**
     * Magic read access for the html / text pseudo-properties.
     *
     * The name is compared lower-cased: "outerhtml"/"outertext" give html(),
     * "innerhtml"/"innertext" give innerHtml(), "text"/"plaintext" give text().
     *
     * @param string $name
     *
     * @return string|null null for any other name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
185
186
    /**
     * String conversion yields the same markup as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
193
194
    /**
     * No-op kept only for API compatibility; it performs no work.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
205
206
    /**
     * Create DOMDocument from HTML.
     *
     * Before parsing, the isDOMDocumentCreated* flags record which wrapper elements
     * the input lacks; fixHtmlOutput() reads them to strip what the parser adds.
     * Parsing is attempted with SimpleXML first; if that fails or libxml reported
     * errors, DOMDocument::loadHTML() is used instead.
     *
     * The libxml error handling (and, before PHP 8.0, the entity loader switch) is
     * changed for the duration of the call and restored afterwards, also when an
     * exception escapes.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions <p>Extra LIBXML_* flags, OR-ed into the default options.</p>
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
                ||
                \strpos($html, 'type="text/x-custom-template"') !== false
                ||
                \strpos($html, 'type=\'text/x-custom-template\'') !== false
                ||
                \strpos($html, 'type=text/x-custom-template') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
        // called on older versions; null means "nothing to restore".
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->keepBrokenHtml
            ) {
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            // first attempt: SimpleXML, accepted only if libxml reported no errors
            $documentFound = false;
            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $domElementTmp = \dom_import_simplexml($sxe);
                if ($domElementTmp) {
                    $documentFound = true;
                    $this->document = $domElementTmp->ownerDocument;
                }
            }

            // second attempt: the HTML parser
            if ($documentFound === false) {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                //
                // NOTE(review): the stripos() arguments look swapped (haystack '<?xml',
                // needle $html), so this condition is true for practically every input
                // and the encoding prefix is effectively always prepended. Kept as-is on
                // purpose: "correcting" it would stop forcing the encoding for input that
                // brings its own <?xml ...?> declaration. Confirm the intent before changing.
                $xmlHackUsed = false;
                if (\stripos('<?xml', $html) !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            /** @noinspection UnusedFunctionResultInspection */
                            $this->document->removeChild($child);

                            // stop right away: the node list was just modified
                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings, even if parsing threw
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);
            if ($disableEntityLoader !== null) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
340
341
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and run against the current document.
     * With $idx === null all matches are returned (a blank node list if there are
     * none). Otherwise a single element is returned; a negative $idx counts from
     * the end, and a position without a match yields a blank element.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $domXPath = new \DOMXPath($this->document);

        $matches = new SimpleHtmlDomNode();
        foreach ($domXPath->query($query) as $domNode) {
            $matches[] = new SimpleHtmlDom($domNode);
        }

        $total = \count($matches);

        // no index requested: hand back the whole list
        if ($idx === null) {
            return $total === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
        }

        // negative index: count from the end
        $position = $idx < 0 ? $total + $idx : $idx;

        return $matches[$position] ?? new SimpleHtmlDomBlank();
    }
378
379
    /**
     * Find nodes with a CSS selector.
     *
     * Shorthand for find() without an index.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        // find()'s $idx defaults to null, i.e. "all matches"
        return $this->find($selector);
    }
390
391
    /**
     * Find one node with a CSS selector.
     *
     * Shorthand for find() with index 0.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $first = $this->find($selector, 0);

        return $first;
    }
402
403
    /**
     * Clean up serialized markup so it resembles what was originally loaded.
     *
     * Depending on the flags set in createDOMDocument(), wrapper tags the parser
     * added (html/body, head, p, doctype, closing script tag) are stripped again.
     * Afterwards the helper wrapper tag and a few doubled tags are normalised, HTML
     * entities are decoded and the preserved entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // str_replace() works through an array of needles one after another, so the
        // removals below can be collected and applied in a single call.
        $addedTags = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $addedTags[] = '<body>';
            $addedTags[] = '</body>';
            $addedTags[] = '<html>';
            $addedTags[] = '</html>';
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $addedTags[] = '<head>';
            $addedTags[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $addedTags[] = '</script>';
        }

        if ($addedTags !== []) {
            $content = \str_replace($addedTags, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // leading "<p>" only, but every "</p>"
            foreach (['/^<p>/', '/<\/p>/'] as $pattern) {
                $content = (string) \preg_replace($pattern, '', $content);
            }
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextWrappers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextWrappers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $normalisations = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \trim(
            \str_replace(
                \array_keys($normalisations),
                \array_values($normalisations),
                $content
            )
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
492
493
    /**
     * Return elements by .class.
     *
     * Builds the selector ".<class>" and delegates to findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->findMulti(".{$class}");
    }
504
505
    /**
     * Return element by #id.
     *
     * Builds the selector "#<id>" and delegates to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->findOne("#{$id}");
    }
516
517
    /**
     * Return element by tag name.
     *
     * Takes the first node the document reports for the tag name; if there is
     * none, a blank element is returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
534
535
    /**
     * Returns elements by #id.
     *
     * Builds the selector "#<id>" and delegates to find(), passing $idx through.
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2
        return $this->find("#{$id}", $idx);
    }
547
548
    /**
     * Returns elements by tag name.
     *
     * With $idx === null all matching elements are returned (a blank node list if
     * there are none). Otherwise a single element is returned; a negative $idx
     * counts from the end, and a position without a match yields a blank
     * element, the same as find() does.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a miss is a blank *element*, not a blank node list
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
583
584
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked first, receiving an array
     * that contains this parser. The document is then serialized (only the
     * document element when the input had no html wrapper) and passed through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string if serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        $callback = $this::$callback;
        if ($callback !== null) {
            \call_user_func($callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        if ($content === false) {
            return '';
        }

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
609
610
    /**
611
     * Load HTML from string.
612
     *
613
     * @param string   $html
614
     * @param int|null $libXMLExtraOptions
615
     *
616
     * @return HtmlDomParser
617
     */
618 131
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
619
    {
620 131
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);
621
622 131
        return $this;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this; (voku\helper\HtmlDomParser) is incompatible with the return type declared by the interface voku\helper\DomParserInterface::loadHtml of type self.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
623
    }
624
625
    /**
     * Load HTML from a file path or an http(s) URL and parse it via loadHtml().
     *
     * Paths that start with "http://" or "https://" (case-insensitive) skip the
     * file_exists() check; every other path must exist before it is read.
     * The content is read with UTF8::file_get_contents() when the class
     * \voku\helper\UTF8 is available, otherwise with \file_get_contents().
     *
     * @param string   $filePath           <p>Local file path or http(s) URL.</p>
     * @param int|null $libXMLExtraOptions <p>Extra libxml options, forwarded to loadHtml().</p>
     *
     * @throws \RuntimeException <p>If a non-http(s) path does not exist, or if the content
     *                           could not be read (the reader threw or returned false).</p>
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}": same output, but the latter is deprecated since PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // chain the original exception so the root cause is not lost for the caller
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // both readers may signal failure with false instead of throwing
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
662
663
    /**
     * Replace the temporary placeholders in the given string with their original values.
     *
     * Two passes are applied:
     * 1. the entries collected in self::$domBrokenReplaceHelper (only if any exist),
     * 2. a combined map built once per request from self::$domLinkReplaceHelper,
     *    self::$domReplaceHelper, the html-wrapper helper tag (replaced by an empty
     *    string) and the special-script helper tag (replaced by "<script" / "</script>").
     *
     * @param string $html <p>String that may contain placeholders.</p>
     *
     * @return string <p>The string with the placeholders replaced.</p>
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built lazily on the first call and then reused; str_replace() pairs the
        // two lists by position, so both must be assembled in the same order
        static $replaceBackMap = null;

        if ($replaceBackMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            $replaceBackMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? [];
        if (\count($brokenPlaceholders) > 0) {
            $html = \str_replace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($replaceBackMap['tmp'], $replaceBackMap['orig'], $html);
    }
705
706
    /**
     * Replace selected substrings of the input with temporary placeholders.
     *
     * The values in self::$domReplaceHelper['orig'] are always replaced by the matching
     * values in self::$domReplaceHelper['tmp']. If the input contains "http", every URL
     * found by the regular expression below is additionally rewritten with
     * self::$domLinkReplaceHelper ('orig' => 'tmp'), and these URL replacements are
     * applied before the general ones.
     *
     * @param string $html <p>Raw input string.</p>
     *
     * @return string <p>The input with the placeholders inserted.</p>
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // cheap pre-check, so the URL regex only runs when a link can be present at all
        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            $urlMatches = [];
            \preg_match_all($regExUrl, $html, $urlMatches);

            if (!empty($urlMatches[1])) {
                $urls = $urlMatches[1];

                // with an array subject, str_replace() returns one rewritten entry per URL
                $protectedUrls = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $urls
                );

                // URL pairs go first, so complete links are swapped before the general pairs run
                $search = \array_merge($urls, $search);
                $replace = \array_merge($protectedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
746
747
    /**
     * Get the document as XML, optionally post-processed like the HTML output.
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput() / decodeHtmlEntity().</p>
     * @param bool $htmlToXml                <p>If true the XML is passed through fixHtmlOutput(); otherwise
     *                                       the entities are decoded and the placeholders are put back.</p>
     * @param bool $removeXmlHeader          <p>If true, "&lt;?xml ... ?&gt;" is removed and the result is left-trimmed.</p>
     * @param int  $options                  <p>Options for \DOMDocument::saveXML().</p>
     *
     * @return string <p>The XML string, or an empty string if saveXML() failed.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on failure; without this guard the false would be passed
        // to string-only calls below (TypeError under strict_types). An empty string is
        // returned, consistent with how a failed saveHTML() is handled in this class.
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            $return = $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        } else {
            $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            $return = self::putReplacedBackToPreserveHtmlEntities($xml);
        }

        return $return;
    }
779
780
    /**
     * Allow the parser object to be called like a function: shortcut for find().
     *
     * Both arguments are forwarded unchanged to find(), and its result is returned as-is.
     *
     * @param string $selector
     * @param int    $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        return $this->find($selector, $idx);
    }
790
791
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * @return bool <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
798
799
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * @return bool <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
806
807
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * @return bool <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
814
815
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * @return bool <p>The current value of the flag.</p>
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
822
823
    /**
     * Replace unbalanced closing-tag fragments with placeholders so they can be restored later.
     *
     * Works in three steps:
     * 1. Every opening tag that has a matching closing tag gets its "<", "</" and ">"
     *    rewritten to temporary markers; repeated until the string no longer changes.
     * 2. Fragments starting with "</" that are still present afterwards are swapped for a
     *    "____simple_html_dom__voku__broken_html____<crc32>" placeholder. The fragment and its
     *    placeholder are recorded in self::$domBrokenReplaceHelper ('orig' / 'tmp'), which
     *    putReplacedBackToPreserveHtmlEntities() uses to restore them.
     * 3. The temporary markers from step 1 are turned back into "<", "</" and ">".
     *
     * If a regular expression fails (preg_replace_callback() returns null, e.g. because a
     * PCRE limit was hit), the affected step stops and the HTML produced so far is kept,
     * instead of the document being replaced by an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $markers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $markerChars = ['</', '<', '>'];

        // step 1: mask balanced tag pairs, one pair per pass
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            // null signals a PCRE error; casting it to string would wipe the whole document
            if ($masked === null) {
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // step 2: swap the remaining (unmasked) closing-tag fragments for hash placeholders
        do {
            $original = $html;

            $swapped = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($markers, $markerChars) {
                    // the fragment may contain markers from step 1; store it in its real form
                    $matches['broken'] = \str_replace($markers, $markerChars, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            // same guard as above: never trade the document for an empty string
            if ($swapped === null) {
                break;
            }

            $html = $swapped;
        } while ($original !== $html);

        // step 3: unmask the balanced tags again
        return \str_replace($markers, $markerChars, $html);
    }
873
874
    /**
     * Rename template-like script blocks in place so they are no longer "script" elements.
     *
     * Every "<script ... type=text/html|text/x-custom-template ...>...</script>" block found
     * in the input gets its tag name replaced by self::$domHtmlSpecialScriptHelper, and any
     * "<\/" inside the block is turned back into "</". The input string is modified by
     * reference; nothing is returned.
     *
     * @param string $html <p>HTML to modify (by reference).</p>
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $regExSpecialScript = '/<(script) [^>]*type=(["\']){0,1}(text\/html|text\/x-custom-template)\2{0,1}([^>]*)>.*<\/\1>/isU';
        $found = [];
        \preg_match_all($regExSpecialScript, $html, $found);

        $openTagLength = \strlen('<script');
        $closeTagLength = \strlen('</script>');

        foreach ($found[0] ?? [] as $scriptBlock) {
            // keep everything between "<script" and "</script>", and swap only the tag name
            $renamedBlock = '<' . self::$domHtmlSpecialScriptHelper
                            . \substr($scriptBlock, $openTagLength, -$closeTagLength)
                            . '</' . self::$domHtmlSpecialScriptHelper . '>';

            // remove the html5 fallback
            $renamedBlock = \str_replace('<\/', '</', $renamedBlock);

            $html = \str_replace($scriptBlock, $renamedBlock, $html);
        }
    }
895
896
    /**
     * Set the "keepBrokenHtml" option of this parser.
     *
     * Only the flag is stored here; this method itself does not touch the document.
     *
     * @param bool $keepBrokenHtml <p>New value of the option.</p>
     *
     * @return HtmlDomParser <p>The same instance, to allow method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
907
}
908