Completed
Push — master ( d36330...3c9b9d )
by Lars
01:55
created

HtmlDomParser::find()   A

Complexity

Conditions 5
Paths 8

Size

Total Lines 29

Duplication

Lines 29
Ratio 100 %

Code Coverage

Tests 14
CRAP Score 5

Importance

Changes 0
Metric Value
dl 29
loc 29
ccs 14
cts 14
cp 1
rs 9.1448
c 0
b 0
f 0
cc 5
nc 8
nop 2
crap 5
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var string[]
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var bool
50
     */
51
    protected $isDOMDocumentCreatedWithoutHtml = false;
52
53
    /**
54
     * @var bool
55
     */
56
    protected $isDOMDocumentCreatedWithoutWrapper = false;
57
58
    /**
59
     * @var bool
60
     */
61
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
62
63
    /**
64
     * @var bool
65
     */
66
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
67
68
    /**
69
     * @var bool
70
     */
71
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
72
73
    /**
74
     * @var bool
75
     */
76
    protected $keepBrokenHtml;
77
78
    /**
     * Set up a fresh DOMDocument and optionally seed it with content.
     *
     * A SimpleHtmlDomInterface is first unwrapped via getNode(). A \DOMNode is then
     * deep-imported into the new document and appended to it; any other non-null
     * value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        // DOMDocument settings
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // reset
        self::$domBrokenReplaceHelper = [];

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
112
113
    /**
     * Route a call to an undefined instance method through the alias table.
     *
     * The method name is lower-cased and looked up in self::$functionAliases;
     * on a hit, the mapped method is invoked on this instance with the
     * original arguments.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
129
130
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * Both create a new parser instance (late static binding) and forward the
     * first two call arguments, defaulting to '' and null, to loadHtml() or
     * loadHtmlFile() respectively.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $fromString = $name === 'str_get_html';
        $fromFile = $name === 'file_get_html';

        if (!$fromString && !$fromFile) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        $parser = new static();

        if ($fromString) {
            return $parser->loadHtml($source, $libXMLExtraOptions);
        }

        return $parser->loadHtmlFile($source, $libXMLExtraOptions);
    }
159
160
    /** @noinspection MagicMethodsValidityInspection */
161
162
    /**
     * Magic read access to the virtual text/html properties.
     *
     * The property name is matched case-insensitively:
     * "outerHtml"/"outerText" map to html(), "innerHtml"/"innerText" map to
     * innerHtml(), and "text"/"plaintext" map to text().
     *
     * @param string $name
     *
     * @return string|null null when the property name is not recognised
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
185
186
    /**
     * String conversion yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
193
194
    /**
     * No-op kept only for API compatibility; it performs no work.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
205
206
    /**
207
     * Create DOMDocument from HTML.
208
     *
209
     * @param string   $html
210
     * @param int|null $libXMLExtraOptions
211
     *
212
     * @return \DOMDocument
213
     */
214 131
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
215
    {
216 131
        if ($this->keepBrokenHtml) {
217 2
            $html = $this->keepBrokenHtml(\trim($html));
218
        }
219
220 131
        if (\strpos($html, '<') === false) {
221 7
            $this->isDOMDocumentCreatedWithoutHtml = true;
222 129
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
223 5
            $this->isDOMDocumentCreatedWithoutWrapper = true;
224
        }
225
226 131
        if (\strpos($html, '<html') === false) {
227 79
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
228
        }
229
230
        /** @noinspection HtmlRequiredTitleElement */
231 131
        if (\strpos($html, '<head>') === false) {
232 82
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
233
        }
234
235
        if (
236 131
            \strpos($html, '</script>') === false
237
            &&
238 131
            \strpos($html, '<\/script>') !== false
239
        ) {
240 1
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
241
        }
242
243 131
        if (\strpos($html, '<script') !== false) {
244 15
            $this->html5FallbackForScriptTags($html);
245
246
            if (
247 15
                \strpos($html, 'type="text/html"') !== false
248
                ||
249 14
                \strpos($html, 'type=\'text/html\'') !== false
250
                ||
251 15
                \strpos($html, 'type=text/html') !== false
252
            ) {
253 1
                $this->keepSpecialScriptTags($html);
254
            }
255
        }
256
257
        // set error level
258 131
        $internalErrors = \libxml_use_internal_errors(true);
259 131
        $disableEntityLoader = \libxml_disable_entity_loader(true);
260 131
        \libxml_clear_errors();
261
262 131
        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;
263
264 131
        if (\defined('LIBXML_BIGLINES')) {
265 131
            $optionsXml |= \LIBXML_BIGLINES;
266
        }
267
268 131
        if (\defined('LIBXML_COMPACT')) {
269 131
            $optionsXml |= \LIBXML_COMPACT;
270
        }
271
272 131
        if (\defined('LIBXML_HTML_NODEFDTD')) {
273 131
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
274
        }
275
276 131
        if ($libXMLExtraOptions !== null) {
277 1
            $optionsXml |= $libXMLExtraOptions;
278
        }
279
280
        if (
281 131
            $this->isDOMDocumentCreatedWithoutWrapper
282
            ||
283 131
            $this->keepBrokenHtml
284
        ) {
285 6
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
286
        }
287
288 131
        $html = self::replaceToPreserveHtmlEntities($html);
289
290 131
        $documentFound = false;
291 131
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
292 131 View Code Duplication
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
293 47
            $domElementTmp = \dom_import_simplexml($sxe);
294 47
            if ($domElementTmp) {
295 47
                $documentFound = true;
296 47
                $this->document = $domElementTmp->ownerDocument;
297
            }
298
        }
299
300 131 View Code Duplication
        if ($documentFound === false) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
301
302
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
303 89
            $xmlHackUsed = false;
304 89
            if (\stripos('<?xml', $html) !== 0) {
305 89
                $xmlHackUsed = true;
306 89
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
307
            }
308
309 89
            $this->document->loadHTML($html, $optionsXml);
310
311
            // remove the "xml-encoding" hack
312 89
            if ($xmlHackUsed) {
313 89
                foreach ($this->document->childNodes as $child) {
314 89
                    if ($child->nodeType === \XML_PI_NODE) {
315
                        /** @noinspection UnusedFunctionResultInspection */
316 89
                        $this->document->removeChild($child);
317
318 89
                        break;
319
                    }
320
                }
321
            }
322
        }
323
324
        // set encoding
325 131
        $this->document->encoding = $this->getEncoding();
326
327
        // restore lib-xml settings
328 131
        \libxml_clear_errors();
329 131
        \libxml_use_internal_errors($internalErrors);
330 131
        \libxml_disable_entity_loader($disableEntityLoader);
331
332 131
        return $this->document;
333
    }
334
335
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the current
     * document. With $idx === null the whole result list is returned (a blank
     * list object when nothing matched). Otherwise a single element is returned;
     * negative indexes count from the end of the list, and an index with no
     * element yields a blank element object.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for a malformed expression or an
        // invalid context node; treat that as "no matches" instead of iterating it.
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
372
373
    /**
     * Find nodes with a CSS selector.
     *
     * Delegates to find() without an index, i.e. asks for the full result list.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector, null);

        return $matches;
    }
384
385
    /**
     * Find one node with a CSS selector.
     *
     * Delegates to find() with index 0, i.e. asks for the first match.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
396
397
    /**
     * Strip wrapper markup from serialized output, based on the flags recorded
     * while the document was created, then decode entities and restore the
     * preserved placeholders.
     *
     * INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
     *       so we try to remove it here again ...
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // The removals below run in a fixed order; each step works on the
        // result of the previous one.

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $htmlWrapperTags = ['<body>', '</body>', '<html>', '</html>'];
            $content = \str_replace($htmlWrapperTags, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $headWrapperTags = ['<head>', '</head>'];
            $content = \str_replace($headWrapperTags, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextLeftovers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextLeftovers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \str_replace(\array_keys($cleanupMap), \array_values($cleanupMap), $content);
        $content = \trim($content);

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
486
487
    /**
     * Return elements by .class.
     *
     * Builds the CSS selector ".<class>" and delegates to findMulti().
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: "${var}" string interpolation is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
498
499
    /**
     * Return element by #id.
     *
     * Builds the CSS selector "#<id>" and delegates to findOne().
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: "${var}" string interpolation is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
510
511
    /**
     * Return element by tag name.
     *
     * Takes the first item of the document's getElementsByTagName() result;
     * when there is none, a blank element object is returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
528
529
    /**
     * Returns elements by #id.
     *
     * Builds the CSS selector "#<id>" and delegates to find(), passing $idx through.
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${var}" string interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
541
542
    /**
     * Returns elements by tag name.
     *
     * With $idx === null the whole result list is returned (a blank list object
     * when nothing matched). Otherwise a single element is returned; negative
     * indexes count from the end of the list, and an index with no element
     * yields a blank element object.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: the fallback is the blank *element* (not the blank
        // list), matching what find() returns for an indexed lookup
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
577
578
    /**
     * Get dom node's outer html.
     *
     * Invokes the configured callback (if any) with this parser, serializes
     * either the document element only or the whole document depending on
     * getIsDOMDocumentCreatedWithoutHtmlWrapper(), and passes the result
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string empty string when serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
603
604
    /**
605
     * Load HTML from string.
606
     *
607
     * @param string   $html
608
     * @param int|null $libXMLExtraOptions
609
     *
610
     * @return HtmlDomParser
611
     */
612
    public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
613
    {
614 131
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);
615
616 131
        return $this;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this; (voku\helper\HtmlDomParser) is incompatible with the return type declared by the interface voku\helper\DomParserInterface::loadHtml of type self.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
617
    }
618
619
    /**
     * Load HTML from a local file or from an http(s) URL.
     *
     * A path that does not start with "http://" or "https://" must exist on
     * the local filesystem. The content is read through voku\helper\UTF8 when
     * that class is available, otherwise through file_get_contents(), and is
     * then passed on to loadHtml().
     *
     * @param string   $filePath           <p>Local path or http(s) URL.</p>
     * @param int|null $libXMLExtraOptions <p>Forwarded unchanged to loadHtml().</p>
     *
     * @throws \RuntimeException if the local file does not exist or the content could not be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure attached so the root cause stays visible to callers
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // file_get_contents() signals failure with false instead of throwing
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
656
657
    /**
     * Turn the temporary placeholders in the given HTML back into their
     * original text.
     *
     * Placeholders recorded in self::$domBrokenReplaceHelper are restored
     * first. Afterwards the link/entity placeholders are mapped back, the
     * html wrapper helper tag is removed and the special-script helper tag is
     * renamed back to "script".
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built on the first call and reused afterwards
        static $replaceMap = null;

        if ($replaceMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            // str_replace() pairs both lists by position, so they are filled in the same order
            $placeholders = \array_merge(self::$domLinkReplaceHelper['tmp'], self::$domReplaceHelper['tmp']);
            $originals = \array_merge(self::$domLinkReplaceHelper['orig'], self::$domReplaceHelper['orig']);

            // the wrapper tags are replaced by nothing
            $placeholders['html_wrapper__start'] = '<' . $wrapperTag . '>';
            $originals['html_wrapper__start'] = '';
            $placeholders['html_wrapper__end'] = '</' . $wrapperTag . '>';
            $originals['html_wrapper__end'] = '';

            // the helper tag becomes a regular script tag again
            $placeholders['html_special_script__start'] = '<' . $scriptTag;
            $originals['html_special_script__start'] = '<script';
            $placeholders['html_special_script__end'] = '</' . $scriptTag . '>';
            $originals['html_special_script__end'] = '</script>';

            $replaceMap = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? null;
        if ($brokenPlaceholders !== null && \count($brokenPlaceholders) > 0) {
            $html = \str_replace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($replaceMap['tmp'], $replaceMap['orig'], $html);
    }
699
700
    /**
     * Replace the strings listed in self::$domReplaceHelper['orig'] with
     * their temporary placeholders.
     *
     * When the input contains http(s) links, each link is additionally
     * replaced by a copy in which the strings from
     * self::$domLinkReplaceHelper['orig'] were swapped for their placeholders.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // no link can be present: only the generic replacements apply
        if (\strpos($html, 'http') === false) {
            return \str_replace($search, $replace, $html);
        }

        // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
        $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
        $urlMatches = [];
        \preg_match_all($regExUrl, $html, $urlMatches);

        if (empty($urlMatches[1])) {
            return \str_replace($search, $replace, $html);
        }

        $linksOld = $urlMatches[1];

        // with an array subject str_replace() handles every link and returns one result per link
        $linksNew = \str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $linksOld
        );

        // links go first so they are handled before the generic replacements
        return \str_replace(
            \array_merge($linksOld, $search),
            \array_merge($linksNew, $replace),
            $html
        );
    }
740
741
    /**
     * Serialise the current document with DOMDocument::saveXML().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput() or decodeHtmlEntity().</p>
     * @param bool $htmlToXml                <p>true: post-process the output with fixHtmlOutput();
     *                                       false: only decode html entities and restore the placeholders.</p>
     * @param bool $removeXmlHeader          <p>Strip "&lt;?xml ... ?&gt;" and the leading whitespace.</p>
     * @param int  $options                  <p>Passed as options to DOMDocument::saveXML().</p>
     *
     * @return string <p>The serialised document, or an empty string when saveXML() fails.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);

        // saveXML() returns false on failure: answer with an empty string instead of
        // feeding a non-string into the string handling below
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($xml);
    }
773
774
    /**
     * Lets the parser object be called like a function; the call is
     * delegated unchanged to find().
     *
     * @param string $selector <p>Passed as first argument to find().</p>
     * @param int    $idx      <p>Passed as second argument to find(); defaults to null.</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        return $this->find($selector, $idx);
    }
784
785
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * The flag is set outside of this method.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
792
793
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * The flag is set outside of this method.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
800
801
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * The flag is set outside of this method.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
808
809
    /**
     * Read accessor for the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * The flag is set outside of this method.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
816
817
    /**
     * Replace tag fragments that have no matching partner with placeholders
     * and remember the original text in self::$domBrokenReplaceHelper.
     *
     * Step 1 repeatedly masks every element that has both an opening and a
     * matching closing tag by swapping its angle brackets for marker strings.
     * Step 2 repeatedly looks for closing-tag fragments that still use a real
     * "</", stores each one ('orig') together with a crc32 based placeholder
     * ('tmp') and puts the placeholder into the markup.
     * Step 3 turns the remaining markers back into angle brackets.
     *
     * If PCRE reports an error in step 1 or 2, that step stops and the markup
     * produced so far is kept.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        // marker strings standing in for "</", "<" and ">"
        $markers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // step 1: mask properly paired elements
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            // null means PCRE failed (e.g. backtrack limit); casting it to string would wipe the document
            if ($masked === null) {
                break;
            }

            $html = $masked;
        } while ($original !== $html);

        // step 2: swap the fragments that are still unmasked for placeholders
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($markers, $brackets) {
                    // the fragment is stored with real angle brackets
                    $matches['broken'] = \str_replace($markers, $brackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            // same guard as above: keep the last good markup on a PCRE failure
            if ($replaced === null) {
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        // step 3: unmask everything that was masked in step 1
        return \str_replace($markers, $brackets, $html);
    }
867
868
    /**
     * Rename every "<script ... type=text/html ...>...</script>" block in
     * the given markup to the helper tag stored in
     * self::$domHtmlSpecialScriptHelper.
     *
     * The markup is modified in place through the reference; nothing is
     * returned.
     *
     * @param string $html
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $found = [];

        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        \preg_match_all(
            '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU',
            $html,
            $found
        );

        if (!isset($found[0])) {
            return;
        }

        $helperTag = self::$domHtmlSpecialScriptHelper;
        $openLength = \strlen('<script');
        $closeLength = \strlen('</script>');

        foreach ($found[0] as $scriptBlock) {
            // keep everything between "<script" and "</script>" and wrap it in the helper tag
            $renamed = '<' . $helperTag . \substr($scriptBlock, $openLength, -$closeLength) . '</' . $helperTag . '>';

            // remove the html5 fallback
            $renamed = \str_replace('<\/', '</', $renamed);

            $html = \str_replace($scriptBlock, $renamed, $html);
        }
    }
889
890
    /**
     * Store the "keepBrokenHtml" switch on this parser.
     *
     * Returns the parser itself so that calls can be chained.
     *
     * @param bool $keepBrokenHtml <p>New value of $this->keepBrokenHtml.</p>
     *
     * @return HtmlDomParser
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
901
}
902