Completed
Push — master ( 7c71fa...1dceb4 )
by Lars
01:30
created

HtmlDomParser::xml()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 9
ccs 3
cts 3
cp 1
rs 9.9666
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
9
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
10
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
11
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
12
 * @property-read string plaintext <p>Get dom node's plain text.</p>
13
 *
14
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
15
 * @method string outerHtml() <p>Get dom node's outer html.</p>
16
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
17
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
18
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
19
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from file.</p>
20
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from string.</p>
21
 */
22
class HtmlDomParser
23
{
24
    /**
25
     * @var array
26
     */
27
    protected static $functionAliases = [
28
        'outertext' => 'html',
29
        'outerhtml' => 'html',
30
        'innertext' => 'innerHtml',
31
        'innerhtml' => 'innerHtml',
32
        'load'      => 'loadHtml',
33
        'load_file' => 'loadHtmlFile',
34
    ];
35
36
    /**
37
     * @var string[][]
38
     */
39
    protected static $domLinkReplaceHelper = [
40
        'orig' => ['[', ']', '{', '}'],
41
        'tmp'  => [
42
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
43
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
44
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
45
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
46
        ],
47
    ];
48
49
    /**
50
     * @var array
51
     */
52
    protected static $domReplaceHelper = [
53
        'orig' => ['&', '|', '+', '%', '@'],
54
        'tmp'  => [
55
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
56
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
57
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
58
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
59
            '____SIMPLE_HTML_DOM__VOKU__AT____',
60
        ],
61
    ];
62
63
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
64
65
    /**
66
     * @var array
67
     */
68
    protected static $domBrokenReplaceHelper = [];
69
70
    /**
71
     * @var callable
72
     */
73
    protected static $callback;
74
75
    /**
76
     * @var \DOMDocument
77
     */
78
    protected $document;
79
80
    /**
81
     * @var string
82
     */
83
    protected $encoding = 'UTF-8';
84
85
    /**
86
     * @var bool
87
     */
88
    protected $isDOMDocumentCreatedWithoutHtml = false;
89
90
    /**
91
     * @var bool
92
     */
93
    protected $isDOMDocumentCreatedWithoutWrapper = false;
94
95
    /**
96
     * @var bool
97
     */
98
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
99
100
    /**
101
     * @var bool
102
     */
103
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
104
105
    /**
106
     * @var bool
107
     */
108
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
109
110
    /**
111
     * @var bool
112
     */
113
    protected $keepBrokenHtml;
114
115
    /**
     * Build a parser, optionally seeded with markup or with an existing DOM node.
     *
     * A SimpleHtmlDom wrapper is unwrapped through getNode(); a \DOMNode is deep-imported
     * into the fresh document; any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDom|string|null $element HTML code or SimpleHtmlDom, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // reset the shared (static) map of broken-html placeholders
        self::$domBrokenReplaceHelper = [];

        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
153
154
    /**
155
     * @param $name
156
     * @param $arguments
157
     *
158
     * @return bool|mixed
159
     */
160 48 View Code Duplication
    public function __call($name, $arguments)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
161
    {
162 48
        $name = \strtolower($name);
163
164 48
        if (isset(self::$functionAliases[$name])) {
165 47
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
166
        }
167
168 1
        throw new \BadMethodCallException('Method does not exist: ' . $name);
169
    }
170
171
    /**
172
     * @param $name
173
     * @param $arguments
174
     *
175
     * @throws \BadMethodCallException
176
     * @throws \RuntimeException
177
     * @throws \InvalidArgumentException
178
     *
179
     * @return HtmlDomParser
180
     */
181 18
    public static function __callStatic($name, $arguments)
182
    {
183 18
        $arguments0 = '';
184 18
        if (isset($arguments[0])) {
185 17
            $arguments0 = $arguments[0];
186
        }
187
188 18
        $arguments1 = null;
189 18
        if (isset($arguments[1])) {
190 1
            $arguments1 = $arguments[1];
191
        }
192
193 18
        if ($name === 'str_get_html') {
194 13
            $parser = new self();
195
196 13
            return $parser->loadHtml($arguments0, $arguments1);
197
        }
198
199 5
        if ($name === 'file_get_html') {
200 4
            $parser = new self();
201
202 4
            return $parser->loadHtmlFile($arguments0, $arguments1);
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $parser->loadHtml...guments0, $arguments1); (self) is incompatible with the return type documented by voku\helper\HtmlDomParser::__callStatic of type voku\helper\HtmlDomParser.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type f.e. an interface, or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
203
        }
204
205 1
        throw new \BadMethodCallException('Method does not exist');
206
    }
207
208
    /** @noinspection MagicMethodsValidityInspection */
209
210
    /**
     * Magic read access for the virtual properties "outerHtml" / "outerText",
     * "innerHtml" / "innerText" and "text" / "plaintext".
     *
     * The property name is compared case-insensitively.
     *
     * @param string $name
     *
     * @return string|null the rendered value, or null when the name is not one of the above
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
233
234
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx      forwarded unchanged to find()
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
244
245
    /**
     * String conversion yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
252
253
    /**
     * No-op kept only for API compatibility.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
264
265
    /**
     * Mask characters with placeholder tokens before the markup is parsed.
     *
     * The characters listed in $domReplaceHelper['orig'] are always replaced. When the input
     * contains "http", every matched URL is additionally rewritten so that the characters in
     * $domLinkReplaceHelper['orig'] inside it are replaced too.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // default: only the generic character placeholders
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            $urls = empty($matches[1]) ? [] : (array) $matches[1];

            if ($urls !== []) {
                $maskedUrls = [];
                foreach ($urls as $position => $url) {
                    $maskedUrls[$position] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $url
                    );
                }

                // the URL pairs go first so whole links are rewritten before single characters
                $search = \array_merge($urls, $search);
                $replace = \array_merge($maskedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
305
306
    /**
     * Turn the placeholder tokens back into the original characters.
     *
     * Placeholders recorded in $domBrokenReplaceHelper are restored first, then the link and
     * character placeholders; the temporary html-wrapper tags are replaced by an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built on the first call and reused afterwards
        static $restoreMap = null;

        if ($restoreMap === null) {
            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';

            $originals['html_wrapper__start'] = '';
            $originals['html_wrapper__end'] = '';

            $restoreMap = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? [];
        if (\count($brokenPlaceholders) > 0) {
            $html = \str_replace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($restoreMap['tmp'], $restoreMap['orig'], $html);
    }
342
343
    /**
     * Create DOMDocument from HTML.
     *
     * Records on the instance which parts the input lacks (no markup at all, text before the
     * first tag, no "<html", no "<head>", only an escaped "<\/script>"), masks special characters
     * via replaceToPreserveHtmlEntities() and then parses the result: first through SimpleXML and,
     * if that fails or libxml reports errors, through DOMDocument::loadHTML().
     * The libxml settings changed here are restored before returning.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags that are OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml === true) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
        // called (and later restored) on older versions
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper === true
            ||
            $this->keepBrokenHtml === true
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $this->document = \dom_import_simplexml($sxe)->ownerDocument;
        } else {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // The previous check called stripos() with haystack and needle swapped, so an
            // existing XML declaration was never detected. The encoding declaration is now
            // prepended unless the input itself starts with an XML declaration that already
            // names an encoding.
            $xmlHackUsed = false;
            if (\preg_match('/^<\?xml\b[^>]*\bencoding\s*=/i', $html) !== 1) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed === true) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
450
451
    /**
     * Swap broken markup fragments for placeholder tokens.
     *
     * Pass 1 masks the angle brackets of every matched open/close tag pair, repeating until
     * the string no longer changes. Pass 2 replaces each remaining fragment matched by the
     * "broken" pattern with a crc32-based token and records the token/fragment pair in
     * $domBrokenReplaceHelper. Finally the masked brackets are turned back into real ones.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedBrackets = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realBrackets = ['</', '<', '>'];

        // pass 1: hide well-formed tag pairs
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($parts) {
                    $openTag = '°lt_simple_html_dom__voku_°' . $parts['element_start'] . $parts['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $closeTag = '°lt/_simple_html_dom__voku_°' . $parts['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $parts['start'] . $openTag . $parts['value'] . $closeTag . $parts['end'];
                },
                $html
            );
        } while ($before !== $html);

        // pass 2: replace what is still matched as broken with a token and remember the pair
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($parts) use ($maskedBrackets, $realBrackets) {
                    $fragment = \str_replace($maskedBrackets, $realBrackets, $parts['broken']);
                    $token = '____simple_html_dom__voku__broken_html____' . \crc32($fragment);

                    self::$domBrokenReplaceHelper['orig'][] = $fragment;
                    self::$domBrokenReplaceHelper['tmp'][] = $token;

                    return $parts['start'] . $token . $parts['end'];
                },
                $html
            );
        } while ($before !== $html);

        return \str_replace($maskedBrackets, $realBrackets, $html);
    }
501
502
    /**
     * Return element by #id.
     *
     * Delegates to find() with the selector "#<id>" and index 0.
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        // plain concatenation: the "${id}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, 0);
    }
513
514
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node when no element matches
     */
    public function getElementByTagName(string $name)
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($firstMatch);
    }
531
532
    /**
     * Returns elements by #id.
     *
     * Delegates to find() with the selector "#<id>".
     *
     * @param string   $id
     * @param int|null $idx forwarded unchanged to find()
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${id}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
544
545
    /**
546
     * Returns elements by tag name.
547
     *
548
     * @param string   $name
549
     * @param int|null $idx
550
     *
551
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
552
     */
553 View Code Duplication
    public function getElementsByTagName(string $name, $idx = null)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
554
    {
555 3
        $nodesList = $this->document->getElementsByTagName($name);
556
557 3
        $elements = new SimpleHtmlDomNode();
558
559 3
        foreach ($nodesList as $node) {
560 3
            $elements[] = new SimpleHtmlDom($node);
561
        }
562
563
        // return all elements
564 3
        if ($idx === null) {
565 2
            return $elements;
566
        }
567
568
        // handle negative values
569 1
        if ($idx < 0) {
570
            $idx = \count($elements) + $idx;
571
        }
572
573
        // return one element
574 1
        return $elements[$idx] ?? new SimpleHtmlDomNodeBlank();
575
    }
576
577
    /**
     * Find one node with a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
588
589
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath by SelectorConverter and evaluated against the
     * current document.
     *
     * @param string   $selector
     * @param int|null $idx      null returns every match; a negative value counts from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     *                                                                  a blank node when $idx does not exist
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for a malformed expression; treat that as "no matches"
        // instead of iterating over a boolean
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomNodeBlank();
    }
622
623
    /**
     * Clean up serialized markup before it is handed back to the caller.
     *
     * Depending on the flags recorded while the document was created, wrapper tags are stripped
     * again; afterwards HTML entities and URL encoding are decoded and the placeholder tokens
     * are restored through putReplacedBackToPreserveHtmlEntities().
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity decode repeatedly instead of once
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // collected in the same order in which the separate replacements used to run
        $stripList = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
            /** @noinspection HtmlRequiredLangAttribute */
            $stripList = [
                "\n",
                "\r\n",
                "\r",
                '<body>',
                '</body>',
                '<html>',
                '</html>',
            ];
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
            /** @noinspection HtmlRequiredTitleElement */
            $stripList[] = '<head>';
            $stripList[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript === true) {
            $stripList[] = '</script>';
        }

        if ($stripList !== []) {
            $content = \str_replace($stripList, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml === true) {
            $plainTextLeftovers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextLeftovers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $artifacts = [
            '<simpleHtmlDomP>',
            '</simpleHtmlDomP>',
            '<head><head>',
            '</head></head>',
            '<br></br>',
        ];
        $artifactFixes = [
            '',
            '',
            '<head>',
            '</head>',
            '<br>',
        ];
        $content = \trim(\str_replace($artifacts, $artifactFixes, $content));

        if ($multiDecodeNewHtmlEntity !== true) {
            $content = \rawurldecode(
                \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5)
            );
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // decode until another round no longer changes the string
            do {
                $previous = $content;

                $content = \rawurldecode(
                    \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5)
                );
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
739
740
    /**
     * Expose the underlying DOMDocument instance.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
747
748
    /**
     * Encoding used for the document (value of the $encoding property).
     *
     * @return string
     */
    private function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
757
758
    /**
     * Whether the loaded input contained no "<" at all (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtml;

        return $flag;
    }
765
766
    /**
     * Whether the loaded input contained no "<html" (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $flag;
    }
773
774
    /**
     * Whether the loaded input contained no "<head>" (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $flag;
    }
781
782
    /**
     * Whether the loaded input had text in front of its first tag
     * (flag set in createDOMDocument()).
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutWrapper;

        return $flag;
    }
789
790
    /**
     * Get dom node's outer html.
     *
     * A callback registered through set_callback() is invoked first. When the input had no
     * "<html" wrapper only the document element is serialized, otherwise the whole document.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
811
812
    /**
     * Switch the "keep broken html" mode on or off; createDOMDocument() reads this flag.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
823
824
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, the "<?xml ... ?>" header is removed
     * and the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // remove the XML-header
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);

        return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
    }
840
841
    /**
     * Get dom node's inner html.
     *
     * Serializes every child of the document element and passes the concatenated markup
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // a document that has no content yet has no document element; skip the loop
        // instead of reading "childNodes" from null
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
859
860
    /**
     * Load HTML from string.
     *
     * Replaces the current document with the one built by createDOMDocument().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser the same instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $parsedDocument;

        return $this;
    }
876
877
    /**
     * Load HTML from file.
     *
     * Paths that start with "http://" or "https://" skip the file_exists() check.
     * The content is read with UTF8::file_get_contents() when that class is available,
     * otherwise with \file_get_contents(), and then passed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        $isHttpUrl = \preg_match("/^https?:\/\//i", $filePath) === 1;

        if (!$isHttpUrl && !\file_exists($filePath)) {
            // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure reachable through getPrevious()
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
915
916
    /**
     * Return the inner html and, when a path is given, also write it to that file.
     *
     * @param string $filepath target file; an empty string skips writing
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->innerHtml();

        if ($filepath === '') {
            return $markup;
        }

        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
932
933
    /**
     * Register the callback that html() invokes with [$this] as its single argument.
     *
     * The callback is stored in a static property.
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        $this::$callback = $callback;
    }
940
941
    /**
     * Get dom node's plain text.
     *
     * Uses the document's textContent, passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
952
953
    /**
     * Give the cloned parser its own copy of the DOMDocument.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
957
}
958