Completed
Push — master ( bfaa7e...162bcd )
by Lars
04:53 queued 11s
created

getIsDOMDocumentCreatedWithoutHtmlWrapper()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 1
cts 1
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
9
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
10
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
11
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
12
 * @property-read string plaintext <p>Get dom node's plain text.</p>
13
 *
14
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
15
 * @method string outerHtml() <p>Get dom node's outer html.</p>
16
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
17
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
18
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
19
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from file.</p>
20
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from string.</p>
21
 */
22
class HtmlDomParser
23
{
24
    /**
     * Lower-cased alias names accepted by __call(), mapped to the method that is invoked instead.
     *
     * @var string[]
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Characters masked inside detected links ("orig") and the placeholder used for each one ("tmp").
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Characters masked in the whole input ("orig") and the placeholder used for each one ("tmp").
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
        ],
    ];

    /**
     * Tag name of the temporary element that createDOMDocument() wraps around some inputs.
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Broken HTML fragments ("orig") and their placeholders ("tmp"); filled by keepBrokenHtml()
     * and emptied again by every constructor call.
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Callback invoked by html() before serializing; assigned through set_callback().
     *
     * @var callable|null
     */
    protected static $callback;

    /**
     * The wrapped DOM document.
     *
     * @var \DOMDocument
     */
    protected $document;

    /**
     * Encoding used when creating and loading the document.
     *
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Set by createDOMDocument() when the input contained no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by createDOMDocument() when the input contained "<" but did not start with it.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by createDOMDocument() when the input contained no "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by createDOMDocument() when the input contained no "<html".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Whether broken HTML should be preserved; stays null until useKeepBrokenHtml() is called.
     *
     * @var bool|null
     */
    protected $keepBrokenHtml;
109
110
    /**
     * Create a parser, optionally seeded with an existing node or an HTML string.
     *
     * @param \DOMNode|SimpleHtmlDom|string|null $element a node to import (deep copy), a SimpleHtmlDom
     *                                                    whose node is imported, or HTML code to load
     */
    public function __construct($element = null)
    {
        // every new parser starts with an empty registry of broken-html fragments
        self::$domBrokenReplaceHelper = [];

        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // unwrap our own element type to the underlying DOM node
        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            // anything that is not a node is handed to the HTML loader
            $this->loadHtml($source);

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            $this->document->appendChild($imported);
        }
    }
146
147
    /**
     * Forward calls to legacy method names (see $functionAliases) to the real method.
     *
     * The lookup is case-insensitive because the name is lower-cased first.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, self::$functionAliases[$alias]], $arguments);
    }
163
164
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The first argument (default: empty string) is the HTML or the file path, the second
     * argument (default: null) is passed on as extra libxml options.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if the name is neither "str_get_html" nor "file_get_html"
     * @throws \RuntimeException       from loadHtmlFile()
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($name !== 'str_get_html' && $name !== 'file_get_html') {
            throw new \BadMethodCallException('Method does not exist');
        }

        $parser = new self();

        if ($name === 'str_get_html') {
            return $parser->loadHtml($source, $libXMLExtraOptions);
        }

        return $parser->loadHtmlFile($source, $libXMLExtraOptions);
    }
200
201
    /** @noinspection MagicMethodsValidityInspection */
202
203
    /**
     * Magic read access for the legacy properties (case-insensitive):
     * outerHtml/outerText, innerHtml/innerText and text/plaintext.
     *
     * @param string $name
     *
     * @return string|null the requested markup/text, or null for an unknown property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
226
227
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
237
238
    /**
     * String conversion yields the same value as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
245
246
    /**
     * No-op kept only for API compatibility.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        // intentionally nothing to clean up
        return true;
    }
257
258
    /**
     * Mask special characters with placeholders before the HTML is parsed.
     *
     * The characters from $domReplaceHelper are always masked. If the input contains "http",
     * every link matched by the URL pattern is additionally replaced by a copy of itself in
     * which the characters from $domLinkReplaceHelper are masked.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // cheap pre-check: without "http" there cannot be a link to protect
        if (\strpos($html, 'http') === false) {
            return \str_replace($search, $replace, $html);
        }

        // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
        $urlPattern = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
        $matches = [];
        \preg_match_all($urlPattern, $html, $matches);

        if (empty($matches[1])) {
            return \str_replace($search, $replace, $html);
        }

        $links = $matches[1];
        // str_replace() on an array masks every link and keeps the keys
        $maskedLinks = \str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $links
        );

        // links are replaced first, the single characters afterwards
        return \str_replace(
            \array_merge($links, $search),
            \array_merge($maskedLinks, $replace),
            $html
        );
    }
298
299
    /**
     * Undo the masking: restore broken-html fragments, replace all placeholders by their
     * original characters and strip the temporary html-wrapper tags.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // the placeholder tables never change, so they are built once per request
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            // the wrapper tags are replaced by nothing, i.e. they are removed
            $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
            $originals['html_wrapper__start'] = '';
            $originals['html_wrapper__end'] = '';

            $DOM_REPLACE__HELPER_CACHE = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_replace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_replace(
            $DOM_REPLACE__HELPER_CACHE['tmp'],
            $DOM_REPLACE__HELPER_CACHE['orig'],
            $html
        );
    }
335
336
    /**
     * Create DOMDocument from HTML.
     *
     * Records what the input looked like (no markup at all, no leading tag, no "<html",
     * no "<head>") in the isDOMDocumentCreatedWithout* flags, masks special characters and
     * then tries to parse the input as XML first; if that fails or reports errors, the
     * HTML parser of the current document is used instead.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions additional LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml === true) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // set error level
        // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 - confirm the supported PHP range
        $internalErrors = \libxml_use_internal_errors(true);
        $disableEntityLoader = \libxml_disable_entity_loader(true);
        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutWrapper === true
                ||
                $this->keepBrokenHtml === true
            ) {
                // give the fragment a single root element; it is stripped again on output
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $this->document = \dom_import_simplexml($sxe)->ownerDocument;
            } else {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                //
                // The haystack has to be the HTML (it used to be swapped with the needle):
                // the declaration is only prepended if the input does not already start with one.
                $xmlHackUsed = false;
                if (\stripos($html, '<?xml') !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack
                if ($xmlHackUsed === true) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings, even if parsing threw
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
434
435
    /**
     * Replace broken closing-tag fragments by placeholders so that they survive parsing.
     *
     * Step 1 masks the angle brackets of every element whose opening tag has a matching
     * closing tag. Step 2 swaps the remaining closing-tag fragments for a crc32-based
     * placeholder and records both in $domBrokenReplaceHelper. Finally the masked brackets
     * from step 1 are turned back into real ones.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedClosingStart = '°lt/_simple_html_dom__voku_°';
        $maskedStart = '°lt_simple_html_dom__voku_°';
        $maskedEnd = '°gt_simple_html_dom__voku_°';

        $masks = [$maskedClosingStart, $maskedStart, $maskedEnd];
        $brackets = ['</', '<', '>'];

        $maskBalancedElement = function ($matches) use ($maskedClosingStart, $maskedStart, $maskedEnd) {
            $openingTag = $maskedStart . $matches['element_start'] . $matches['element_start_addon'] . $maskedEnd;
            $closingTag = $maskedClosingStart . $matches['element_end'] . $maskedEnd;

            return $matches['start'] . $openingTag . $matches['value'] . $closingTag . $matches['end'];
        };

        // each pass masks one element, so repeat until nothing changes anymore
        do {
            $previous = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskBalancedElement,
                $html
            );
        } while ($previous !== $html);

        $storeBrokenFragment = function ($matches) use ($masks, $brackets) {
            // the fragment is stored with real brackets
            $fragment = \str_replace($masks, $brackets, $matches['broken']);
            $placeholder = '____simple_html_dom__voku__broken_html____' . \crc32($fragment);

            self::$domBrokenReplaceHelper['orig'][] = $fragment;
            self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

            return $matches['start'] . $placeholder . $matches['end'];
        };

        do {
            $previous = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $storeBrokenFragment,
                $html
            );
        } while ($previous !== $html);

        return \str_replace($masks, $brackets, $html);
    }
485
486
    /**
     * Return element by #id.
     *
     * Builds the selector "#<id>" and returns the first match of find().
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->find('#' . $id, 0);
    }
497
498
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if no such element exists
     */
    public function getElementByTagName(string $name)
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($firstMatch);
    }
515
516
    /**
     * Returns elements by #id.
     *
     * Builds the selector "#<id>" and delegates to find().
     *
     * @param string   $id
     * @param int|null $idx passed on to find(): null for all matches, otherwise the index of one match
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${var}" interpolation form is deprecated since PHP 8.2
        return $this->find('#' . $id, $idx);
    }
528
529
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx null returns every match; an index returns only that match,
     *                      negative values count from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        if ($idx === null) {
            return $elements;
        }

        // a negative index is relative to the end of the list
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // fall back to a blank element if there is nothing at that position
        return isset($elements[$position])
            ? $elements[$position]
            : new SimpleHtmlDomNodeBlank();
    }
565
566
    /**
     * Find the first node that matches a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
577
578
    /**
     * Find nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the document.
     *
     * @param string   $selector
     * @param int|null $idx null returns every match; an index returns only that match,
     *                      negative values count from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($query);

        $elements = new SimpleHtmlDomNode();
        foreach ($matches as $match) {
            $elements[] = new SimpleHtmlDom($match);
        }

        if ($idx === null) {
            return $elements;
        }

        // a negative index is relative to the end of the list
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // fall back to a blank element if there is nothing at that position
        return isset($elements[$position])
            ? $elements[$position]
            : new SimpleHtmlDomNodeBlank();
    }
616
617
    /**
     * Clean up serialized output: drop the wrappers that were not part of the input,
     * decode entities / url-encoding and put the masked characters back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity true: decode repeatedly (via voku\helper\UTF8 if that
     *                                         class exists, otherwise until the string stops changing);
     *                                         false: decode exactly once
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
            /** @noinspection HtmlRequiredLangAttribute */
            $htmlWrapperParts = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
            $content = \str_replace($htmlWrapperParts, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
            // only a leading "<p>" is removed, but every "</p>"
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml === true) {
            $plainTextWrapperParts = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextWrapperParts, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $search = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
        $replace = ['', '', '<head>', '</head>', '<br>'];
        $content = \trim(\str_replace($search, $replace, $content));

        if ($multiDecodeNewHtmlEntity !== true) {
            $content = \rawurldecode(
                \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5)
            );
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // decode until a pass no longer changes anything
            do {
                $content_compare = $content;

                $content = \rawurldecode(
                    \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5)
                );
            } while ($content_compare !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
725
726
    /**
     * Get the underlying DOM document (the instance itself, not a copy).
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }
733
734
    /**
     * Get the encoding that is used for the document.
     *
     * @return string
     */
    private function getEncoding(): string
    {
        return $this->encoding;
    }
743
744
    /**
     * Whether a loaded input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
751
752
    /**
     * Whether a loaded input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
759
760
    /**
     * Whether a loaded input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
767
768
    /**
     * Whether a loaded input contained "<" but did not start with it.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
775
776
    /**
     * Get dom node's outer html.
     *
     * If a callback was registered it is called first, with an array containing this parser.
     * Inputs without "<html" are serialized from the document element only.
     *
     * @param bool $multiDecodeNewHtmlEntity passed on to fixHtmlOutput()
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        $callback = $this::$callback;
        if ($callback !== null) {
            \call_user_func($callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
797
798
    /**
     * Switch the "keep broken html" handling of createDOMDocument() on or off.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
809
810
    /**
     * Get the HTML as XML (empty tags are not collapsed, the XML header is removed).
     *
     * @param bool $multiDecodeNewHtmlEntity passed on to fixHtmlOutput()
     *
     * @return string
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // remove the XML-header
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);

        return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
    }
826
827
    /**
     * Get dom node's inner html.
     *
     * Serializes every child of the document element. A document without a document
     * element (e.g. a parser that has not loaded anything yet) yields the cleaned-up
     * empty string instead of dereferencing null.
     *
     * @param bool $multiDecodeNewHtmlEntity passed on to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
845
846
    /**
     * Load HTML from string; the parsed result becomes the document of this parser.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions additional LIBXML_* flags
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
862
863
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" are not checked for existence; every other
     * path must exist. The content is read via voku\helper\UTF8 if that class is available,
     * otherwise via file_get_contents().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions additional LIBXML_* flags
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure reachable via getPrevious()
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
901
902
    /**
     * Get the inner html as string and, if a path is given, also write it to that file
     * (with an exclusive lock).
     *
     * @param string $filepath target file; the empty string skips the writing
     *
     * @return string the inner html
     */
    public function save(string $filepath = ''): string
    {
        $html = $this->innerHtml();

        if ($filepath === '') {
            return $html;
        }

        \file_put_contents($filepath, $html, \LOCK_EX);

        return $html;
    }
918
919
    /**
     * Register the callback that html() invokes before serializing.
     *
     * The callback is stored in a static property, so it is shared between instances.
     *
     * @param callable|null $functionName
     */
    public function set_callback($functionName)
    {
        $this::$callback = $functionName;
    }
926
927
    /**
     * Get dom node's plain text (the text content of the document, cleaned by fixHtmlOutput()).
     *
     * @param bool $multiDecodeNewHtmlEntity passed on to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
938
939
    /**
     * Give the cloned parser its own copy of the document, so that the clone and the
     * original do not share one DOMDocument instance.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;
        $this->document = $documentCopy;
    }
943
}
944