Completed
Push — master ( 3e96cf...809b01 )
by Lars
01:35
created

HtmlDomParser::innerHtml()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 11
ccs 5
cts 5
cp 1
rs 9.9
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                            <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                            <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                            <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                   <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                        <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                                                               <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                                                              <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser
35
{
36
    /**
37
     * @var array
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[][]
50
     */
51
    protected static $domLinkReplaceHelper = [
52
        'orig' => ['[', ']', '{', '}'],
53
        'tmp'  => [
54
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
55
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
56
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
57
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
58
        ],
59
    ];
60
61
    /**
62
     * @var array
63
     */
64
    protected static $domReplaceHelper = [
65
        'orig' => ['&', '|', '+', '%', '@'],
66
        'tmp'  => [
67
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
68
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
69
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
70
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
71
            '____SIMPLE_HTML_DOM__VOKU__AT____',
72
        ],
73
    ];
74
75
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
76
77
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
78
79
    /**
80
     * @var array
81
     */
82
    protected static $domBrokenReplaceHelper = [];
83
84
    /**
85
     * @var callable
86
     */
87
    protected static $callback;
88
89
    /**
90
     * @var \DOMDocument
91
     */
92
    protected $document;
93
94
    /**
95
     * @var string
96
     */
97
    protected $encoding = 'UTF-8';
98
99
    /**
100
     * @var bool
101
     */
102
    protected $isDOMDocumentCreatedWithoutHtml = false;
103
104
    /**
105
     * @var bool
106
     */
107
    protected $isDOMDocumentCreatedWithoutWrapper = false;
108
109
    /**
110
     * @var bool
111
     */
112
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
113
114
    /**
115
     * @var bool
116
     */
117
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
118
119
    /**
120
     * @var bool
121
     */
122
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
123
124
    /**
125
     * @var bool
126
     */
127
    protected $keepBrokenHtml;
128
129
    /**
     * Build a parser around a fresh \DOMDocument and optionally fill it.
     *
     * - A SimpleHtmlDom is unwrapped via getNode().
     * - A \DOMNode is deep-imported into the new document and appended to it.
     * - Any other non-null value is passed on to loadHtml().
     *
     * The static "broken html" replacement table is reset on every construction.
     *
     * @param \DOMNode|SimpleHtmlDom|string $element HTML code or SimpleHtmlDom, \DOMNode
     *
     * @throws \InvalidArgumentException
     */
    public function __construct($element = null)
    {
        // DOMDocument settings
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // reset
        self::$domBrokenReplaceHelper = [];

        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
167
168
    /**
169
     * @param string $name
170
     * @param array  $arguments
171
     *
172
     * @return bool|mixed
173
     */
174 50 View Code Duplication
    public function __call($name, $arguments)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
175
    {
176 50
        $name = \strtolower($name);
177
178 50
        if (isset(self::$functionAliases[$name])) {
179 49
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
180
        }
181
182 1
        throw new \BadMethodCallException('Method does not exist: ' . $name);
183
    }
184
185
    /**
186
     * @param string $name
187
     * @param array  $arguments
188
     *
189
     * @throws \BadMethodCallException
190
     * @throws \RuntimeException
191
     * @throws \InvalidArgumentException
192
     *
193
     * @return HtmlDomParser
194
     */
195 19
    public static function __callStatic($name, $arguments)
196
    {
197 19
        $arguments0 = $arguments[0] ?? '';
198
199 19
        $arguments1 = $arguments[1] ?? null;
200
201 19
        if ($name === 'str_get_html') {
202 14
            $parser = new self();
203
204 14
            return $parser->loadHtml($arguments0, $arguments1);
205
        }
206
207 5
        if ($name === 'file_get_html') {
208 4
            $parser = new self();
209
210 4
            return $parser->loadHtmlFile($arguments0, $arguments1);
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $parser->loadHtml...guments0, $arguments1); (self) is incompatible with the return type documented by voku\helper\HtmlDomParser::__callStatic of type voku\helper\HtmlDomParser.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
211
        }
212
213 1
        throw new \BadMethodCallException('Method does not exist');
214
    }
215
216
    /** @noinspection MagicMethodsValidityInspection */
217
218
    /**
     * Magic read access for the virtual html/text properties.
     *
     * The property name is matched case-insensitively:
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" map to
     * innerHtml() and "text"/"plaintext" map to text().
     *
     * @param string $name
     *
     * @return null|string <p>null for every other property name</p>
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if (\in_array($property, ['outerhtml', 'outertext'], true)) {
            return $this->html();
        }

        if (\in_array($property, ['innerhtml', 'innertext'], true)) {
            return $this->innerHtml();
        }

        if (\in_array($property, ['text', 'plaintext'], true)) {
            return $this->text();
        }

        return null;
    }
241
242
    /**
     * Let the parser object be called like a function; forwards to find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
252
253
    /**
     * String conversion: yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
260
261
    /**
     * No-op that is only kept for API compatibility.
     *
     * @deprecated
     *
     * @return bool <p>always true</p>
     */
    public function clear(): bool
    {
        $cleared = true;

        return $cleared;
    }
272
273
    /**
     * Swap special characters for placeholder tokens.
     *
     * The characters listed in $domReplaceHelper['orig'] are always replaced by
     * their tokens. If the input contains "http", every URL found by the regex
     * is additionally replaced by a copy in which the characters from
     * $domLinkReplaceHelper['orig'] are tokenized; those URL replacements run
     * before the generic character replacements.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // default: only the generic character placeholders
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            $urlMatches = [];
            \preg_match_all($regExUrl, $html, $urlMatches);

            if (!empty($urlMatches[1])) {
                $linksOld = (array) $urlMatches[1];

                // str_replace() on an array subject rewrites every link and keeps the keys
                $linksNew = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $linksOld
                );

                // links first, so that their brackets are tokenized before "&", "%", ... are
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
313
314
    /**
     * Turn the placeholder tokens back into their original characters.
     *
     * First the per-document "broken html" tokens (if any were recorded) are
     * restored, then the link/character tokens, the html-wrapper tags (which
     * are removed) and the special-script tags (which become "<script" and
     * "</script>" again).
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built once per request; str_replace() pairs both lists by position,
        // so "tmp" and "orig" must keep the same element order
        static $replaceMap = null;

        if ($replaceMap === null) {
            $replaceMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . self::$domHtmlWrapperHelper . '>',
                        'html_wrapper__end'          => '</' . self::$domHtmlWrapperHelper . '>',
                        'html_special_script__start' => '<' . self::$domHtmlSpecialScriptHelper,
                        'html_special_script__end'   => '</' . self::$domHtmlSpecialScriptHelper . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_replace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_replace($replaceMap['tmp'], $replaceMap['orig'], $html);
    }
356
357
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * 1. optionally mask broken html (see keepBrokenHtml()),
     * 2. record what the input is missing (any tag, a leading tag, "<html",
     *    "<head>", a real "</script>") so the output can be cleaned up later,
     * 3. pre-process script tags,
     * 4. tokenize special characters,
     * 5. try to parse as XML first and fall back to DOMDocument::loadHTML().
     *
     * The libxml error / entity-loader settings are changed for the duration
     * of the parsing and restored afterwards.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions <p>extra LIBXML_* flags, OR-ed into the default options</p>
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // no tag at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // there are tags, but the input does not start with one
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // only escaped end-script-tags, no real one
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        $disableEntityLoader = \libxml_disable_entity_loader(true);
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            // the helper tag is removed again by putReplacedBackToPreserveHtmlEntities()
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: strict XML parsing, only accepted if libxml reported no error
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // Only prepend the declaration if the markup does not already start
            // with one. stripos() takes the haystack first and the needle second.
            $xmlHackUsed = false;
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        \libxml_disable_entity_loader($disableEntityLoader);

        return $this->document;
    }
485
486
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every script element so that each "</" inside its content
     * becomes "<\/". The string is modified in place.
     *
     * @param string $html
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escapeClosingTags = static function ($scripts) {
            $escapedContent = \str_replace('</', '<\/', $scripts['content']);

            return '<script' . $scripts['attr'] . '>' . $escapedContent . '</script>';
        };

        $html = \preg_replace_callback($regExSpecialScript, $escapeClosingTags, $html);
    }
500
501
    /**
     * Rename script elements of type "text/html" to a helper tag.
     *
     * Each matching "<script ...>...</script>" is replaced by the same markup
     * with $domHtmlSpecialScriptHelper as the tag name, and the "<\/" escaping
     * added by the html5 fallback is reverted inside it. The string is
     * modified in place.
     *
     * @param string $html
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $regExSpecialScript = '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU';

        $found = [];
        \preg_match_all($regExSpecialScript, $html, $found);

        $openTag = '<' . self::$domHtmlSpecialScriptHelper;
        $closeTag = '</' . self::$domHtmlSpecialScriptHelper . '>';

        foreach ($found[0] ?? [] as $specialScript) {
            // everything between the leading "<script" and the trailing "</script>"
            $middle = \substr($specialScript, \strlen('<script'), -\strlen('</script>'));

            // remove the html5 fallback
            $specialNonScript = \str_replace('<\/', '</', $openTag . $middle . $closeTag);

            $html = \str_replace($specialScript, $specialNonScript, $html);
        }
    }
523
524
    /**
     * Replace broken closing-tag fragments with tokens that survive parsing.
     *
     * Pass 1 masks the angle brackets of every matched start/end tag pair with
     * placeholders, repeated until nothing changes. Pass 2 replaces each
     * fragment still matched by the "broken" pattern with a crc32-based token
     * and records token and fragment in the static $domBrokenReplaceHelper.
     * Finally the masked brackets are restored.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $ltMask = '°lt_simple_html_dom__voku_°';
        $ltSlashMask = '°lt/_simple_html_dom__voku_°';
        $gtMask = '°gt_simple_html_dom__voku_°';

        $masks = [$ltSlashMask, $ltMask, $gtMask];
        $brackets = ['</', '<', '>'];

        $maskTagPair = static function ($matches) use ($ltMask, $ltSlashMask, $gtMask) {
            $startTag = $ltMask . $matches['element_start'] . $matches['element_start_addon'] . $gtMask;
            $endTag = $ltSlashMask . $matches['element_end'] . $gtMask;

            return $matches['start'] . $startTag . $matches['value'] . $endTag . $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskTagPair,
                $html
            );
        } while ($before !== $html);

        $tokenizeBroken = static function ($matches) use ($masks, $brackets) {
            $broken = \str_replace($masks, $brackets, $matches['broken']);
            $token = '____simple_html_dom__voku__broken_html____' . \crc32($broken);

            self::$domBrokenReplaceHelper['orig'][] = $broken;
            self::$domBrokenReplaceHelper['tmp'][] = $token;

            return $matches['start'] . $token . $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $tokenizeBroken,
                $html
            );
        } while ($before !== $html);

        return \str_replace($masks, $brackets, $html);
    }
574
575
    /**
     * Return element by #id.
     *
     * Builds the selector "#<id>" and delegates to findOne().
     *
     * @param string $id
     *
     * @return SimpleHtmlDom
     */
    public function getElementById(string $id): SimpleHtmlDom
    {
        // plain concatenation: the "${id}" interpolation form is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
586
587
    /**
     * Returns elements by #id.
     *
     * Builds the selector "#<id>" and delegates to find().
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: the "${id}" interpolation form is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
599
600
    /**
     * Return elements by .class.
     *
     * Builds the selector ".<class>" and delegates to findMulti().
     *
     * @param string $class
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class)
    {
        // plain concatenation: the "${class}" interpolation form is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
611
612
    /**
     * Return element by tag name.
     *
     * Only the first node with that tag name is wrapped and returned; if there
     * is none, a blank node object is returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementByTagName(string $name)
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode !== null
            ? new SimpleHtmlDom($firstNode)
            : new SimpleHtmlDomNodeBlank();
    }
629
630
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole collection is returned. With an index only
     * that element is returned; a negative index counts from the end of the
     * collection, and a missing position yields a blank node object.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $collection = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $collection[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return $collection;
        }

        // handle negative values
        $position = $idx < 0 ? \count($collection) + $idx : $idx;

        // return one element
        return $collection[$position] ?? new SimpleHtmlDomNodeBlank();
    }
661
662
    /**
     * Find one node with a CSS selector.
     *
     * Shortcut for find() with index 0.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom
     */
    public function findOne(string $selector): SimpleHtmlDom
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
673
674
    /**
     * Find nodes with a CSS selector.
     *
     * Shortcut for find() without an index, i.e. all matches are returned.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector)
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
685
686
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the document.
     * Without an index the whole collection is returned. With an index only
     * that element is returned; a negative index counts from the end of the
     * collection, and a missing position yields a blank node object.
     *
     * @param string   $selector
     * @param null|int $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for an expression it cannot evaluate;
        // treat that as "no match" instead of passing false to foreach
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomNodeBlank();
    }
719
720
    /**
     * Clean up serialized markup so that it resembles the original input.
     *
     * Removes the wrapper tags that were not part of the input (according to
     * the flags recorded while the document was created), drops helper tags,
     * decodes html entities and url-encoding, and finally restores the
     * placeholder tokens.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity <p>decode repeatedly instead of once</p>
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // collected in the same order in which they are removed one after another
        $wrapperTags = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $wrapperTags[] = '<body>';
            $wrapperTags[] = '</body>';
            $wrapperTags[] = '<html>';
            $wrapperTags[] = '</html>';
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $wrapperTags[] = '<head>';
            $wrapperTags[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $wrapperTags[] = '</script>';
        }

        $content = \str_replace($wrapperTags, '', $content);

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextLeftovers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextLeftovers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $helperSearch = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
        $helperReplace = ['', '', '<head>', '</head>', '<br>'];
        $content = \trim(\str_replace($helperSearch, $helperReplace, $content));

        $decodeOnce = static function (string $value): string {
            return \rawurldecode(\html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5));
        };

        if (!$multiDecodeNewHtmlEntity) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // decode until the string does not change anymore
            do {
                $previous = $content;
                $content = $decodeOnce($previous);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
833
834
    /**
     * Get the underlying DOM document.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
841
842
    /**
     * Get the encoding to use.
     *
     * @return string <p>value of the "encoding" property</p>
     */
    private function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
851
852
    /**
     * Whether the "created without html" flag is set.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtml;

        return $flag;
    }
859
860
    /**
     * Read the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * html() consults this flag to decide whether to serialize only the
     * document element or the complete document.
     *
     * @return bool the current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
867
868
    /**
     * Read the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): presumably true when the loaded input had no surrounding
     * head element - confirm where the flag is assigned.
     *
     * @return bool the current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
875
876
    /**
     * Read the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): presumably true when the loaded input had no single
     * enclosing element - confirm where the flag is assigned.
     *
     * @return bool the current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
883
884
    /**
     * Serialize the document to HTML (the node's outer html).
     *
     * If a callback is registered it is invoked first, receiving one argument:
     * an array that contains this parser. Afterwards either the document
     * element alone or the complete document is serialized, depending on
     * getIsDOMDocumentCreatedWithoutHtmlWrapper().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string the result of fixHtmlOutput(), or an empty string when
     *                saveHTML() reports failure
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        // saveHTML() yields false on failure; report that as "no output".
        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
909
910
    /**
     * Set the "keepBrokenHtml" option on this parser.
     *
     * @param bool $keepBrokenHtml new value for the option
     *
     * @return HtmlDomParser this instance, so calls can be chained
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
921
922
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, every "<?xml ... ?>"
     * match is removed, leading whitespace is trimmed and the result is passed
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string the cleaned XML, or an empty string when the document
     *                could not be serialized
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $xml = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // saveXML() signals failure with false; with strict_types enabled that
        // value must not reach preg_replace(). Mirror html() and return ''.
        if ($xml === false) {
            return '';
        }

        // remove the XML-header
        $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }
938
939
    /**
     * Get dom node's inner html.
     *
     * Concatenates the serialized HTML of every direct child of the document
     * element and passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // A document without a root element has a null documentElement; skip
        // the loop in that case instead of dereferencing null.
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                // saveHTML() may return false; the cast keeps that as ''.
                $text .= (string) $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
957
958
    /**
     * Load HTML from string.
     *
     * Builds a new document via createDOMDocument() and makes it the document
     * of this parser, replacing the previous one.
     *
     * @param string   $html               markup to parse
     * @param int|null $libXMLExtraOptions passed through to createDOMDocument()
     *
     * @throws \InvalidArgumentException
     *
     * @return HtmlDomParser this instance, so calls can be chained
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $freshDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $freshDocument;

        return $this;
    }
974
975
    /**
     * Load HTML from file.
     *
     * Reads the file (or http/https URL) and hands its content to loadHtml().
     * When the \voku\helper\UTF8 class is available its file_get_contents()
     * is used, otherwise the native function.
     *
     * @param string   $filePath           local path or http(s) URL
     * @param int|null $libXMLExtraOptions passed through to loadHtml()
     *
     * @throws \RuntimeException         if a local file does not exist or the
     *                                   content could not be read
     * @throws \InvalidArgumentException
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        // Only non-http(s) paths are checked with file_exists(); URLs go
        // straight to the reader below.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Keep the original failure reachable via getPrevious().
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
1013
1014
    /**
     * Return the inner html of the document and optionally write it to a file.
     *
     * With a non-empty $filepath the markup is additionally written to that
     * path using an exclusive lock. The result of the write is not checked.
     *
     * @param string $filepath target file; an empty string skips the write
     *
     * @return string the markup produced by innerHtml()
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->innerHtml();

        if ($filepath === '') {
            return $markup;
        }

        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
1030
1031
    /**
     * Register the callback stored in the static "callback" property.
     *
     * The property is static, so the value is not tied to this instance.
     * html() invokes a registered callback before serializing the document.
     *
     * @param callable $functionName callback to store
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }
1038
1039
    /**
     * Get dom node's plain text.
     *
     * Takes the textContent of the document and passes it through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1050
1051
    /**
     * Give a cloned parser its own document.
     *
     * The document is cloned as well, so the copy and the original do not
     * share the same DOMDocument object.
     */
    public function __clone()
    {
        $ownDocument = clone $this->document;
        $this->document = $ownDocument;
    }
1055
}
1056