Completed
Push — master ( 308454...cfcf30 )
by Lars
02:18
created

HtmlDomParser::findMulti()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 1
cts 1
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                            <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                            <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                            <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                   <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                        <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                                                               <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                                                              <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser
35
{
36
    /**
37
     * @var array
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[][]
50
     */
51
    protected static $domLinkReplaceHelper = [
52
        'orig' => ['[', ']', '{', '}'],
53
        'tmp'  => [
54
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
55
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
56
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
57
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
58
        ],
59
    ];
60
61
    /**
62
     * @var array
63
     */
64
    protected static $domReplaceHelper = [
65
        'orig' => ['&', '|', '+', '%', '@'],
66
        'tmp'  => [
67
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
68
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
69
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
70
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
71
            '____SIMPLE_HTML_DOM__VOKU__AT____',
72
        ],
73
    ];
74
75
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
76
77
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
78
79
    /**
80
     * @var array
81
     */
82
    protected static $domBrokenReplaceHelper = [];
83
84
    /**
85
     * @var callable
86
     */
87
    protected static $callback;
88
89
    /**
90
     * @var \DOMDocument
91
     */
92
    protected $document;
93
94
    /**
95
     * @var string
96
     */
97
    protected $encoding = 'UTF-8';
98
99
    /**
100
     * @var bool
101
     */
102
    protected $isDOMDocumentCreatedWithoutHtml = false;
103
104
    /**
105
     * @var bool
106
     */
107
    protected $isDOMDocumentCreatedWithoutWrapper = false;
108
109
    /**
110
     * @var bool
111
     */
112
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
113
114
    /**
115
     * @var bool
116
     */
117
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
118
119
    /**
120
     * @var bool
121
     */
122
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
123
124
    /**
125
     * @var bool
126
     */
127
    protected $keepBrokenHtml;
128
129
    /**
     * Build a parser, optionally seeded with a node or an HTML string.
     *
     * A SimpleHtmlDom is unwrapped via getNode(); a \DOMNode is imported (deep copy)
     * into the freshly created document; any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDom|string $element HTML code or SimpleHtmlDom, \DOMNode
     *
     * @throws \InvalidArgumentException
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        // the "broken html" placeholder map is static, so every new instance resets it
        self::$domBrokenReplaceHelper = [];

        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }
        } elseif ($source !== null) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);
        }
    }
167
168
    /**
169
     * @param string $name
170
     * @param array  $arguments
171
     *
172
     * @return bool|mixed
173
     */
174 50 View Code Duplication
    public function __call($name, $arguments)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
175
    {
176 50
        $name = \strtolower($name);
177
178 50
        if (isset(self::$functionAliases[$name])) {
179 49
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
180
        }
181
182 1
        throw new \BadMethodCallException('Method does not exist: ' . $name);
183
    }
184
185
    /**
186
     * @param string $name
187
     * @param array  $arguments
188
     *
189
     * @throws \BadMethodCallException
190
     * @throws \RuntimeException
191
     * @throws \InvalidArgumentException
192
     *
193
     * @return HtmlDomParser
194
     */
195 19
    public static function __callStatic($name, $arguments)
196
    {
197 19
        $arguments0 = $arguments[0] ?? '';
198
199 19
        $arguments1 = $arguments[1] ?? null;
200
201 19
        if ($name === 'str_get_html') {
202 14
            $parser = new self();
203
204 14
            return $parser->loadHtml($arguments0, $arguments1);
205
        }
206
207 5
        if ($name === 'file_get_html') {
208 4
            $parser = new self();
209
210 4
            return $parser->loadHtmlFile($arguments0, $arguments1);
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $parser->loadHtml...guments0, $arguments1); (self) is incompatible with the return type documented by voku\helper\HtmlDomParser::__callStatic of type voku\helper\HtmlDomParser.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
211
        }
212
213 1
        throw new \BadMethodCallException('Method does not exist');
214
    }
215
216
    /** @noinspection MagicMethodsValidityInspection */
217
218
    /**
     * Magic read access for the simple_html_dom style pseudo properties.
     *
     * The property name is compared case-insensitively; unknown names yield null.
     *
     * @param string $name
     *
     * @return null|string
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
241
242
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string $selector
     * @param int    $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
252
253
    /**
     * String conversion yields the same markup as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
260
261
    /**
     * No-op kept only so code written against the old API keeps working.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        return true;
    }
272
273
    /**
     * Swap special characters for unique placeholder strings.
     *
     * "&", "|", "+", "%" and "@" are always replaced. When http(s) links are found,
     * each link additionally gets its "[", "]", "{" and "}" replaced by link placeholders.
     * putReplacedBackToPreserveHtmlEntities() performs the reverse mapping.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // NOTE(review): this pre-check is case-sensitive while the link regex below uses
        // the "i" flag, so links written as "HTTP://..." never reach the regex - confirm intent.
        if (\strpos($html, 'http') === false) {
            return \str_replace($search, $replace, $html);
        }

        // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
        $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
        $found = [];
        \preg_match_all($regExUrl, $html, $found);

        if (empty($found[1])) {
            return \str_replace($search, $replace, $html);
        }

        $links = $found[1];
        $maskedLinks = [];
        foreach ($links as $linkKey => $link) {
            $maskedLinks[$linkKey] = \str_replace(
                self::$domLinkReplaceHelper['orig'],
                self::$domLinkReplaceHelper['tmp'],
                $link
            );
        }

        // links go first so they are swapped as a whole before the single characters are handled
        return \str_replace(
            \array_merge($links, $search),
            \array_merge($maskedLinks, $replace),
            $html
        );
    }
313
314
    /**
     * Turn the placeholder strings back into the original characters / tags.
     *
     * Placeholders recorded for broken html fragments are restored first, then the
     * link and character placeholders, the html wrapper tag (removed entirely) and the
     * special script tag (turned back into "<script" / "</script>").
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built once per request; both lists must keep the same element order
        static $lookup = null;

        if ($lookup === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            $lookup = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? [];
        if (\count($brokenPlaceholders) > 0) {
            $html = \str_replace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($lookup['tmp'], $lookup['orig'], $html);
    }
356
357
    /**
     * Create DOMDocument from HTML.
     *
     * Records a set of flags describing what the input lacked (any tag at all, a leading
     * tag, "<html", "<head>", a real "</script>"), applies the script tag workarounds,
     * masks special characters and then parses the result: first as XML via
     * simplexml_load_string(), and via DOMDocument::loadHTML() when that fails or
     * libxml reported errors.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the default options
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and would raise a
        // deprecation notice there, so it is only toggled (and later restored) on PHP 7.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            // the helper tag is stripped again by putReplacedBackToPreserveHtmlEntities()
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // NOTE(review): the stripos() arguments look swapped (haystack is the literal
            // '<?xml'), which means the hack is applied to practically every input. Left
            // untouched on purpose: swapping them would stop forcing the encoding for
            // input that already starts with an xml declaration - confirm before changing.
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
485
486
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every "</" inside the content of a script element to "<\/".
     * The input string is modified in place.
     *
     * @param string $html
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $pattern = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escapeClosingTags = static function ($match) {
            $escapedContent = \str_replace('</', '<\/', $match['content']);

            return '<script' . $match['attr'] . '>' . $escapedContent . '</script>';
        };

        $html = \preg_replace_callback($pattern, $escapeClosingTags, $html);
    }
500
501
    /**
     * Rename script elements of type "text/html" to the special helper tag.
     *
     * The "<\/" escaping added by the html5 fallback is undone inside the renamed
     * element. The input string is modified in place.
     *
     * @param string $html
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $found = [];
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $pattern = '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU';
        \preg_match_all($pattern, $html, $found);

        if (!isset($found[0])) {
            return;
        }

        $helperTag = self::$domHtmlSpecialScriptHelper;
        $openingLength = \strlen('<script');
        $closingLength = \strlen('</script>');

        foreach ($found[0] as $scriptElement) {
            // everything between the leading "<script" and the trailing "</script>"
            $withoutScriptTags = \substr($scriptElement, $openingLength, -$closingLength);

            $renamed = '<' . $helperTag . $withoutScriptTags . '</' . $helperTag . '>';
            // remove the html5 fallback
            $renamed = \str_replace('<\/', '</', $renamed);

            $html = \str_replace($scriptElement, $renamed, $html);
        }
    }
523
524
    /**
     * Replace unbalanced closing-tag fragments by placeholders.
     *
     * Pass one repeatedly masks the angle brackets of elements whose start tag has a
     * matching end tag. Pass two repeatedly takes fragments matched by the "broken"
     * pattern, stores them in the static $domBrokenReplaceHelper map together with a
     * crc32 based placeholder, and puts that placeholder into the html. Finally the
     * masked brackets are turned back into real ones.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedTokens = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realTokens = ['</', '<', '>'];

        $maskBalancedElement = static function ($m) {
            return $m['start'] .
                   '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                   $m['value'] .
                   '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°' .
                   $m['end'];
        };

        $stashBrokenFragment = static function ($m) use ($maskedTokens, $realTokens) {
            $fragment = \str_replace($maskedTokens, $realTokens, $m['broken']);
            $placeholder = '____simple_html_dom__voku__broken_html____' . \crc32($fragment);

            self::$domBrokenReplaceHelper['orig'][] = $fragment;
            self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

            return $m['start'] . $placeholder . $m['end'];
        };

        // pass one: run until the html no longer changes
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskBalancedElement,
                $before
            );
        } while ($before !== $html);

        // pass two: run until the html no longer changes
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $stashBrokenFragment,
                $before
            );
        } while ($before !== $html);

        return \str_replace($maskedTokens, $realTokens, $html);
    }
574
575
    /**
     * Return element by #id.
     *
     * Delegates to find() with the selector "#<id>" and index 0.
     * NOTE(review): find() falls back to a SimpleHtmlDomNodeBlank when nothing matches;
     * confirm that this type satisfies the declared SimpleHtmlDom return type.
     *
     * @param string $id
     *
     * @return SimpleHtmlDom
     */
    public function getElementById(string $id): SimpleHtmlDom
    {
        // "{$id}" instead of "${id}": the latter interpolation form is deprecated since PHP 8.2
        return $this->find("#{$id}", 0);
    }
586
587
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node when no element matches
     */
    public function getElementByTagName(string $name)
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($firstMatch);
    }
604
605
    /**
     * Returns elements by #id.
     *
     * Delegates to find() with the selector "#<id>"; see find() for the meaning of $idx.
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // "{$id}" instead of "${id}": the latter interpolation form is deprecated since PHP 8.2
        return $this->find("#{$id}", $idx);
    }
617
618
    /**
     * Returns elements by tag name.
     *
     * Without $idx the whole list is returned. With $idx a single element is returned;
     * negative values count from the end of the list, and a blank node is returned
     * when that position does not exist.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();
        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        if ($idx === null) {
            return $elements;
        }

        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
649
650
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom
     */
    public function findOne(string $selector): SimpleHtmlDom
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
661
662
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector)
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
673
674
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the document. Without
     * $idx the whole list is returned. With $idx a single element is returned; negative
     * values count from the end, and a blank node is returned when that position
     * does not exist.
     *
     * @param string   $selector
     * @param null|int $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);
        $matchedNodes = (new \DOMXPath($this->document))->query($xPathQuery);

        $elements = new SimpleHtmlDomNode();
        foreach ($matchedNodes as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        if ($idx === null) {
            return $elements;
        }

        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
707
708
    /**
     * Strip the markup DOMDocument added on its own and restore the masked characters.
     *
     * Which tags get removed depends on the flags recorded while the document was
     * created. Afterwards html entities and url encoding are decoded (once, or until
     * the string stops changing when $multiDecodeNewHtmlEntity is set) and the
     * placeholders are replaced by the original characters.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<body>', '</body>', '<html>', '</html>'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // only a leading "<p>" is dropped, but every "</p>"
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $content = \str_replace(
                [
                    '<p>',
                    '</p>',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                ],
                '',
                $content
            );
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanupMap), \array_values($cleanupMap), $content)
        );

        $decodeOnce = static function (string $value): string {
            return \rawurldecode(\html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5));
        };

        if (!$multiDecodeNewHtmlEntity) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // decode until a pass no longer changes anything
            do {
                $previous = $content;
                $content = $decodeOnce($previous);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
821
822
    /**
     * Expose the underlying DOM document.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }
829
830
    /**
     * Encoding used when the document is created and parsed.
     *
     * @return string
     */
    private function getEncoding(): string
    {
        return $this->encoding;
    }
839
840
    /**
     * Flag set by createDOMDocument() when the input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
847
848
    /**
     * Flag set by createDOMDocument() when the input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
855
856
    /**
     * Flag set by createDOMDocument() when the input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
863
864
    /**
     * Read the "isDOMDocumentCreatedWithoutWrapper" flag of this parser.
     *
     * NOTE(review): the flag is assigned outside of this method, presumably
     * while the document is created - confirm where it is set.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
871
872
    /**
     * Serialize the document to an HTML string (outer html).
     *
     * If a callback has been registered it is invoked first, receiving an
     * array that contains this parser instance.
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded unchanged to fixHtmlOutput().</p>
     *
     * @return string <p>The post-processed HTML, or an empty string when saveHTML() fails.</p>
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        // Without the html-wrapper only the root element is serialized,
        // otherwise the complete document is.
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
897
898
    /**
     * Store the "keepBrokenHtml" setting on this parser (fluent setter).
     *
     * @param bool $keepBrokenHtml <p>The new value of the setting.</p>
     *
     * @return HtmlDomParser <p>The same instance, to allow method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
909
910
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, the XML-header is
     * stripped and the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded unchanged to fixHtmlOutput().</p>
     *
     * @return string <p>The post-processed XML, or an empty string when saveXML() fails.</p>
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $xml = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // saveXML() reports errors via "false"; under strict_types that value
        // must not reach preg_replace(), so bail out the same way html() does.
        if ($xml === false) {
            return '';
        }

        // remove the XML-header
        $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }
926
927
    /**
     * Get dom node's inner html.
     *
     * Concatenates the serialized HTML of every child node of the document's
     * root element and passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded unchanged to fixHtmlOutput().</p>
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // An empty document has no root element ("documentElement" is null);
        // skip the loop in that case instead of reading a property on null.
        $rootElement = $this->document->documentElement;
        if ($rootElement !== null) {
            foreach ($rootElement->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
945
946
    /**
     * Load HTML from string.
     *
     * Replaces the current document with a new one built by createDOMDocument().
     *
     * @param string   $html               <p>The HTML source.</p>
     * @param int|null $libXMLExtraOptions <p>Forwarded unchanged to createDOMDocument().</p>
     *
     * @throws \InvalidArgumentException NOTE(review): kept from the previous docblock, not raised in this
     *                                   method itself - presumably by createDOMDocument(); confirm
     *
     * @return HtmlDomParser <p>The same instance, to allow method chaining.</p>
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $newDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $newDocument;

        return $this;
    }
962
963
    /**
     * Load HTML from file.
     *
     * The content is read via UTF8::file_get_contents() when that class is
     * available, otherwise via the native file_get_contents(), and is then
     * handed to loadHtml().
     *
     * @param string   $filePath           <p>A local path, or a URL starting with "http://" or "https://".</p>
     * @param int|null $libXMLExtraOptions <p>Forwarded unchanged to loadHtml().</p>
     *
     * @throws \RuntimeException         if a local path does not exist or the content could not be read
     * @throws \InvalidArgumentException NOTE(review): kept from the previous docblock, not raised in this
     *                                   method itself - presumably further down in loadHtml(); confirm
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        // Only non-http(s) paths are checked for existence up front.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause is not lost.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
1001
1002
    /**
     * Return the inner html of the document and optionally write it to a file.
     *
     * @param string $filepath <p>Target file; nothing is written when this is an empty string.</p>
     *
     * @return string <p>The result of innerHtml().</p>
     */
    public function save(string $filepath = ''): string
    {
        $html = $this->innerHtml();

        if ($filepath === '') {
            return $html;
        }

        // The file is written with an exclusive lock; the write result is not checked.
        \file_put_contents($filepath, $html, \LOCK_EX);

        return $html;
    }
1018
1019
    /**
     * Register the callback that html() invokes before serializing the document.
     *
     * The callback is stored in a static property, so it is not bound to a
     * single parser instance.
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }
1026
1027
    /**
     * Get dom node's plain text.
     *
     * Reads the text content of the whole document and passes it through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded unchanged to fixHtmlOutput().</p>
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1038
1039
    /**
     * Give a cloned parser its own copy of the underlying DOMDocument, so the
     * clone and the original do not share the same document object.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;
        $this->document = $documentCopy;
    }
1043
}
1044