Completed
Push — master ( ea5e36...422e10 )
by Lars
01:59
created

HtmlDomParser::html()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3.0416

Importance

Changes 0
Metric Value
dl 0
loc 14
ccs 5
cts 6
cp 0.8333
rs 9.7998
c 0
b 0
f 0
cc 3
nc 4
nop 1
crap 3.0416
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
9
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
10
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
11
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
12
 * @property-read string plaintext <p>Get dom node's plain text.</p>
13
 *
14
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
15
 * @method string outerHtml() <p>Get dom node's outer html.</p>
16
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
17
 * @method HtmlDomParser load(string $html, int|null $libXMLExtraOptions = null) <p>Load HTML from string.</p>
18
 * @method HtmlDomParser load_file(string $filePath, int|null $libXMLExtraOptions = null) <p>Load HTML from file.</p>
19
 * @method static HtmlDomParser file_get_html(string $filePath, int|null $libXMLExtraOptions = null) <p>Load HTML from file.</p>
20
 * @method static HtmlDomParser str_get_html(string $html, int|null $libXMLExtraOptions = null) <p>Load HTML from string.</p>
21
 */
22
class HtmlDomParser
23
{
24
    /**
25
     * @var array
26
     */
27
    protected static $functionAliases = [
28
        'outertext' => 'html',
29
        'outerhtml' => 'html',
30
        'innertext' => 'innerHtml',
31
        'innerhtml' => 'innerHtml',
32
        'load'      => 'loadHtml',
33
        'load_file' => 'loadHtmlFile',
34
    ];
35
36
    /**
37
     * @var string[][]
38
     */
39
    protected static $domLinkReplaceHelper = [
40
        'orig' => ['[', ']', '{', '}'],
41
        'tmp'  => [
42
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
43
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
44
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
45
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
46
        ],
47
    ];
48
49
    /**
50
     * @var array
51
     */
52
    protected static $domReplaceHelper = [
53
        'orig' => ['&', '|', '+', '%', '@'],
54
        'tmp'  => [
55
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
56
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
57
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
58
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
59
            '____SIMPLE_HTML_DOM__VOKU__AT____',
60
        ],
61
    ];
62
63
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
64
65
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
66
67
    /**
68
     * @var array
69
     */
70
    protected static $domBrokenReplaceHelper = [];
71
72
    /**
73
     * @var callable|null
74
     */
75
    protected static $callback;
76
77
    /**
78
     * @var \DOMDocument
79
     */
80
    protected $document;
81
82
    /**
83
     * @var string
84
     */
85
    protected $encoding = 'UTF-8';
86
87
    /**
88
     * @var bool
89
     */
90
    protected $isDOMDocumentCreatedWithoutHtml = false;
91
92
    /**
93
     * @var bool
94
     */
95
    protected $isDOMDocumentCreatedWithoutWrapper = false;
96
97
    /**
98
     * @var bool
99
     */
100
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
101
102
    /**
103
     * @var bool
104
     */
105
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
106
107
    /**
108
     * @var bool
109
     */
110
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
111
112
    /**
113
     * @var bool|null
114
     */
115
    protected $keepBrokenHtml;
116
117
    /**
     * Create a parser, optionally seeded with markup or an existing node.
     *
     * @param \DOMNode|SimpleHtmlDom|string|null $element HTML code, a SimpleHtmlDom or a \DOMNode;
     *                                                    null leaves the document empty
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // the broken-html placeholders live in a static property, so start from a clean slate
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // a SimpleHtmlDom is reduced to whatever its getNode() returns
        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        // deep-import the node into this parser's own document
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
155
156
    /**
     * Forward calls made under an alias name (see self::$functionAliases) to the real method.
     *
     * @param string $name      called method name, matched case-insensitively
     * @param array  $arguments arguments passed on unchanged
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);
        $target = self::$functionAliases[$name] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $name);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
172
173
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * The name is matched case-insensitively, like in __call() and like regular
     * PHP method names.
     *
     * @param string $name      called static method name
     * @param array  $arguments [0] HTML string or file path (defaults to ''),
     *                          [1] extra libxml options (defaults to null)
     *
     * @throws \BadMethodCallException if the name is neither of the two supported ones
     * @throws \RuntimeException       if "file_get_html" cannot load the file
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        switch (\strtolower($name)) {
            case 'str_get_html':
                return (new self())->loadHtml($arguments0, $arguments1);
            case 'file_get_html':
                return (new self())->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
203
204
    /** @noinspection MagicMethodsValidityInspection */
205
206
    /**
     * Magic read access for the property-style names (outerHtml, innerText, plaintext, ...).
     *
     * @param string $name property name, matched case-insensitively
     *
     * @return string|null the rendered value, or null for an unknown name
     */
    public function __get($name)
    {
        $name = \strtolower($name);

        if ($name === 'outerhtml' || $name === 'outertext') {
            return $this->html();
        }

        if ($name === 'innerhtml' || $name === 'innertext') {
            return $this->innerHtml();
        }

        if ($name === 'text' || $name === 'plaintext') {
            return $this->text();
        }

        return null;
    }
229
230
    /**
     * Calling the parser like a function is shorthand for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
240
241
    /**
     * String conversion yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
248
249
    /**
     * No-op that always reports success (kept only for api-compatibility-reasons).
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        $cleared = true;

        return $cleared;
    }
260
261
    /**
     * Mask characters ("&", "|", "+", "%", "@" and, inside links, "[", "]", "{", "}")
     * with placeholder tokens; putReplacedBackToPreserveHtmlEntities() reverses this.
     *
     * @param string $html
     *
     * @return string the HTML with the placeholder tokens in place
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // without "http" there is no link to look for, only the plain characters are masked
        if (\strpos($html, 'http') === false) {
            return \str_replace($search, $replace, $html);
        }

        // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
        $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
        $matches = [];
        \preg_match_all($regExUrl, $html, $matches);

        if (!empty($matches[1])) {
            $links = (array) $matches[1];

            // brackets are only masked inside the links that were found
            $maskedLinks = \str_replace(
                self::$domLinkReplaceHelper['orig'],
                self::$domLinkReplaceHelper['tmp'],
                $links
            );

            // links are replaced first, the single characters afterwards
            $search = \array_merge($links, $search);
            $replace = \array_merge($maskedLinks, $replace);
        }

        return \str_replace($search, $replace, $html);
    }
301
302
    /**
     * Turn the placeholder tokens back into the original characters, strip the
     * temporary html wrapper element and restore the renamed special script tags.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built once per request; both lists must stay in the same order
        static $placeholderMap = null;

        if ($placeholderMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            $placeholderMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        // fragments recorded by keepBrokenHtml() are restored first
        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_replace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($placeholderMap['tmp'], $placeholderMap['orig'], $html);
    }
344
345
    /**
     * Create DOMDocument from HTML.
     *
     * Records what the input was missing (any tag, a leading tag, <html>, <head>,
     * a real </script>) in the isDOMDocumentCreated* flags, masks special characters,
     * then tries to parse the input as XML and falls back to DOMDocument::loadHTML().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // only an escaped "<\/script>" is present (and no real closing tag)
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (
            \strpos($html, 'type="text/html"') !== false
            ||
            \strpos($html, 'type=\'text/html\'') !== false
        ) {
            $this->keepSpecialScriptTags($html);
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only toggled on older versions
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        try {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->keepBrokenHtml
            ) {
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $this->document = \dom_import_simplexml($sxe)->ownerDocument;
            } else {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                //
                // haystack first, needle second: the declaration is prepended unless the
                // input already starts with one (an empty input therefore always gets it,
                // so loadHTML() never receives an empty string)
                $xmlHackUsed = false;
                if (\stripos($html, '<?xml') !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                $this->document->loadHTML($html, $optionsXml);

                // remove the "xml-encoding" hack
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            /** @noinspection UnusedFunctionResultInspection */
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings, also when parsing threw
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);
            if ($disableEntityLoader !== null) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
460
461
    /**
     * Rename <script type="text/html"> elements to a placeholder tag name, in place.
     *
     * putReplacedBackToPreserveHtmlEntities() turns the placeholder back into "script".
     *
     * @param string $html modified by reference
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $specialScripts = [];
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        // the quote class is ["'] only, a "|" inside the brackets would be matched literally
        $regExSpecialScript = '/<(script) [^>]*type=(["\'])text\/html\2([^>]*)>.*<\/\1>/isU';
        \preg_match_all($regExSpecialScript, $html, $specialScripts);

        if (isset($specialScripts[0])) {
            foreach ($specialScripts[0] as $specialScript) {
                // swap the opening "<script" ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($specialScript, \strlen('<script'));
                // ... and the closing "</script>" for the placeholder tag
                $specialNonScript = \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';

                $html = \str_replace($specialScript, $specialNonScript, $html);
            }
        }
    }
480
481
    /**
     * Replace closing-tag fragments that have no matching opening tag with placeholder
     * tokens; the fragments are recorded in self::$domBrokenReplaceHelper so that
     * putReplacedBackToPreserveHtmlEntities() can restore them.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        // temporary stand-ins for the "</", "<" and ">" of tags that do have a partner
        $markers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // pass 1: mask every opening/closing pair, repeated until nothing changes anymore
        $maskPair = static function (array $matches): string {
            return $matches['start'] .
                   '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                   $matches['value'] .
                   '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                   $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskPair,
                $html
            );
        } while ($before !== $html);

        // pass 2: whatever still looks like a closing tag is swapped for a crc32-based token
        $swapBroken = static function (array $matches) use ($markers, $brackets): string {
            $broken = \str_replace($markers, $brackets, $matches['broken']);
            $token = '____simple_html_dom__voku__broken_html____' . \crc32($broken);

            self::$domBrokenReplaceHelper['orig'][] = $broken;
            self::$domBrokenReplaceHelper['tmp'][] = $token;

            return $matches['start'] . $token . $matches['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $swapBroken,
                $html
            );
        } while ($before !== $html);

        // finally bring the masked pairs back
        return \str_replace($markers, $brackets, $html);
    }
531
532
    /**
     * Return element by #id.
     *
     * Delegates to find() with the selector "#<id>" and index 0.
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, 0);
    }
543
544
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if no such element exists
     */
    public function getElementByTagName(string $name)
    {
        $first = $this->document->getElementsByTagName($name)->item(0);

        return $first === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($first);
    }
561
562
    /**
     * Returns elements by #id.
     *
     * Delegates to find() with the selector "#<id>".
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
574
575
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null returns all elements; a negative value counts from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // no index: hand back the whole list
        if ($idx === null) {
            return $elements;
        }

        // negative values are offsets from the end of the list
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // a single element, or a blank node if the position does not exist
        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
606
607
    /**
     * Find one node with a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
618
619
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath by SelectorConverter::toXPath() and run
     * against the current document.
     *
     * @param string   $selector
     * @param int|null $idx      null returns all matches; a negative value counts from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for an expression it cannot evaluate;
        // that is treated as "no match" instead of iterating over false
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomNodeBlank();
    }
652
653
    /**
     * Clean up serialized markup: remove the wrapper tags DOMDocument added on its own,
     * decode entities / url-encoding and put the masked characters back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity decode repeatedly until the content stops changing
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $htmlWrapperParts = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
            $content = \str_replace($htmlWrapperParts, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // only a leading <p> goes away, but every closing </p> does
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = \str_replace('</p>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $plainTextParts = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextParts, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanups = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanups), \array_values($cleanups), $content)
        );

        // one round of entity decoding followed by url decoding
        $decodeOnce = static function (string $value): string {
            return \rawurldecode(
                \html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5)
            );
        };

        if (!$multiDecodeNewHtmlEntity) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // keep decoding until a round no longer changes anything
            do {
                $previous = $content;
                $content = $decodeOnce($content);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
769
770
    /**
     * Get the DOMDocument this parser works on.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
777
778
    /**
     * Get the encoding to use (the value of $this->encoding).
     *
     * @return string
     */
    private function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
787
788
    /**
     * Flag set by createDOMDocument() when the input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

        return $withoutHtml;
    }
795
796
    /**
     * Flag set by createDOMDocument() when the input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $withoutHtmlWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $withoutHtmlWrapper;
    }
803
804
    /**
     * Flag set by createDOMDocument() when the input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $withoutHeadWrapper = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $withoutHeadWrapper;
    }
811
812
    /**
     * Flag set by createDOMDocument() when the input contained a "<" but did not start with one.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $withoutWrapper = $this->isDOMDocumentCreatedWithoutWrapper;

        return $withoutWrapper;
    }
819
820
    /**
     * Get dom node's outer html.
     *
     * If a callback was registered via set_callback() it is invoked first, with an
     * array containing this parser as its single argument.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        // without an <html> wrapper in the input only the root element is serialized
        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
841
842
    /**
     * Switch the "keep broken html" handling used by createDOMDocument() on or off.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
853
854
    /**
     * Get the HTML as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string the XML without its "<?xml ... ?>" header
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // remove the XML-header
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);

        return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
    }
870
871
    /**
     * Get dom node's inner html.
     *
     * Serializes every child of the document's root element; a document without a
     * root element (e.g. a parser that never loaded any HTML) has no children to serialize.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // documentElement is null for an empty document
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
889
890
    /**
     * Load HTML from string, replacing the current document.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $document = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $document;

        return $this;
    }
906
907
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" are not checked for existence
     * before reading; every other path must exist.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure attached for debugging
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
945
946
    /**
     * Return the inner html as string and, if a path is given, also write it to that file.
     *
     * @param string $filepath target file; an empty string skips writing
     *
     * @return string the inner html
     */
    public function save(string $filepath = ''): string
    {
        $innerHtml = $this->innerHtml();

        if ($filepath === '') {
            return $innerHtml;
        }

        \file_put_contents($filepath, $innerHtml, \LOCK_EX);

        return $innerHtml;
    }
962
963
    /**
     * Register the callback that html() invokes before rendering.
     *
     * The callback is stored in a static property.
     *
     * @param callable|null $functionName
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        $this::$callback = $callback;
    }
970
971
    /**
     * Get dom node's plain text.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
982
983
    /**
     * Give the cloned parser its own copy of the DOMDocument.
     */
    public function __clone()
    {
        $copy = clone $this->document;

        $this->document = $copy;
    }
987
}
988