Completed
Push — master ( 1dceb4...e6e597 )
by Lars
01:36
created

HtmlDomParser::useKeepBrokenHtml()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
9
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
10
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
11
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
12
 * @property-read string plaintext <p>Get dom node's plain text.</p>
13
 *
14
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
15
 * @method string outerHtml() <p>Get dom node's outer html.</p>
16
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
17
 * @method HtmlDomParser load(string $html, int|null $libXMLExtraOptions = null) <p>Load HTML from string.</p>
18
 * @method HtmlDomParser load_file(string $filePath, int|null $libXMLExtraOptions = null) <p>Load HTML from file.</p>
19
 * @method static HtmlDomParser file_get_html(string $filePath, int|null $libXMLExtraOptions = null) <p>Load HTML from file.</p>
20
 * @method static HtmlDomParser str_get_html(string $html, int|null $libXMLExtraOptions = null) <p>Load HTML from string.</p>
21
 */
22
class HtmlDomParser
23
{
24
    /**
     * Lower-cased legacy method names accepted by __call(), mapped to the
     * real method that is invoked instead.
     *
     * @var array<string, string>
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Characters that are masked inside detected URLs before parsing
     * ('orig') and the placeholders that stand in for them ('tmp').
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Characters that are masked in the whole markup before parsing
     * ('orig') and the placeholders that stand in for them ('tmp').
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
        ],
    ];

    /**
     * Tag name of the temporary element that createDOMDocument() wraps
     * around markup; the tags are stripped again on output.
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Temporary tag name used in place of "script" for
     * <script type="text/html"> elements while parsing.
     *
     * NOTE: the spelling "sctipt" is part of the runtime placeholder and
     * is intentionally left unchanged.
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';

    /**
     * Broken markup fragments ('orig') and their placeholders ('tmp'),
     * collected by keepBrokenHtml(); reset by the constructor.
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Optional callback that html() invokes before serializing.
     *
     * @var callable|null
     */
    protected static $callback;

    /**
     * @var \DOMDocument
     */
    protected $document;

    /**
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * True when the loaded markup contained no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * True when the loaded markup contained a "<" but did not start with one.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * True when the loaded markup contained no "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * True when the loaded markup contained no "<html".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * True when the loaded markup contained "<\/script>" but no "</script>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * Whether broken markup is masked before parsing; set through
     * useKeepBrokenHtml(). Initialized to false so the property always
     * holds the documented bool instead of null.
     *
     * @var bool
     */
    protected $keepBrokenHtml = false;
116
117
    /**
     * Create a parser, optionally pre-filled from a node or an HTML string.
     *
     * A SimpleHtmlDom is unwrapped to its node first. A \DOMNode is
     * deep-imported into a fresh document; any other non-null value is
     * passed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDom|string|null $element HTML code or SimpleHtmlDom, \DOMNode
     */
    public function __construct($element = null)
    {
        // reset the shared broken-html bookkeeping for this new parser
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $freshDocument = new \DOMDocument('1.0', $this->getEncoding());
        $freshDocument->preserveWhiteSpace = true;
        $freshDocument->formatOutput = true;
        $this->document = $freshDocument;

        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
155
156
    /**
     * Forward legacy method names (see $functionAliases) to their real
     * implementation. The lookup is case-insensitive.
     *
     * @param string $name      called method name
     * @param array  $arguments arguments of the call
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
172
173
    /**
     * Static entry points "str_get_html" and "file_get_html": create a
     * new parser and load the given string / file into it.
     *
     * @param string $name      called static method name (matched case-sensitively)
     * @param array  $arguments [0] => html string or file path, [1] => extra libxml options
     *
     * @throws \BadMethodCallException if the name is not one of the two supported ones
     * @throws \RuntimeException
     * @throws \InvalidArgumentException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $loaders = [
            'str_get_html'  => 'loadHtml',
            'file_get_html' => 'loadHtmlFile',
        ];

        if (!isset($loaders[$name])) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $input = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        $loader = $loaders[$name];

        return (new self())->{$loader}($input, $libXMLExtraOptions);
    }
209
210
    /** @noinspection MagicMethodsValidityInspection */
211
212
    /**
     * Magic read access for the legacy properties; the name is matched
     * case-insensitively.
     *
     * - outerhtml / outertext => html()
     * - innerhtml / innertext => innerHtml()
     * - text / plaintext      => text()
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if (\in_array($property, ['outerhtml', 'outertext'], true)) {
            return $this->html();
        }

        if (\in_array($property, ['innerhtml', 'innertext'], true)) {
            return $this->innerHtml();
        }

        if (\in_array($property, ['text', 'plaintext'], true)) {
            return $this->text();
        }

        return null;
    }
235
236
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector CSS selector
     * @param int|null $idx      index of the wanted match, or null for all matches
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $matches = $this->find($selector, $idx);

        return $matches;
    }
246
247
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
254
255
    /**
     * No-op that only exists for API compatibility.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        // nothing to release here
        return true;
    }
266
267
    /**
     * Mask characters with placeholders before the markup is parsed.
     *
     * The characters of $domReplaceHelper are always masked. If the markup
     * contains "http", every detected URL is additionally replaced by a
     * copy in which the characters of $domLinkReplaceHelper are masked.
     * URL replacements come first in the search list.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // init
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            $urlMatches = [];
            \preg_match_all($regExUrl, $html, $urlMatches);

            if (!empty($urlMatches[1])) {
                $urls = $urlMatches[1];

                $maskedUrls = [];
                foreach ($urls as $urlKey => $url) {
                    $maskedUrls[$urlKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $url
                    );
                }

                $search = \array_merge($urls, $search);
                $replace = \array_merge($maskedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
307
308
    /**
     * Undo the masking done before parsing.
     *
     * Broken-html placeholders (if any were recorded) are restored first.
     * Then the link and character placeholders are put back, the
     * temporary wrapper tags are removed, and the special-script helper
     * tags become "<script" / "</script>" again. The search/replace lists
     * are built once and cached in a static local.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $replacementCache = null;

        if ($replacementCache === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            $replacementCache = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_replace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_replace($replacementCache['tmp'], $replacementCache['orig'], $html);
    }
350
351
    /**
     * Create DOMDocument from HTML.
     *
     * Records what the input lacked (no markup at all, no leading tag, no
     * "<html", no "<head>", only an escaped "<\/script>") in the
     * isDOMDocumentCreated* flags, masks special characters, then tries
     * to parse the markup as XML first and falls back to
     * DOMDocument::loadHTML() if that fails or reports errors.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml === true) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (
            \strpos($html, 'type="text/html"') !== false
            ||
            \strpos($html, 'type=\'text/html\'') !== false
        ) {
            // $html is passed by reference and rewritten in place
            $this->keepSpecialScriptTags($html);
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        $disableEntityLoader = \libxml_disable_entity_loader(true);
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper === true
            ||
            $this->keepBrokenHtml === true
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $this->document = \dom_import_simplexml($sxe)->ownerDocument;
        } else {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // Only prepend the encoding declaration when the markup does not
            // already start with an XML declaration. The haystack is $html and
            // the needle is '<?xml'; the arguments used to be swapped, which
            // made this check ineffective.
            $xmlHackUsed = false;
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed === true) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        \libxml_disable_entity_loader($disableEntityLoader);

        return $this->document;
    }
466
467
    /**
     * Rename every <script ... type="text/html" ...>...</script> element
     * to the special-script helper tag, in place.
     *
     * @param string $html markup, modified by reference
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $found = [];
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $regExSpecialScript = '/<(script) [^>]*type=("|\')text\/html\2([^>]*)>.*<\/\1>/isU';
        \preg_match_all($regExSpecialScript, $html, $found);

        $helperTag = self::$domHtmlSpecialScriptHelper;

        foreach ($found[0] ?? [] as $scriptElement) {
            // everything between the leading "<script" and the trailing "</script>"
            $middle = \substr($scriptElement, \strlen('<script'), -\strlen('</script>'));

            $html = \str_replace(
                $scriptElement,
                '<' . $helperTag . $middle . '</' . $helperTag . '>',
                $html
            );
        }
    }
486
487
    /**
     * Replace broken markup fragments with placeholders.
     *
     * Pass 1 repeatedly masks the angle brackets of properly paired
     * elements with marker strings until nothing changes. Pass 2 then
     * repeatedly replaces what still matches the "broken" pattern with a
     * crc32-based placeholder, recording the fragment and the placeholder
     * in $domBrokenReplaceHelper. Finally the markers are turned back
     * into real brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $bracketMarkers = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realBrackets = ['</', '<', '>'];

        $maskPairedElement = static function ($m) {
            $openingTag = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
            $closingTag = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

            return $m['start'] . $openingTag . $m['value'] . $closingTag . $m['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskPairedElement,
                $html
            );
        } while ($before !== $html);

        $stashBrokenFragment = static function ($m) use ($bracketMarkers, $realBrackets) {
            $fragment = \str_replace($bracketMarkers, $realBrackets, $m['broken']);
            $placeholder = '____simple_html_dom__voku__broken_html____' . \crc32($fragment);

            self::$domBrokenReplaceHelper['orig'][] = $fragment;
            self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

            return $m['start'] . $placeholder . $m['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                $stashBrokenFragment,
                $html
            );
        } while ($before !== $html);

        return \str_replace($bracketMarkers, $realBrackets, $html);
    }
537
538
    /**
     * Return element by #id.
     *
     * Shortcut for find('#<id>', 0).
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, 0);
    }
549
550
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if no element matches
     */
    public function getElementByTagName(string $name)
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($firstMatch);
    }
567
568
    /**
     * Returns elements by #id.
     *
     * Shortcut for find('#<id>', $idx).
     *
     * @param string   $id
     * @param int|null $idx index of the wanted match, or null for all matches
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
580
581
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx  null returns every match; a negative value
     *                       counts from the end of the match list
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     *         a blank node if the requested index does not exist
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $elements[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // negative values count from the end
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // return one element
        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
612
613
    /**
     * Find the first node matching a CSS selector; same as find($selector, 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
624
625
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the
     * current document.
     *
     * @param string   $selector
     * @param int|null $idx      null returns every match; a negative value
     *                           counts from the end of the match list
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     *         a blank node if the requested index does not exist
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($xPathQuery);

        $elements = new SimpleHtmlDomNode();
        foreach ($matches as $domNode) {
            $elements[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // negative values count from the end
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // return one element
        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
658
659
    /**
     * Clean up serialized markup so it resembles what was loaded.
     *
     * Depending on the isDOMDocumentCreated* flags, tags the parser added
     * (html/body/head/p, doctype, closing script) and line breaks are
     * stripped. The content is then entity- and url-decoded, and all
     * placeholders are restored.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity decode repeatedly until the content
     *                                         no longer changes (or delegate to
     *                                         UTF8::rawurldecode() if that class exists)
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
            /** @noinspection HtmlRequiredLangAttribute */
            $addedByParser = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
            $content = \str_replace($addedByParser, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript === true) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
            // patterns are applied one after the other, in this order
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml === true) {
            $plainTextLeftovers = [
                '<p>',
                '</p>',
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
            ];
            $content = \str_replace($plainTextLeftovers, '', $content);
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $artifacts = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
        $artifactFixes = ['', '', '<head>', '</head>', '<br>'];
        $content = \trim(\str_replace($artifacts, $artifactFixes, $content));

        $decodeOnce = static function (string $value): string {
            return \rawurldecode(\html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5));
        };

        if ($multiDecodeNewHtmlEntity !== true) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            do {
                $previous = $content;
                $content = $decodeOnce($content);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
775
776
    /**
     * Expose the underlying DOM document (the instance itself, not a copy).
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
783
784
    /**
     * Encoding used when creating and serializing the document.
     *
     * @return string
     */
    private function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
793
794
    /**
     * Whether the loaded input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtml;

        return $flag;
    }
801
802
    /**
     * Whether the loaded input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $flag;
    }
809
810
    /**
     * Whether the loaded input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $flag;
    }
817
818
    /**
     * Whether the loaded input contained a "<" but did not start with one.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutWrapper;

        return $flag;
    }
825
826
    /**
     * Get dom node's outer html.
     *
     * A registered callback (see set_callback()) is invoked first with
     * [$this] as its single argument. If the input had no "<html", only
     * the document element is serialized, otherwise the whole document.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        $callback = $this::$callback;
        if ($callback !== null) {
            \call_user_func($callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
847
848
    /**
     * Switch the "keep broken html" pre-processing on or off for
     * subsequent loads.
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $parser = $this;
        $parser->keepBrokenHtml = $keepBrokenHtml;

        return $parser;
    }
859
860
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, the XML header
     * is removed, and the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // remove the XML-header
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
        $xml = \ltrim($withoutHeader);

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }
876
877
    /**
     * Get dom node's inner html.
     *
     * Concatenates the serialized children of the document element and
     * passes the result through fixHtmlOutput(). A document without a
     * document element (nothing loaded yet) contributes no markup.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // documentElement is null for an empty document; guard against
        // dereferencing it
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
895
896
    /**
     * Load HTML from string.
     *
     * The current document is replaced by the one built from $html.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags for the parser
     *
     * @return HtmlDomParser this instance, for chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $parsedDocument;

        return $this;
    }
912
913
    /**
     * Load HTML from file.
     *
     * Paths starting with http:// or https:// skip the existence check;
     * any other path must exist. The content is read with
     * UTF8::file_get_contents() if that class is available, otherwise
     * with file_get_contents(), and is then handed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags for the parser
     *
     * @throws \RuntimeException if the file does not exist or cannot be read
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            // "{$var}" instead of "${var}", which is deprecated as of PHP 8.2
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original failure as "previous" so the cause is not lost
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
951
952
    /**
     * Return the inner html and, if a path is given, also write it to
     * that file (with an exclusive lock).
     *
     * @param string $filepath target file; an empty string skips writing
     *
     * @return string the inner html
     */
    public function save(string $filepath = ''): string
    {
        $innerHtml = $this->innerHtml();

        if ($filepath === '') {
            return $innerHtml;
        }

        \file_put_contents($filepath, $innerHtml, \LOCK_EX);

        return $innerHtml;
    }
968
969
    /**
     * Register the callback that html() invokes before serializing.
     * The value is stored in a static property.
     *
     * @param callable|null $functionName
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        $this::$callback = $callback;
    }
976
977
    /**
     * Get dom node's plain text.
     *
     * Takes the document's text content and passes it through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
988
989
    /**
     * Give the cloned parser its own copy of the DOM document, so the
     * clone and the original do not share one document instance.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
993
}
994