Completed
Push — master ( bc7cb0...68d119 )
by Lars
10:50 queued 03:14
created

HtmlDomParser::__get()   B

Complexity

Conditions 7
Paths 7

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 7

Importance

Changes 0
Metric Value
dl 0
loc 18
ccs 12
cts 12
cp 1
rs 8.8333
c 0
b 0
f 0
cc 7
nc 7
nop 1
crap 7
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser
35
{
36
    /**
37
     * @var array
38
     */
39
    protected static $functionAliases = [
40
        'outertext' => 'html',
41
        'outerhtml' => 'html',
42
        'innertext' => 'innerHtml',
43
        'innerhtml' => 'innerHtml',
44
        'load'      => 'loadHtml',
45
        'load_file' => 'loadHtmlFile',
46
    ];
47
48
    /**
49
     * @var string[][]
50
     */
51
    protected static $domLinkReplaceHelper = [
52
        'orig' => ['[', ']', '{', '}'],
53
        'tmp'  => [
54
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
55
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
56
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
57
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
58
        ],
59
    ];
60
61
    /**
62
     * @var array
63
     */
64
    protected static $domReplaceHelper = [
65
        'orig' => ['&', '|', '+', '%', '@'],
66
        'tmp'  => [
67
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
68
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
69
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
70
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
71
            '____SIMPLE_HTML_DOM__VOKU__AT____',
72
        ],
73
    ];
74
75
    /**
     * Tag name of the temporary element that createDOMDocument() wraps around the markup.
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
76
77
    /**
     * Tag name that keepSpecialScriptTags() substitutes for "script" on text/html script blocks.
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
78
79
    /**
80
     * @var array
81
     */
82
    protected static $domBrokenReplaceHelper = [];
83
84
    /**
85
     * @var callable
86
     */
87
    protected static $callback;
88
89
    /**
90
     * @var \DOMDocument
91
     */
92
    protected $document;
93
94
    /**
95
     * @var string
96
     */
97
    protected $encoding = 'UTF-8';
98
99
    /**
100
     * @var bool
101
     */
102
    protected $isDOMDocumentCreatedWithoutHtml = false;
103
104
    /**
105
     * @var bool
106
     */
107
    protected $isDOMDocumentCreatedWithoutWrapper = false;
108
109
    /**
110
     * @var bool
111
     */
112
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
113
114
    /**
115
     * @var bool
116
     */
117
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
118
119
    /**
120
     * @var bool
121
     */
122
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
123
124
    /**
125
     * @var bool
126
     */
127
    protected $keepBrokenHtml;
128
129
    /**
     * Create a parser, optionally pre-filled from an element wrapper, a DOM node or an HTML string.
     *
     * - SimpleHtmlDomInterface: its underlying node is taken via getNode().
     * - \DOMNode: the node is deep-imported into this parser's own document.
     * - anything else that is not null: handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     *
     * @throws \InvalidArgumentException
     */
    public function __construct($element = null)
    {
        // every new parser starts with an empty list of broken-html placeholders
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $this->document = new \DOMDocument('1.0', $this->getEncoding());
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source instanceof \DOMNode) {
            $imported = $this->document->importNode($source, true);

            if ($imported instanceof \DOMNode) {
                /** @noinspection UnusedFunctionResultInspection */
                $this->document->appendChild($imported);
            }

            return;
        }

        if ($source === null) {
            return;
        }

        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);
    }
167
168
    /**
     * Forward calls to legacy method names to the method registered in self::$functionAliases.
     *
     * The lookup is case-insensitive: the name is lower-cased before it is matched.
     *
     * @param string $name      called method name
     * @param array  $arguments arguments passed on unchanged to the aliased method
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, self::$functionAliases[$alias]], $arguments);
    }
184
185
    /**
     * Static entry points kept for API compatibility: "str_get_html" and "file_get_html".
     *
     * The name is matched case-insensitively, like PHP method names and like __call().
     * A missing first argument defaults to an empty string, a missing second one to null.
     *
     * @param string $name      called static method name
     * @param array  $arguments [0] HTML string or file name, [1] optional extra libxml options
     *
     * @throws \BadMethodCallException if the name is not one of the supported entry points
     * @throws \RuntimeException
     * @throws \InvalidArgumentException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        // method names are case-insensitive in PHP, so STR_GET_HTML() must work as well
        $method = \strtolower($name);

        if ($method === 'str_get_html') {
            $parser = new static();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            $parser = new static();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $method);
    }
215
216
    /** @noinspection MagicMethodsValidityInspection */
217
218
    /**
     * Read-only magic properties, matched case-insensitively.
     *
     * - "outerHtml" / "outerText" => html()
     * - "innerHtml" / "innerText" => innerHtml()
     * - "text" / "plaintext"      => text()
     *
     * @param string $name
     *
     * @return string|null null for every other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
241
242
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector CSS selector
     * @param int|null $idx      index of the wanted element, null for all matches
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
252
253
    /**
     * String conversion yields the same markup as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
260
261
    /**
     * No-op that always reports success; it exists only for API compatibility.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        // nothing to release
        return true;
    }
272
273
    /**
     * Replace special characters with placeholder tokens before the markup is parsed.
     *
     * Inside every URL found in the markup "[", "]", "{" and "}" are replaced by the
     * tokens from self::$domLinkReplaceHelper. After that "&", "|", "+", "%" and "@"
     * are replaced everywhere by the tokens from self::$domReplaceHelper.
     * putReplacedBackToPreserveHtmlEntities() maps the tokens back.
     *
     * @param string $html
     *
     * @return string the markup with placeholders
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // Cheap pre-check before the regex runs. It has to be case-insensitive because
        // the URL pattern below is case-insensitive too ("HTTP://..." is a valid match).
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $linksOld = $matches[1];
                $linksNew = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $linksOld
                );

                // links go first, so each URL is swapped for its bracket-free form
                // before the generic characters are replaced in the whole string
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
313
314
    /**
     * Map the placeholder tokens in the given markup back to their original text.
     *
     * Handled, in this order: broken-html placeholders recorded in
     * self::$domBrokenReplaceHelper, link and character placeholders, the temporary
     * wrapper element (removed) and the special script element (turned back into "script").
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // built once per request; str_replace() pairs the two lists by position
        static $placeholderMap = null;

        if ($placeholderMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            $placeholderMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        // no closing ">" on purpose: only the tag name is swapped
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? [];
        if (\count($brokenPlaceholders) > 0) {
            $html = \str_replace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($placeholderMap['tmp'], $placeholderMap['orig'], $html);
    }
356
357
    /**
     * Create DOMDocument from HTML.
     *
     * Steps: optionally protect broken markup, record which wrappers the input lacks
     * (fixHtmlOutput() strips them from the output again), apply the script-tag
     * workarounds, replace special characters with placeholders, then parse. Parsing
     * is tried as XML first and falls back to DOMDocument::loadHTML() when that fails
     * or reports errors.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed onto the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
                ||
                \strpos($html, 'type=text/html') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and raises a
        // deprecation notice there, so it is only called (and restored) on older versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }

        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // first attempt: parse as XML, accepted only when libxml reports no error at all
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // The haystack is the markup and the needle is "<?xml": the encoding
            // declaration is only prepended when the markup does not start with one.
            $xmlHackUsed = false;
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
485
486
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every "</" inside the content of a script element to "<\/".
     * The markup is modified in place.
     *
     * @param string $html markup, passed by reference
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escaped = \preg_replace_callback(
            $regExSpecialScript,
            static function ($scripts) {
                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error (e.g. backtrack limit hit);
        // the markup is then left untouched instead of being overwritten with null.
        if ($escaped !== null) {
            $html = $escaped;
        }
    }
500
501
    /**
     * Rename every script element of type "text/html" to the special placeholder element.
     *
     * The opening "<script" and the closing "</script>" of each match are swapped for the
     * tag name in self::$domHtmlSpecialScriptHelper, and the "<\/" escaping is undone
     * inside the match. The markup is modified in place.
     *
     * @param string $html markup, passed by reference
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $regExSpecialScript = '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU';
        $found = [];
        \preg_match_all($regExSpecialScript, $html, $found);

        if (!isset($found[0])) {
            return;
        }

        $placeholderTag = self::$domHtmlSpecialScriptHelper;
        $openingLength = \strlen('<script');
        $closingLength = \strlen('</script>');

        foreach ($found[0] as $scriptBlock) {
            // everything between the opening "<script" and the closing "</script>"
            $inner = \substr($scriptBlock, $openingLength, -$closingLength);
            $renamedBlock = '<' . $placeholderTag . $inner . '</' . $placeholderTag . '>';

            // remove the html5 fallback
            $renamedBlock = \str_replace('<\/', '</', $renamedBlock);

            $html = \str_replace($scriptBlock, $renamedBlock, $html);
        }
    }
522
523
    /**
     * Replace closing-tag fragments that have no matching opening tag by placeholders.
     *
     * Pass 1 temporarily masks the angle brackets of every properly paired element.
     * Pass 2 replaces each remaining closing-tag fragment with a placeholder and records
     * the pair in self::$domBrokenReplaceHelper. Finally the masked brackets of the
     * paired elements are restored.
     *
     * If PCRE fails (preg_replace_callback() returns null), the affected pass stops and
     * keeps the markup it had at that point instead of discarding the whole document.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedBrackets = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $plainBrackets = ['</', '<', '>'];

        // pass 1: mask one paired element per round until nothing changes any more
        do {
            $original = $html;

            $result = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

            if ($result === null) {
                // PCRE error: keep the current markup
                break;
            }

            $html = $result;
        } while ($original !== $html);

        // pass 2: whatever still looks like a closing tag is broken markup
        do {
            $original = $html;

            $result = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                static function ($matches) use ($maskedBrackets, $plainBrackets) {
                    $matches['broken'] = \str_replace($maskedBrackets, $plainBrackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($result === null) {
                // PCRE error: keep the current markup
                break;
            }

            $html = $result;
        } while ($original !== $html);

        return \str_replace($maskedBrackets, $plainBrackets, $html);
    }
573
574
    /**
     * Return element by #id.
     *
     * Shortcut for findOne() with the selector "#<id>".
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->findOne('#' . $id);
    }
585
586
    /**
     * Returns elements by #id.
     *
     * Shortcut for find() with the selector "#<id>".
     *
     * @param string   $id
     * @param int|null $idx index of the wanted element, null for all matches
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${id}" interpolation is deprecated as of PHP 8.2
        return $this->find('#' . $id, $idx);
    }
598
599
    /**
     * Return elements by .class.
     *
     * Shortcut for findMulti() with the selector ".<class>".
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        // plain concatenation: "${class}" interpolation is deprecated as of PHP 8.2
        return $this->findMulti('.' . $class);
    }
610
611
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface a SimpleHtmlDomBlank when the document has no such element
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstMatch);
    }
628
629
    /**
     * Returns elements by tag name.
     *
     * Without $idx the whole list is returned, or a SimpleHtmlDomNodeBlank when nothing
     * matched. With $idx the element at that position is returned; a negative $idx
     * counts from the end of the list.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $elements[] = new SimpleHtmlDom($domNode);
        }

        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($elements) + $idx : $idx;

            // NOTE(review): find() falls back to SimpleHtmlDomBlank for a missing single
            // element, this method to SimpleHtmlDomNodeBlank - confirm which is intended.
            return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
        }

        // return all elements
        return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
    }
664
665
    /**
     * Find the first node that matches a CSS selector.
     *
     * Same as find() with index 0.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
676
677
    /**
     * Find all nodes that match a CSS selector.
     *
     * Same as find() without an index.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
688
689
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath and evaluated against the document. Without
     * $idx the whole list is returned, or a SimpleHtmlDomNodeBlank when nothing matched.
     * With $idx the element at that position is returned, or a SimpleHtmlDomBlank when
     * there is none; a negative $idx counts from the end of the list.
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);
        $matches = (new \DOMXPath($this->document))->query($xPathQuery);

        $elements = new SimpleHtmlDomNode();
        foreach ($matches as $domNode) {
            $elements[] = new SimpleHtmlDom($domNode);
        }

        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($elements) + $idx : $idx;

            // return one element
            return $elements[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($elements) === 0 ? new SimpleHtmlDomNodeBlank() : $elements;
    }
726
727
    /**
     * Clean up serialized markup: strip wrappers the input never had, decode it and
     * map the placeholder tokens back.
     *
     * @param string $content                  serialized markup
     * @param bool   $multiDecodeNewHtmlEntity decode repeatedly (until stable, or via
     *                                         UTF8::rawurldecode() when that class exists)
     *                                         instead of a single decoding pass
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // str_replace() handles a list of needles one after the other, so the
        // wrapper tags can be collected first and removed with a single call
        $obsoleteTags = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            /** @noinspection HtmlRequiredLangAttribute */
            $obsoleteTags[] = '<body>';
            $obsoleteTags[] = '</body>';
            $obsoleteTags[] = '<html>';
            $obsoleteTags[] = '</html>';
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            /** @noinspection HtmlRequiredTitleElement */
            $obsoleteTags[] = '<head>';
            $obsoleteTags[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $obsoleteTags[] = '</script>';
        }

        $content = \str_replace($obsoleteTags, '', $content);

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            // only a leading <p> is dropped, but every </p>
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $content = \str_replace(
                [
                    '<p>',
                    '</p>',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                ],
                '',
                $content
            );
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        /** @noinspection HtmlRequiredTitleElement */
        $cleanupMap = [
            '<simpleHtmlDomP>'  => '',
            '</simpleHtmlDomP>' => '',
            '<head><head>'      => '<head>',
            '</head></head>'    => '</head>',
            '<br></br>'         => '<br>',
        ];
        $content = \trim(
            \str_replace(\array_keys($cleanupMap), \array_values($cleanupMap), $content)
        );

        $decodeOnce = static function (string $value): string {
            return \rawurldecode(
                \html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5)
            );
        };

        if (!$multiDecodeNewHtmlEntity) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // decode until another pass no longer changes anything
            do {
                $previous = $content;
                $content = $decodeOnce($content);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
840
841
    /**
     * Expose the DOM document this parser currently holds.
     *
     * The instance itself is returned (no copy is made here), so changes made
     * through the returned object affect this parser as well.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }
848
849
    /**
     * Read the encoding configured on this parser.
     *
     * @return string the value of the "encoding" property
     */
    private function getEncoding(): string
    {
        return $this->encoding;
    }
858
859
    /**
     * Report the "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * NOTE(review): presumably set while the document is created from input
     * that contained no HTML markup - confirm against createDOMDocument().
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
866
867
    /**
     * Report the "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * html() consults this flag: when it is true only the document element is
     * serialized, otherwise the whole document is.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
874
875
    /**
     * Report the "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * NOTE(review): presumably true when the loaded markup had no <head>
     * element of its own - confirm against createDOMDocument().
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
882
883
    /**
     * Report the "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * NOTE(review): presumably true when the loaded markup had no single
     * wrapping element - confirm against createDOMDocument().
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
890
891
    /**
     * Serialize the document to an HTML string (the node's outer html).
     *
     * If a callback has been registered it is invoked first. Afterwards either
     * only the document element or the whole document is serialized, depending
     * on getIsDOMDocumentCreatedWithoutHtmlWrapper(), and the result is passed
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string the cleaned-up HTML, or an empty string if saveHTML() failed
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        $registeredCallback = $this::$callback;
        if ($registeredCallback !== null) {
            // NOTE(review): the callback receives a single argument, an array
            // wrapping this parser (not the parser itself) - kept as is because
            // existing callbacks may rely on it.
            \call_user_func($registeredCallback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        // saveHTML() reports failure with false instead of a string.
        return $content === false
            ? ''
            : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
916
917
    /**
     * Set the "keepBrokenHtml" option of this parser.
     *
     * NOTE(review): the option is only stored here; how it influences parsing
     * is decided elsewhere in the class.
     *
     * @param bool $keepBrokenHtml new value for the option
     *
     * @return HtmlDomParser this instance, to allow method chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
928
929
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, the XML declaration
     * ("<?xml ... ?>") is stripped, leading whitespace is trimmed and the
     * result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string the XML string, or an empty string if serialization failed
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $xml = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // saveXML() returns false on error; under strict_types that value must
        // not reach preg_replace(). Mirror html() and return an empty string.
        if ($xml === false) {
            return '';
        }

        // remove the XML-header
        $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }
945
946
    /**
     * Get dom node's inner html.
     *
     * Every child node of the document element is serialized with saveHTML()
     * and concatenated; the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string the inner html; built from an empty string when the
     *                document has no document element
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // An empty document has no document element; skip the loop instead of
        // reading "childNodes" from null and iterating over a non-iterable.
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
964
965
    /**
     * Load HTML from string.
     *
     * Builds a new DOM document via createDOMDocument() and replaces the one
     * currently held by this parser.
     *
     * @param string   $html               markup to parse
     * @param int|null $libXMLExtraOptions forwarded unchanged to createDOMDocument()
     *
     * @throws \InvalidArgumentException NOTE(review): not thrown in this method
     *                                   itself; presumably raised by
     *                                   createDOMDocument() - confirm
     *
     * @return HtmlDomParser this instance, to allow method chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $freshDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $freshDocument;

        return $this;
    }
981
982
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" (case-insensitive) are not
     * checked for existence; every other path must exist on disk. The content
     * is read with UTF8::file_get_contents() when that class is available,
     * otherwise with the native file_get_contents(), and then handed to
     * loadHtml().
     *
     * @param string   $filePath           local path or http(s) URL
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException         if a local file does not exist or the
     *                                   content could not be read
     * @throws \InvalidArgumentException NOTE(review): not thrown here directly;
     *                                   presumably propagated from loadHtml() -
     *                                   confirm
     *
     * @return HtmlDomParser this instance, to allow method chaining
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        $isHttpUrl = (bool) \preg_match("/^https?:\/\//i", $filePath);

        if (!$isHttpUrl && !\file_exists($filePath)) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible.
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        // file_get_contents() reports failure with false instead of throwing.
        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
1020
1021
    /**
     * Return the inner html of the document and optionally write it to a file.
     *
     * @param string $filepath target file; nothing is written when this is an
     *                         empty string
     *
     * @return string the same string that innerHtml() returns
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->innerHtml();

        if ($filepath === '') {
            return $markup;
        }

        // Written under an exclusive lock; the result of the write is not
        // checked, so a failed write does not change the return value.
        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
1037
1038
    /**
     * Register the callback that html() invokes before it serializes the document.
     *
     * The callback is kept in a static property, so it is not bound to this
     * single instance.
     *
     * @param callable $functionName the callback to store
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }
1045
1046
    /**
     * Get dom node's plain text.
     *
     * Reads the "textContent" of the document and passes it through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1057
1058
    /**
     * Give a cloned parser its own DOM document.
     *
     * Without this the clone would keep referencing the very same
     * \DOMDocument object as the parser it was cloned from.
     */
    public function __clone()
    {
        $ownDocument = clone $this->document;
        $this->document = $ownDocument;
    }
1062
}
1063