Completed
Push — master ( fc06c3...487741 )
by Lars
01:54
created

getIsDOMDocumentCreatedWithoutWrapper()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 1
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
 * @property-read string plaintext <p>Get dom node's plain text.</p>
 *
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
 * @method string outerHtml() <p>Get dom node's outer html.</p>
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from file.</p>
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from string.</p>
 */
22
class HtmlDomParser
23
{
24
    /**
     * Lower-cased legacy method names accepted by __call(), mapped to the
     * real method that is invoked instead.
     *
     * @var array<string, string>
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Characters that are masked inside detected URLs ('orig') and the
     * placeholder used for each of them ('tmp'); both lists are index-aligned.
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Characters that are masked in the whole input ('orig') and the
     * placeholder used for each of them ('tmp'); both lists are index-aligned.
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
        ],
    ];

    /**
     * Tag name of the artificial root element that createDOMDocument() wraps
     * around the input in some cases.
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Tag name that temporarily replaces "script" for type="text/html" scripts.
     * The spelling ("sctipt") is part of the runtime value and is kept as is.
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';

    /**
     * Broken-markup fragments ('orig') and their placeholders ('tmp'), filled
     * by keepBrokenHtml() and reset by the constructor.
     *
     * @var string[][]
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Optional hook that html() calls before serialising; null when unset.
     *
     * @var callable|null
     */
    protected static $callback;

    /**
     * @var \DOMDocument
     */
    protected $document;

    /**
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Set by createDOMDocument() when the input contained no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by createDOMDocument() when the input contains markup but does not
     * start with a tag.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by createDOMDocument() when the input contained no "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by createDOMDocument() when the input contained no "<html".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Set by createDOMDocument() when the input only contained escaped
     * ("<\/script>") closing script tags.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * Whether broken markup should be preserved; null until
     * useKeepBrokenHtml() has been called.
     *
     * @var bool|null
     */
    protected $keepBrokenHtml;
116
117
    /**
     * Build a parser, optionally pre-filled from markup or an existing node.
     *
     * A SimpleHtmlDom is unwrapped to its node first; a \DOMNode is deep-imported
     * into the fresh document; anything else that is not null is handed to
     * loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDom|string|null $element HTML code or SimpleHtmlDom, \DOMNode
     *
     * @throws \InvalidArgumentException
     */
    public function __construct($element = null)
    {
        // the broken-markup registry is static, so every new parser starts clean
        self::$domBrokenReplaceHelper = [];

        $document = new \DOMDocument('1.0', $this->getEncoding());
        $document->preserveWhiteSpace = true;
        $document->formatOutput = true;
        $this->document = $document;

        $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!$source instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->loadHtml($source);

            return;
        }

        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    }
155
156
    /**
     * Dispatch legacy method names (see $functionAliases) to their replacement.
     *
     * NOTE(review): static analysis reports this method as duplicated elsewhere
     * in the project.
     *
     * @param string $name      called method name, matched case-insensitively
     * @param array  $arguments arguments forwarded unchanged
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        return \call_user_func_array($target, $arguments);
    }
172
173
    /**
     * Static entry points kept for API compatibility: "str_get_html" and
     * "file_get_html".
     *
     * The first argument (default '') is the HTML string / file, the second
     * (default null) the extra libxml options; both are forwarded to
     * loadHtml() / loadHtmlFile() on a fresh parser.
     *
     * NOTE(review): static analysis flags the returned "self" as incompatible
     * with the documented HtmlDomParser; inside this class both name the same type.
     *
     * @param string $name      called method name, matched case-insensitively
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     * @throws \InvalidArgumentException
     *
     * @return HtmlDomParser
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';
        $arguments1 = $arguments[1] ?? null;

        // PHP method names are case-insensitive and __call() already lower-cases,
        // so e.g. HtmlDomParser::Str_Get_Html() has to resolve here as well
        $method = \strtolower((string) $name);

        if ($method === 'str_get_html') {
            return (new self())->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            return (new self())->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist');
    }
203
204
    /** @noinspection MagicMethodsValidityInspection */
205
206
    /**
     * Magic read access for the legacy properties; names are matched
     * case-insensitively.
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
     * innerHtml() and "text"/"plaintext" to text().
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
229
230
    /**
     * Calling the parser like a function is a shortcut for find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
240
241
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
248
249
    /**
     * No-op that only exists for api-compatibility-reasons.
     *
     * @deprecated
     *
     * @return bool always true
     */
    public function clear(): bool
    {
        $done = true;

        return $done;
    }
260
261
    /**
     * Mask characters that would otherwise be altered while the markup goes
     * through libxml.
     *
     * "&", "|", "+", "%" and "@" are replaced by placeholders everywhere; in
     * addition, brackets and braces inside detected http(s) URLs are replaced.
     * putReplacedBackToPreserveHtmlEntities() reverses the substitution.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') === false) {
            return \str_replace($search, $replace, $html);
        }

        // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
        $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
        $found = [];
        \preg_match_all($regExUrl, $html, $found);

        if (empty($found[1])) {
            return \str_replace($search, $replace, $html);
        }

        $urls = $found[1];
        $maskedUrls = [];
        foreach ($urls as $urlKey => $url) {
            $maskedUrls[$urlKey] = \str_replace(
                self::$domLinkReplaceHelper['orig'],
                self::$domLinkReplaceHelper['tmp'],
                $url
            );
        }

        // the URLs go first, so their brackets are masked before the generic
        // character replacements run over the whole string
        return \str_replace(
            \array_merge($urls, $search),
            \array_merge($maskedUrls, $replace),
            $html
        );
    }
301
302
    /**
     * Reverse replaceToPreserveHtmlEntities() and strip the helper tags.
     *
     * Restores, in this order: the broken-markup fragments registered by
     * keepBrokenHtml(), the masked URL and generic characters, the artificial
     * wrapper element (removed) and the renamed special script tags.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // the lookup tables never change, so they are built once per process
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            $wrapper = self::$domHtmlWrapperHelper;
            $specialScript = self::$domHtmlSpecialScriptHelper;

            // 'tmp' and 'orig' must list their entries in the same order
            $DOM_REPLACE__HELPER_CACHE = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapper . '>',
                        'html_wrapper__end'          => '</' . $wrapper . '>',
                        'html_special_script__start' => '<' . $specialScript,
                        'html_special_script__end'   => '</' . $specialScript . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_replace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_replace(
            $DOM_REPLACE__HELPER_CACHE['tmp'],
            $DOM_REPLACE__HELPER_CACHE['orig'],
            $html
        );
    }
344
345
    /**
     * Create DOMDocument from HTML.
     *
     * Records what the input looked like in the isDOMDocumentCreated* flags,
     * masks special characters, then tries to parse the input as XML and
     * falls back to DOMDocument::loadHTML() if that fails or reports errors.
     * The result is stored in $this->document and returned.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions additional LIBXML_* flags, OR-ed into the defaults
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            if (
                \strpos($html, 'type="text/html"') !== false
                ||
                \strpos($html, 'type=\'text/html\'') !== false
            ) {
                $this->keepSpecialScriptTags($html);
            }
        }

        // set error level (the previous values are restored at the end)
        $internalErrors = \libxml_use_internal_errors(true);
        // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0
        $disableEntityLoader = \libxml_disable_entity_loader(true);
        \libxml_clear_errors();

        $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= \LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= \LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= \LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->keepBrokenHtml
        ) {
            // give the fragment a single artificial root; it is stripped again
            // by putReplacedBackToPreserveHtmlEntities()
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $this->document = \dom_import_simplexml($sxe)->ownerDocument;
        } else {

            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            $xmlHackUsed = false;
            // stripos(haystack, needle): only prepend the declaration when the
            // markup does not already start with one (the arguments used to be
            // swapped, which made this test meaningless)
            if (\stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        /** @noinspection UnusedFunctionResultInspection */
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        \libxml_disable_entity_loader($disableEntityLoader);

        return $this->document;
    }
464
465
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Escapes every "</" inside the content of script elements as "<\/" and
     * normalises the script tags themselves to lower case. The markup is
     * modified in place; it is left untouched if the regular expression fails.
     *
     * @param string $html passed by reference
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escaped = \preg_replace_callback(
            $regExSpecialScript,
            static function (array $scripts): string {
                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error (e.g. backtrack
        // limit on huge inputs); never let that wipe the caller's markup
        if ($escaped !== null) {
            $html = $escaped;
        }
    }
478
479
    /**
     * Rename script elements of type "text/html" to a helper tag, so their
     * content is parsed as markup instead of as script text.
     *
     * The "<\/" escaping added by html5FallbackForScriptTags() is undone
     * inside those elements. The markup is modified in place.
     *
     * @param string $html passed by reference
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $regExSpecialScript = '/<(script) [^>]*type=(["|\'])text\/html\2([^>]*)>.*<\/\1>/isU';
        $found = [];
        \preg_match_all($regExSpecialScript, $html, $found);

        $helperTag = self::$domHtmlSpecialScriptHelper;
        $openLength = \strlen('<script');
        $closeLength = \strlen('</script>');

        foreach ($found[0] ?? [] as $scriptElement) {
            // everything between the leading "<script" and the trailing "</script>"
            $remainder = \substr($scriptElement, $openLength, -$closeLength);
            $renamed = '<' . $helperTag . $remainder . '</' . $helperTag . '>';

            // remove the html5 fallback
            $renamed = \str_replace('<\/', '</', $renamed);

            $html = \str_replace($scriptElement, $renamed, $html);
        }
    }
501
502
    /**
     * Replace broken markup (closing tags without a matching opening tag)
     * by placeholders, so it survives parsing.
     *
     * Pass 1 masks the delimiters of every properly paired element; pass 2
     * swaps the remaining closing tags for placeholders and registers them in
     * $domBrokenReplaceHelper; finally the masked delimiters are restored.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        // index-aligned: masked delimiter => real delimiter
        $masked = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $real = ['</', '<', '>'];

        // pass 1: repeat until no paired element is left unmasked
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                function ($matches) {
                    $openingTag = '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $closingTag = '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $matches['start'] . $openingTag . $matches['value'] . $closingTag . $matches['end'];
                },
                $html
            );
        } while ($before !== $html);

        // pass 2: repeat until every leftover closing tag has a placeholder
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                function ($matches) use ($masked, $real) {
                    $broken = \str_replace($masked, $real, $matches['broken']);
                    $placeholder = '____simple_html_dom__voku__broken_html____' . \crc32($broken);

                    self::$domBrokenReplaceHelper['orig'][] = $broken;
                    self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                    return $matches['start'] . $placeholder . $matches['end'];
                },
                $html
            );
        } while ($before !== $html);

        return \str_replace($masked, $real, $html);
    }
552
553
    /**
     * Return element by #id.
     *
     * Shortcut for find('#<id>', 0).
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        // plain concatenation: "${id}" interpolation is deprecated since PHP 8.2
        return $this->find('#' . $id, 0);
    }
564
565
    /**
     * Return element by tag name.
     *
     * Only the first matching element of the document is considered.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank blank node if nothing matches
     */
    public function getElementByTagName(string $name)
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomNodeBlank()
            : new SimpleHtmlDom($firstMatch);
    }
582
583
    /**
     * Returns elements by #id.
     *
     * Shortcut for find('#<id>', $idx).
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        // plain concatenation: "${id}" interpolation is deprecated since PHP 8.2
        return $this->find('#' . $id, $idx);
    }
595
596
    /**
     * Returns elements by tag name.
     *
     * NOTE(review): static analysis reports this method as duplicated elsewhere
     * in the project.
     *
     * @param string   $name
     * @param int|null $idx null for all matches; a negative value counts from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();
        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $elements[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // handle negative values
        $position = $idx < 0 ? \count($elements) + $idx : $idx;

        // return one element
        return $elements[$position] ?? new SimpleHtmlDomNodeBlank();
    }
627
628
    /**
     * Find one node with a CSS selector.
     *
     * Shortcut for find($selector, 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
639
640
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath by SelectorConverter and evaluated
     * against the current document.
     *
     * @param string   $selector
     * @param int|null $idx null for all matches; a negative value counts from the end
     *
     * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeInterface
     *                                                                  all matches if $idx is null, otherwise the
     *                                                                  match at that position or a blank node
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);
        $nodesList = $xPath->query($xPathQuery);
        $elements = new SimpleHtmlDomNode();

        // DOMXPath::query() returns false for a malformed expression; treat
        // that as "no matches" instead of iterating over a boolean
        if ($nodesList !== false) {
            foreach ($nodesList as $node) {
                $elements[] = new SimpleHtmlDom($node);
            }
        }

        // return all elements
        if ($idx === null) {
            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        return $elements[$idx] ?? new SimpleHtmlDomNodeBlank();
    }
673
674
    /**
     * Clean up markup serialised from the document: remove what libxml added
     * on its own, decode entities and restore the masked characters.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity decode repeatedly until the content no longer changes
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // tags the input did not contain; needles are removed in list order
        $notInInput = [];

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
            $notInInput[] = '<body>';
            $notInInput[] = '</body>';
            $notInInput[] = '<html>';
            $notInInput[] = '</html>';
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
            $notInInput[] = '<head>';
            $notInInput[] = '</head>';
        }

        if ($this->isDOMDocumentCreatedWithFakeEndScript) {
            $notInInput[] = '</script>';
        }

        if ($notInInput !== []) {
            $content = \str_replace($notInInput, '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper) {
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml) {
            $content = \str_replace(
                [
                    '<p>',
                    '</p>',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                ],
                '',
                $content
            );
        }

        // index-aligned: serialisation artefact => replacement
        $artefacts = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
        $cleaned = ['', '', '<head>', '</head>', '<br>'];
        $content = \trim(\str_replace($artefacts, $cleaned, $content));

        $decodeOnce = static function (string $value): string {
            return \rawurldecode(\html_entity_decode($value, \ENT_QUOTES | \ENT_HTML5));
        };

        if (!$multiDecodeNewHtmlEntity) {
            $content = $decodeOnce($content);
        } elseif (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $content = UTF8::rawurldecode($content);
        } else {
            // without the optional UTF8 helper: decode until nothing changes anymore
            do {
                $previous = $content;
                $content = $decodeOnce($content);
            } while ($previous !== $content);
        }

        return self::putReplacedBackToPreserveHtmlEntities($content);
    }
787
788
    /**
     * Expose the underlying DOM document.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
795
796
    /**
     * Get the encoding to use.
     *
     * @return string value of the $encoding property
     */
    private function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
805
806
    /**
     * Whether the loaded input contained no "<" at all.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtml;

        return $flag;
    }
813
814
    /**
     * Whether the loaded input contained no "<html".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

        return $flag;
    }
821
822
    /**
     * Whether the loaded input contained no "<head>".
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

        return $flag;
    }
829
830
    /**
     * Whether the loaded input contained markup but did not start with a tag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        $flag = $this->isDOMDocumentCreatedWithoutWrapper;

        return $flag;
    }
837
838
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is called first, with an array
     * holding this parser as its only argument. When the input had no "<html"
     * wrapper only the document element is serialised, otherwise the whole
     * document; the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }
859
860
    /**
     * Enable or disable the "keepBrokenHtml" option (fluent setter).
     *
     * NOTE(review): the option is only stored here; how it influences parsing
     * is decided elsewhere in the class - confirm against createDOMDocument().
     *
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser this instance, to allow method chaining
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
871
872
    /**
     * Get the HTML as XML.
     *
     * The document is serialized with LIBXML_NOEMPTYTAG, the XML declaration is
     * stripped and the result is post-processed by fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // Drop the XML declaration. The pattern is not anchored, so every
        // matching "<?xml" sequence that fits on one line is removed.
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);

        return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
    }
888
889
    /**
     * Get dom node's inner html.
     *
     * Concatenates the serialized children of the document element and passes
     * the result through fixHtmlOutput(). A document without a document element
     * (e.g. nothing was loaded yet) contributes an empty string.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // "documentElement" is null for an empty document: skip the loop instead
        // of reading a property of null and iterating over a non-iterable value.
        $rootElement = $this->document->documentElement;
        if ($rootElement !== null) {
            foreach ($rootElement->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }
907
908
    /**
     * Load HTML from string.
     *
     * Replaces the current document with a new one built by createDOMDocument().
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions forwarded unchanged to createDOMDocument()
     *
     * @throws \InvalidArgumentException NOTE(review): declared by the original docblock;
     *                                   presumably raised by createDOMDocument() - confirm
     *
     * @return HtmlDomParser this instance, to allow method chaining
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $newDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
        $this->document = $newDocument;

        return $this;
    }
924
925
    /**
     * Load HTML from file.
     *
     * The path may be a local file or a "http://" / "https://" URL; only non-URL
     * paths are checked for existence before reading. The content is read via
     * UTF8::file_get_contents() when that class is available, otherwise via the
     * native file_get_contents(), and is then handed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions forwarded unchanged to loadHtml()
     *
     * @throws \RuntimeException         if a local file does not exist or the content could not be read
     * @throws \InvalidArgumentException NOTE(review): declared by the original docblock;
     *                                   presumably raised while the HTML is loaded - confirm
     *
     * @return HtmlDomParser
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File {$filePath} not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the root cause reachable via getPrevious()
            throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file {$filePath}");
        }

        return $this->loadHtml($html, $libXMLExtraOptions);
    }
963
964
    /**
     * Save the html-dom as string.
     *
     * The string is the result of innerHtml(). When a non-empty path is given,
     * the same string is also written to that file under an exclusive lock;
     * the outcome of the write is not checked.
     *
     * @param string $filepath target file, or an empty string to skip writing
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->innerHtml();

        if ($filepath === '') {
            return $markup;
        }

        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
980
981
    /**
     * Register the callback that html() invokes before it serializes the document.
     *
     * The value is stored in a static property, so it is not bound to this
     * single instance. Passing null disables the callback again, because html()
     * skips the call for a null value.
     *
     * @param $functionName
     */
    public function set_callback($functionName)
    {
        $this::$callback = $functionName;
    }
988
989
    /**
     * Get dom node's plain text.
     *
     * Uses the "textContent" of the whole document and passes it through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1000
1001
    /**
     * Give the cloned parser its own copy of the DOM document, so that the
     * clone and the original do not share the same \DOMDocument instance.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;
        $this->document = $documentCopy;
    }
1005
}
1006