Completed
Pull Request — master (#61)
by
unknown
02:29
created

AbstractDomParser::innerHtml()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 13

Duplication

Lines 13
Ratio 100 %

Code Coverage

Tests 6
CRAP Score 3

Importance

Changes 0
Metric Value
dl 13
loc 13
ccs 6
cts 6
cp 1
rs 9.8333
c 0
b 0
f 0
cc 3
nc 2
nop 1
crap 3
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
 * Shared base for the DOM parser implementations.
 *
 * Holds the wrapped \DOMDocument together with the placeholder tables that are
 * used to smuggle characters such as "&", "[" or "{" through the DOM layer
 * (see replaceToPreserveHtmlEntities() / putReplacedBackToPreserveHtmlEntities()),
 * and offers the common serialisation helpers (html / xml / text).
 *
 * NOTE(review): several methods call $this->fixHtmlOutput(), which is not
 * declared in this class; it is expected to be provided by the concrete
 * subclass.
 */
abstract class AbstractDomParser implements DomParserInterface
{
    /**
     * Tag name of a temporary wrapper element. Its opening and closing tags are
     * removed again by putReplacedBackToPreserveHtmlEntities().
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Placeholder name for broken html. Not read inside this class; presumably
     * used by the concrete parsers.
     *
     * @var string
     */
    protected static $domHtmlBrokenHtmlHelper = '____simple_html_dom__voku__broken_html____';

    /**
     * Temporary tag name standing in for "script"; mapped back to
     * "<script" / "</script>" by putReplacedBackToPreserveHtmlEntities().
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_script____';

    /**
     * Optional extra replacement table. When its "tmp" list is non-empty, the
     * "tmp" => "orig" pairs are applied first in
     * putReplacedBackToPreserveHtmlEntities(). It is filled outside this class.
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Characters that are swapped for placeholders inside detected URLs only
     * ("orig" and "tmp" correspond by position).
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Strings that are swapped for placeholders in the whole input
     * ("orig" and "tmp" correspond by position).
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
        ],
    ];

    /**
     * Callback stored via set_callback(); it is not invoked inside this class.
     *
     * @var callable|null
     *
     * @phpstan-var null|callable([\voku\helper\XmlDomParser|\voku\helper\HtmlDomParser]): void
     */
    protected static $callback;

    /**
     * Map of lower-cased alias name => real method name, consulted by __call().
     *
     * @var string[]
     */
    protected static $functionAliases = [];

    /**
     * The wrapped document.
     *
     * @var \DOMDocument
     */
    protected $document;

    /**
     * Encoding returned by getEncoding().
     *
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Dispatch calls to unknown instance methods through the alias table.
     *
     * The method name is lower-cased before the lookup, so aliases are matched
     * case-insensitively.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for $name
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        if (!isset(self::$functionAliases[$name])) {
            throw new \BadMethodCallException('Method does not exist: ' . $name);
        }

        return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
    }

    /**
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
    abstract public static function __callStatic($name, $arguments);

    /**
     * Clone the wrapped document as well, so the copy does not share the
     * \DOMDocument instance with the original parser.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
    }

    /** @noinspection MagicMethodsValidityInspection */

    /**
     * @param string $name
     *
     * @return string|null
     */
    abstract public function __get($name);

    /**
     * @return string
     */
    abstract public function __toString();

    /**
     * does nothing (only for api-compatibility-reasons)
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }

    /**
     * Create DOMDocument from HTML.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return \DOMDocument
     */
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;

    /**
     * Decode html entities and url-encoded sequences in $content.
     *
     * If the class \voku\helper\UTF8 is available, the work is delegated to
     * UTF8::rawurldecode(). Otherwise html_entity_decode() + rawurldecode()
     * are applied: once by default, or repeatedly until the string no longer
     * changes when $multiDecodeNewHtmlEntity is true.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Fallback without the UTF8 helper: always decode at least once and
        // keep going only in multi-decode mode while the result still changes.
        do {
            $contentBefore = $content;

            $content = \rawurldecode(
                \html_entity_decode(
                    $content,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($multiDecodeNewHtmlEntity && $contentBefore !== $content);

        return $content;
    }

    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string   $selector
     * @param int|null $idx
     */
    abstract public function find(string $selector, $idx = null);

    /**
     * Find nodes with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findMulti(string $selector);

    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findMultiOrFalse(string $selector);

    /**
     * Find one node with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findOne(string $selector);

    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findOneOrFalse(string $selector);

    /**
     * Get the wrapped document.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }

    /**
     * Get dom node's outer html.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;

    /**
     * Get dom node's inner html.
     *
     * Serialises every child of the document element with saveHTML() and
     * passes the concatenated markup through fixHtmlOutput(). A document
     * without a document element yields the fixed-up empty string.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $text = '';

        $rootElement = $this->document->documentElement;
        if ($rootElement) {
            foreach ($rootElement->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get dom node's inner xml.
     *
     * Same as innerHtml(), but every child of the document element is
     * serialised with saveXML() instead of saveHTML().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $text = '';

        $rootElement = $this->document->documentElement;
        if ($rootElement) {
            foreach ($rootElement->childNodes as $node) {
                $text .= $this->document->saveXML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }

    /**
     * Load HTML from string.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return DomParserInterface
     */
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Load HTML from file.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException
     *
     * @return DomParserInterface
     */
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Save the html-dom as string.
     *
     * The markup is always returned; it is additionally written to $filepath
     * (with an exclusive lock) when a non-empty path is given. The result of
     * the write is not checked here.
     *
     * @param string $filepath
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $string = $this->html();

        if ($filepath !== '') {
            \file_put_contents($filepath, $string, \LOCK_EX);
        }

        return $string;
    }

    /**
     * Store a callback in the static $callback slot (late static binding, so
     * a subclass that redeclares the property gets its own slot).
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }

    /**
     * Get dom node's plain text.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput($this->document->textContent, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml        true: post-process via fixHtmlOutput();
     *                               false: only decode entities and restore placeholders
     * @param bool $removeXmlHeader  strip the leading "<?xml ... ?>" declaration
     * @param int  $options          libxml options handed to DOMDocument::saveXML()
     *
     * @return string the serialised document, or '' if saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // Only the XML declaration at the very start is removed. The pattern
            // is anchored, non-greedy and limited to one replacement, so other
            // processing instructions (e.g. "<?xml-stylesheet ...?>") and the
            // content between them are left alone. If PCRE fails, keep the
            // unmodified string instead of returning an empty result.
            $xml = \ltrim(\preg_replace('/^<\?xml\s.*?\?>/s', '', $xml, 1) ?? $xml);
        }

        if ($htmlToXml) {
            $return = $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        } else {
            $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            $return = self::putReplacedBackToPreserveHtmlEntities($xml);
        }

        return $return;
    }

    /**
     * Get the encoding to use.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every "</" inside the content of a <script> element to "<\/".
     * Script elements with empty content are left untouched. $html is modified
     * in place and stays unchanged if the regular expression fails.
     *
     * @param string $html
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';
        $htmlTmp = \preg_replace_callback(
            $regExSpecialScript,
            static function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // preg_replace_callback() returns null on error
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }

    /**
     * Reverse replaceToPreserveHtmlEntities(): turn the placeholders back into
     * the original characters, drop the temporary wrapper tags and restore the
     * special script tags.
     *
     * The combined search / replace lists are built once and cached in a
     * static variable. Entries of $domBrokenReplaceHelper (if any) are applied
     * first. All replacements are case-insensitive.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            // NOTE: "tmp" and "orig" are paired by position, so both lists must
            // be filled in exactly the same order.
            $DOM_REPLACE__HELPER_CACHE['tmp'] = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $DOM_REPLACE__HELPER_CACHE['orig'] = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            $DOM_REPLACE__HELPER_CACHE['tmp']['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $DOM_REPLACE__HELPER_CACHE['tmp']['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';

            $DOM_REPLACE__HELPER_CACHE['orig']['html_wrapper__start'] = '';
            $DOM_REPLACE__HELPER_CACHE['orig']['html_wrapper__end'] = '';

            $DOM_REPLACE__HELPER_CACHE['tmp']['html_special_script__start'] = '<' . self::$domHtmlSpecialScriptHelper;
            $DOM_REPLACE__HELPER_CACHE['tmp']['html_special_script__end'] = '</' . self::$domHtmlSpecialScriptHelper . '>';

            $DOM_REPLACE__HELPER_CACHE['orig']['html_special_script__start'] = '<script';
            $DOM_REPLACE__HELPER_CACHE['orig']['html_special_script__end'] = '</script>';
        }

        if (
            isset(self::$domBrokenReplaceHelper['tmp'])
            &&
            \count(self::$domBrokenReplaceHelper['tmp']) > 0
        ) {
            $html = \str_ireplace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }

    /**
     * Replace characters listed in $domReplaceHelper by placeholders; inside
     * detected http(s) URLs the characters of $domLinkReplaceHelper are
     * replaced as well.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // init
        $linksNew = [];
        $linksOld = [];

        // Cheap pre-check before running the regex. It has to be
        // case-insensitive because the URL pattern below uses the "i" flag;
        // otherwise URLs such as "HTTP://..." would be skipped.
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            \preg_match_all($regExUrl, $html, $linksOld);

            if (!empty($linksOld[1])) {
                $linksOld = $linksOld[1];
                foreach ((array) $linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }
            }
        }

        $linksNewCount = \count($linksNew);
        if ($linksNewCount > 0 && \count($linksOld) === $linksNewCount) {
            // links first, so the generic replacements below also apply to the
            // already rewritten links
            $search = \array_merge($linksOld, self::$domReplaceHelper['orig']);
            $replace = \array_merge($linksNew, self::$domReplaceHelper['tmp']);
        } else {
            $search = self::$domReplaceHelper['orig'];
            $replace = self::$domReplaceHelper['tmp'];
        }

        return \str_replace($search, $replace, $html);
    }
}
499