Completed
Push — master ( 75b23e...4c12d1 )
by Lars
04:25 queued 02:06
created

AbstractDomParser::createDOMDocument()

Size

Total Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 1
ccs 0
cts 0
cp 0
c 0
b 0
f 0
nc 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
10
     * @var string
11
     */
12
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
13
14
    /**
15
     * @var string
16
     */
17
    protected static $domHtmlBrokenHtmlHelper = '____simple_html_dom__voku__broken_html____';
18
19
    /**
20
     * @var string
21
     */
22
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_script____';
23
24
    /**
25
     * @var array
26
     */
27
    protected static $domBrokenReplaceHelper = [];
28
29
    /**
30
     * @var string[][]
31
     */
32
    protected static $domLinkReplaceHelper = [
33
        'orig' => ['[', ']', '{', '}'],
34
        'tmp'  => [
35
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
36
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
37
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
38
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
39
        ],
40
    ];
41
42
    /**
43
     * @var string[][]
44
     */
45
    protected static $domReplaceHelper = [
46
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
47
        'tmp'  => [
48
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
49
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
50
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
51
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
52
            '____SIMPLE_HTML_DOM__VOKU__AT____',
53
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
54
        ],
55
    ];
56
57
    /**
58
     * @var callable|null
59
     */
60
    protected static $callback;
61
62
    /**
63
     * @var string[]
64
     */
65
    protected static $functionAliases = [];
66
67
    /**
68
     * @var \DOMDocument
69
     */
70
    protected $document;
71
72
    /**
73
     * @var string
74
     */
75
    protected $encoding = 'UTF-8';
76
77
    /**
78
     * @param string $name
79
     * @param array  $arguments
80
     *
81
     * @return bool|mixed
82
     */
83
    public function __call($name, $arguments)
    {
        // Purpose: forward calls to undefined instance methods to the real
        // method registered for that name in the alias table, or fail loudly.
        //
        // The name is lower-cased before the lookup, so only lower-case keys
        // in the alias table can ever match.
        $name = \strtolower($name);

        // Late static binding ("static::") instead of "self::": with "self::"
        // only the table declared in this abstract class (initialised to an
        // empty array) is read, so a subclass that redeclares
        // $functionAliases would never have its aliases honoured. When the
        // property is not redeclared, "static::" resolves to the very same
        // storage, so existing behaviour is unchanged.
        if (isset(static::$functionAliases[$name])) {
            // call_user_func_array() is kept deliberately: the target is
            // invoked through a callback rather than a direct typed call.
            return \call_user_func_array([$this, static::$functionAliases[$name]], $arguments);
        }

        // No alias registered: report the (lower-cased) name to the caller.
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
93
94
    /**
95
     * @param string $name
96
     * @param array  $arguments
97
     *
98
     * @throws \BadMethodCallException
99
     * @throws \RuntimeException
100
     *
101
     * @return static
102
     */
103
    abstract public static function __callStatic($name, $arguments);
104
105
    public function __clone()
    {
        // Clone the wrapped DOMDocument as well, so the copied parser gets
        // its own document object instead of sharing the original's.
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
109
110
    /** @noinspection MagicMethodsValidityInspection */
111
112
    /**
113
     * @param string $name
114
     *
115
     * @return string|null
116
     */
117
    abstract public function __get($name);
118
119
    /**
120
     * @return string
121
     */
122
    abstract public function __toString();
123
124
    /**
125
     * does nothing (only for api-compatibility-reasons)
126
     *
127
     * @return bool
128
     *
129
     * @deprecated
130
     */
131
    public function clear(): bool
    {
        // Deliberate no-op kept for API compatibility: nothing is released
        // or reset here, the call simply reports success.
        $alwaysSuccessful = true;

        return $alwaysSuccessful;
    }
135
136
    /**
137
     * Create DOMDocument from HTML.
138
     *
139
     * @param string   $html
140
     * @param int|null $libXMLExtraOptions
141
     *
142
     * @return \DOMDocument
143
     */
144
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
145
146
    /**
147
     * @param string $content
148
     * @param bool   $multiDecodeNewHtmlEntity
149
     *
150
     * @return string
151
     */
152 119
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        // Prefer the optional UTF8 helper when it is installed; the flag is
        // handed over unchanged.
        // NOTE(review): presumably UTF8::rawurldecode() repeats the decoding
        // when the flag is true - confirm against the helper's documentation.
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Native fallback: decode HTML entities first, then percent-encoded
        // sequences. Without the flag this runs exactly once; with the flag
        // it is repeated until a pass no longer changes the string.
        do {
            $beforePass = $content;

            $entitiesDecoded = \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5);
            $content = \rawurldecode($entitiesDecoded);
        } while ($multiDecodeNewHtmlEntity && $beforePass !== $content);

        return $content;
    }
187
188
    /**
189
     * Find list of nodes with a CSS selector.
190
     *
191
     * @param string   $selector
192
     * @param int|null $idx
193
     */
194
    abstract public function find(string $selector, $idx = null);
195
196
    /**
197
     * Find nodes with a CSS selector.
198
     *
199
     * @param string $selector
200
     */
201
    abstract public function findMulti(string $selector);
202
203
    /**
204
     * Find nodes with a CSS selector or false, if no element is found.
205
     *
206
     * @param string $selector
207
     */
208
    abstract public function findMultiOrFalse(string $selector);
209
210
    /**
211
     * Find one node with a CSS selector.
212
     *
213
     * @param string $selector
214
     */
215
    abstract public function findOne(string $selector);
216
217
    /**
218
     * Find one node with a CSS selector or false, if no element is found.
219
     *
220
     * @param string $selector
221
     */
222
    abstract public function findOneOrFalse(string $selector);
223
224
    /**
225
     * @return \DOMDocument
226
     */
227 47
    public function getDocument(): \DOMDocument
    {
        // Expose the wrapped DOMDocument instance itself (no clone is made).
        $wrappedDocument = $this->document;

        return $wrappedDocument;
    }
231
232
    /**
233
     * Get dom node's outer html.
234
     *
235
     * @param bool $multiDecodeNewHtmlEntity
236
     *
237
     * @return string
238
     */
239
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;
240
241
    /**
242
     * Get dom node's inner html.
243
     *
244
     * @param bool $multiDecodeNewHtmlEntity
245
     *
246
     * @return string
247
     */
248 28 View Code Duplication
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // NOTE(review): a code-duplication inspection reports this method as
        // duplicated elsewhere in the project.

        // Serialize every direct child of the document element as HTML; a
        // document without a root element yields an empty string.
        $serializedChildren = [];

        $rootElement = $this->document->documentElement;
        if ($rootElement) {
            foreach ($rootElement->childNodes as $childNode) {
                $serializedChildren[] = $this->document->saveHTML($childNode);
            }
        }

        // The concatenated markup is post-processed by fixHtmlOutput().
        return $this->fixHtmlOutput(\implode('', $serializedChildren), $multiDecodeNewHtmlEntity);
    }
261
262
    /**
263
     * Get dom node's inner XML.
264
     *
265
     * @param bool $multiDecodeNewHtmlEntity
266
     *
267
     * @return string
268
     */
269 View Code Duplication
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // NOTE(review): a code-duplication inspection reports this method as
        // duplicated elsewhere in the project.

        // Serialize every direct child of the document element as XML; a
        // document without a root element yields an empty string.
        $serializedChildren = [];

        $rootElement = $this->document->documentElement;
        if ($rootElement) {
            foreach ($rootElement->childNodes as $childNode) {
                $serializedChildren[] = $this->document->saveXML($childNode);
            }
        }

        // The concatenated markup is post-processed by fixHtmlOutput().
        return $this->fixHtmlOutput(\implode('', $serializedChildren), $multiDecodeNewHtmlEntity);
    }
282
283
    /**
284
     * Load HTML from string.
285
     *
286
     * @param string   $html
287
     * @param int|null $libXMLExtraOptions
288
     *
289
     * @return DomParserInterface
290
     */
291
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
292
293
    /**
294
     * Load HTML from file.
295
     *
296
     * @param string   $filePath
297
     * @param int|null $libXMLExtraOptions
298
     *
299
     * @throws \RuntimeException
300
     *
301
     * @return DomParserInterface
302
     */
303
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
304
305
    /**
306
     * Save the html-dom as string.
307
     *
308
     * @param string $filepath
309
     *
310
     * @return string
311
     */
312 13
    public function save(string $filepath = ''): string
    {
        // Render the document via html() with its default arguments.
        $markup = $this->html();

        // An empty path means "render only": nothing is written to disk.
        if ($filepath === '') {
            return $markup;
        }

        // Write under an exclusive lock. The result of the write is not
        // inspected, so the markup is returned whether or not it succeeded.
        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
321
322
    /**
323
     * @param callable $functionName
324
     */
325
    public function set_callback($functionName)
    {
        // The callback lives in a static property resolved through late
        // static binding, so it is stored per class rather than per instance.
        $callback = $functionName;

        static::$callback = $callback;
    }
329
330
    /**
331
     * Get dom node's plain text.
332
     *
333
     * @param bool $multiDecodeNewHtmlEntity
334
     *
335
     * @return string
336
     */
337 4
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        // Take the text content of the whole document and run it through the
        // same output post-processing as the HTML getters.
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
341
342
    /**
343
     * Get the HTML as XML or plain XML if needed.
344
     *
345
     * @param bool $multiDecodeNewHtmlEntity
346
     * @param bool $htmlToXml
347
     * @param bool $removeXmlHeader
348
     * @param int  $options
349
     *
350
     * @return string
351
     */
352 2 View Code Duplication
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        // NOTE(review): a code-duplication inspection reports this method as
        // duplicated elsewhere in the project.

        // Serialize the whole document; a failed serialization is reported
        // as an empty string rather than as false.
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        // Optionally strip the "<?xml ... ?>" declaration and any leading
        // whitespace that remains after it.
        if ($removeXmlHeader) {
            $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim((string) $withoutHeader);
        }

        // Plain-XML mode: only decode entities and restore the placeholder
        // tokens, skipping the HTML-specific output fix-ups.
        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
377
378
    /**
379
     * Get the encoding to use.
380
     *
381
     * @return string
382
     */
383 209
    protected function getEncoding(): string
    {
        // Plain accessor for the configured encoding.
        $configuredEncoding = $this->encoding;

        return $configuredEncoding;
    }
387
388
    /**
389
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
390
     *
391
     * @param string $html
392
     */
393 20
    protected function html5FallbackForScriptTags(string &$html)
    {
        // Matches e.g. [<script id="elements-image-2">...</script>]; the "U"
        // modifier makes the quantifiers lazy, so each script element is
        // matched on its own.
        /** @noinspection HtmlDeprecatedTag */
        $scriptPattern = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        // Rewrites one matched script element: every "</" inside the script
        // body becomes "<\/". Elements whose body is empty() are returned
        // exactly as matched.
        $escapeScriptBody = static function ($match) {
            if (empty($match['content'])) {
                return $match[0];
            }

            $escapedBody = \str_replace('</', '<\/', $match['content']);

            return '<script' . $match['attr'] . '>' . $escapedBody . '</script>';
        };

        $rewritten = \preg_replace_callback($scriptPattern, $escapeScriptBody, $html);

        // null signals a PCRE failure; the caller's string is then left as is.
        if ($rewritten === null) {
            return;
        }

        // $html is taken by reference: the result is written back in place.
        $html = $rewritten;
    }
414
415
    /**
416
     * @param string $html
417
     *
418
     * @return string
419
     */
420 135
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // Placeholder => original lookup lists, built on the first call and
        // reused afterwards. str_ireplace() pairs the two lists by position,
        // so both must be assembled in exactly the same order.
        static $placeholderMap = null;

        if ($placeholderMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $specialScriptTag = self::$domHtmlSpecialScriptHelper;

            $placeholderMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $specialScriptTag,
                        'html_special_script__end'   => '</' . $specialScriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        // The wrapper tags are dropped entirely.
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        // The special script tags become regular script tags.
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        // Replacements recorded for broken HTML are applied first, but only
        // when some have actually been registered.
        $hasBrokenHtmlReplacements = isset(self::$domBrokenReplaceHelper['tmp'])
            && \count(self::$domBrokenReplaceHelper['tmp']) > 0;

        if ($hasBrokenHtmlReplacements) {
            $html = \str_ireplace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_ireplace($placeholderMap['tmp'], $placeholderMap['orig'], $html);
    }
457
458
    /**
459
     * @param string $html
460
     *
461
     * @return string
462
     */
463 195
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // Purpose: swap characters listed in $domReplaceHelper['orig'] for
        // placeholder tokens and, inside URLs, additionally swap the
        // characters listed in $domLinkReplaceHelper['orig'].
        // putReplacedBackToPreserveHtmlEntities() reverses the substitution.

        // URLs as found in the input, and the same URLs with placeholders.
        $linksOld = [];
        $linksNew = [];

        // Cheap pre-check before running the regular expression. It has to be
        // case-insensitive because the pattern below carries the "i" modifier:
        // with a case-sensitive check, URLs such as "HTTPS://..." were never
        // handed to the pattern at all.
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            \preg_match_all($regExUrl, $html, $urlMatches);

            if (!empty($urlMatches[1])) {
                $linksOld = $urlMatches[1];
                foreach ($linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }
            }
        }

        // No URL found: only the generic character replacements apply.
        if ($linksNew === []) {
            return \str_replace(self::$domReplaceHelper['orig'], self::$domReplaceHelper['tmp'], $html);
        }

        // The URL pairs go first: str_replace() works through the list in
        // order, so each URL is rewritten as a whole before the generic
        // replacements touch characters such as "&" inside it.
        $search = \array_merge($linksOld, self::$domReplaceHelper['orig']);
        $replace = \array_merge($linksNew, self::$domReplaceHelper['tmp']);

        return \str_replace($search, $replace, $html);
    }
498
}
499