Completed
Pull Request — master (#55)
by Volodymyr
01:53
created

AbstractDomParser::getEncoding()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
     * Tag name of a temporary wrapper element. putReplacedBackToPreserveHtmlEntities()
     * removes both "<{name}>" and "</{name}>" from the output again.
     *
     * @var string
     */
12
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
13
14
    /**
     * NOTE(review): not referenced anywhere in this class; presumably a placeholder
     * that concrete parsers use while handling broken HTML — confirm against subclasses.
     *
     * @var string
     */
17
    protected static $domHtmlBrokenHtmlHelper = '____simple_html_dom__voku__broken_html____';
18
19
    /**
     * Temporary tag name standing in for "script". putReplacedBackToPreserveHtmlEntities()
     * turns "<{name}" back into "<script" and "</{name}>" back into "</script>".
     *
     * @var string
     */
22
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_script____';
23
24
    /**
     * Additional placeholder pairs. When a non-empty 'tmp' list is present,
     * putReplacedBackToPreserveHtmlEntities() replaces its entries with the matching
     * 'orig' entries. Nothing in this class fills the array.
     *
     * @var array
     */
27
    protected static $domBrokenReplaceHelper = [];
28
29
    /**
     * Characters ('orig') and their placeholders ('tmp') that
     * replaceToPreserveHtmlEntities() swaps inside matched http(s) URLs only;
     * putReplacedBackToPreserveHtmlEntities() reverses the swap.
     *
     * @var string[][]
     */
32
    protected static $domLinkReplaceHelper = [
33
        'orig' => ['[', ']', '{', '}'],
34
        'tmp'  => [
35
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
36
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
37
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
38
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
39
        ],
40
    ];
41
42
    /**
     * Strings ('orig') and their placeholders ('tmp') that
     * replaceToPreserveHtmlEntities() swaps across the whole HTML string;
     * putReplacedBackToPreserveHtmlEntities() reverses the swap.
     * Both lists are paired by position, so they must stay the same length and order.
     *
     * @var string[][]
     */
45
    protected static $domReplaceHelper = [
46
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
47
        'tmp'  => [
48
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
49
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
50
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
51
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
52
            '____SIMPLE_HTML_DOM__VOKU__AT____',
53
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
54
        ],
55
    ];
56
57
    /**
     * Callback stored by set_callback(). It is not invoked anywhere in this class.
     *
     * @var callable|null
     */
60
    protected static $callback;
61
62
    /**
     * Alias map consulted by __call(): lower-cased alias name => real method name.
     *
     * @var string[]
     */
65
    protected static $functionAliases = [];
66
67
    /**
     * The wrapped document: returned by getDocument() and cloned in __clone().
     *
     * @var \DOMDocument
     */
70
    protected $document;
71
72
    /**
     * Encoding name returned by getEncoding().
     *
     * @var string
     */
75
    protected $encoding = 'UTF-8';
76
77
    /**
     * Forward a call to an unknown instance method to its registered alias.
     *
     * The requested name is lower-cased and looked up in the alias map. On a hit,
     * the aliased method is invoked on this instance with the original arguments.
     *
     * @param string $name      method name as written by the caller (matched case-insensitively)
     * @param array  $arguments arguments of the original call
     *
     * @throws \BadMethodCallException if no alias is registered for $name
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        // Late static binding: a concrete parser that redeclares $functionAliases
        // must have its own map consulted, not the empty one declared on this class.
        if (isset(static::$functionAliases[$name])) {
            return \call_user_func_array([$this, static::$functionAliases[$name]], $arguments);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
93
94
    /**
     * Handle calls to unknown static methods; the behaviour is defined by the concrete parser.
     *
     * @param string $name      name of the static method that was called
     * @param array  $arguments arguments of the original call
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
103
    abstract public static function __callStatic($name, $arguments);
104
105
    /**
     * After this parser has been cloned, swap the shared DOMDocument reference
     * for a clone, so the copy no longer points at the original's document object.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
109
110
    /** @noinspection MagicMethodsValidityInspection */
111
112
    /**
     * Magic property access; which names are supported is defined by the concrete parser.
     *
     * @param string $name name of the property being read
     *
     * @return string|null
     */
117
    abstract public function __get($name);
118
119
    /**
     * String representation of the parser; the exact output is defined by the concrete parser.
     *
     * @return string
     */
122
    abstract public function __toString();
123
124
    /**
     * No-op that exists only so code written against the older API keeps working.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // Nothing to release here; the method exists for API compatibility only.
        return true;
    }
135
136
    /**
     * Create DOMDocument from HTML.
     *
     * @param string   $html               markup to build the document from
     * @param int|null $libXMLExtraOptions presumably extra LIBXML_* flags for the
     *                                     underlying loader; confirm in the implementation
     *
     * @return \DOMDocument
     */
144
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
145
146
    /**
     * Decode HTML entities and raw-url-encoded sequences in a string.
     *
     * If the class \voku\helper\UTF8 is available, the work is delegated to
     * UTF8::rawurldecode(). Otherwise html_entity_decode() followed by
     * rawurldecode() is applied: once, or repeatedly until the string stops
     * changing when $multiDecodeNewHtmlEntity is true.
     *
     * @param string $content                  text to decode
     * @param bool   $multiDecodeNewHtmlEntity true to keep decoding until the result is stable
     *
     * @return string the decoded text
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Always one pass; further passes only in multi-decode mode and only
        // while the previous pass still changed something.
        do {
            $before = $content;

            $decodedEntities = \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5);
            $content = \rawurldecode($decodedEntities);
        } while ($multiDecodeNewHtmlEntity && $before !== $content);

        return $content;
    }
187
188
    /**
     * Find list of nodes with a CSS selector.
     *
     * No return type is declared here; the result type is defined by the implementation.
     *
     * @param string   $selector CSS selector to match
     * @param int|null $idx      presumably the position of a single match to return,
     *                           null for all matches; confirm in the implementation
     */
194
    abstract public function find(string $selector, $idx = null);
195
196
    /**
     * Find nodes with a CSS selector.
     *
     * No return type is declared here; the result type is defined by the implementation.
     *
     * @param string $selector CSS selector to match
     */
201
    abstract public function findMulti(string $selector);
202
203
    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * No return type is declared here; the result type is defined by the implementation.
     *
     * @param string $selector CSS selector to match
     */
208
    abstract public function findMultiOrFalse(string $selector);
209
210
    /**
     * Find one node with a CSS selector.
     *
     * No return type is declared here; the result type is defined by the implementation.
     *
     * @param string $selector CSS selector to match
     */
215
    abstract public function findOne(string $selector);
216
217
    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * No return type is declared here; the result type is defined by the implementation.
     *
     * @param string $selector CSS selector to match
     */
222
    abstract public function findOneOrFalse(string $selector);
223
224
    /**
     * Expose the DOMDocument wrapped by this parser.
     *
     * The object itself is returned, not a copy.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
231
232
    /**
     * Get dom node's outer html.
     *
     * @param bool $multiDecodeNewHtmlEntity presumably forwarded to the entity decoding step
     *                                       (decode repeatedly instead of once); confirm in
     *                                       the implementation
     *
     * @return string
     */
239
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;
240
241
    /**
     * Get the inner HTML of the document element.
     *
     * Each child node of the document element is serialised with
     * DOMDocument::saveHTML(). The concatenated result is then passed through
     * fixHtmlOutput(), which is not defined in this class.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // A document without a root element contributes no markup at all.
        if ($root !== null) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
261
262
    /**
     * Get the inner XML of the document element.
     *
     * Each child node of the document element is serialised with
     * DOMDocument::saveXML(). The concatenated result is then passed through
     * fixHtmlOutput(), which is not defined in this class.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // A document without a root element contributes no markup at all.
        if ($root !== null) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveXML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
282
283
    /**
     * Load HTML from string.
     *
     * @param string   $html               markup to load
     * @param int|null $libXMLExtraOptions presumably extra LIBXML_* flags for the
     *                                     underlying loader; confirm in the implementation
     *
     * @return DomParserInterface
     */
291
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
292
293
    /**
     * Load HTML from file.
     *
     * @param string   $filePath           path of the file to read
     * @param int|null $libXMLExtraOptions presumably extra LIBXML_* flags for the
     *                                     underlying loader; confirm in the implementation
     *
     * @throws \RuntimeException
     *
     * @return DomParserInterface
     */
303
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
304
305
    /**
     * Render the document via html() and optionally write it to a file.
     *
     * When $filepath is not the empty string, the markup is written there with
     * an exclusive lock (LOCK_EX). The result of that write is not checked, so
     * a failed write does not change the return value.
     *
     * @param string $filepath target file, or '' to skip writing
     *
     * @return string the rendered markup
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->html();

        if ($filepath === '') {
            return $markup;
        }

        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
321
322
    /**
     * Store a callback in the static $callback property.
     *
     * The assignment goes through static::, so it targets the property of the
     * class the method is called on. Nothing in this class invokes the callback.
     *
     * @param callable $functionName the callback to store
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        static::$callback = $callback;
    }
329
330
    /**
     * Get the plain text of the document.
     *
     * Reads DOMDocument::$textContent and passes it through fixHtmlOutput(),
     * which is not defined in this class.
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
341
342
    /**
     * Serialise the document with DOMDocument::saveXML() and post-process the result.
     *
     * Post-processing:
     * - with $removeXmlHeader, the leading "<?xml ...?>" declaration is stripped
     *   (only that declaration; other processing instructions are kept);
     * - with $htmlToXml, the string goes through fixHtmlOutput(), which is not
     *   defined in this class;
     * - otherwise entities are decoded via decodeHtmlEntity() and the placeholders
     *   are restored via putReplacedBackToPreserveHtmlEntities().
     *
     * @param bool $multiDecodeNewHtmlEntity passed through to the decoding step
     * @param bool $htmlToXml                true to post-process with fixHtmlOutput()
     * @param bool $removeXmlHeader          true to drop the XML declaration
     * @param int  $options                  libxml options handed to saveXML()
     *
     * @return string the XML string, or '' when saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // Anchored, lazy and limited to one replacement, so that only the
            // declaration at the very start is removed. An unanchored greedy
            // pattern would also delete "<?xml-stylesheet ...?>" and similar
            // processing instructions, plus any markup sharing a line with them.
            $xml = \ltrim((string) \preg_replace('/^\s*<\?xml\s.*?\?>/s', '', $xml, 1));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($xml);
    }
377
378
    /**
     * Return the encoding name stored in the $encoding property.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
387
388
    /**
     * Workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Inside every non-empty <script>...</script> element, each "</" in the
     * script body is rewritten to "<\/". The string is modified in place; if
     * the regular expression fails, it is left untouched.
     *
     * @param string $html markup to patch (by reference)
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // Matches e.g.: [<script id="elements-image-2">...</script>]
        /** @noinspection HtmlDeprecatedTag */
        $scriptPattern = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escapeClosingTags = static function ($match) {
            // Nothing to escape in an empty script element: keep it verbatim.
            if (empty($match['content'])) {
                return $match[0];
            }

            $body = \str_replace('</', '<\/', $match['content']);

            return '<script' . $match['attr'] . '>' . $body . '</script>';
        };

        $patched = \preg_replace_callback($scriptPattern, $escapeClosingTags, $html);

        // null signals a PCRE error; keep the caller's string in that case.
        if ($patched === null) {
            return;
        }

        $html = $patched;
    }
414
415
    /**
     * Turn the internal placeholders in a string back into their original text.
     *
     * Two case-insensitive passes are made:
     * 1. the pairs from $domBrokenReplaceHelper, when its 'tmp' list is non-empty;
     * 2. the pairs from $domLinkReplaceHelper and $domReplaceHelper, plus the
     *    wrapper element (removed) and the special script tag (restored to
     *    "<script" / "</script>").
     *
     * The search/replace lists for the second pass are built once and cached in
     * a static local variable.
     *
     * @param string $html string containing placeholders
     *
     * @return string string with the placeholders replaced
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $replacementCache = null;

        if ($replacementCache === null) {
            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            // Both lists are paired by position, so entries are appended in the
            // same order on each side.
            $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
            $originals['html_wrapper__start'] = '';
            $originals['html_wrapper__end'] = '';

            $placeholders['html_special_script__start'] = '<' . self::$domHtmlSpecialScriptHelper;
            $placeholders['html_special_script__end'] = '</' . self::$domHtmlSpecialScriptHelper . '>';
            $originals['html_special_script__start'] = '<script';
            $originals['html_special_script__end'] = '</script>';

            $replacementCache = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        $brokenPlaceholders = self::$domBrokenReplaceHelper['tmp'] ?? null;
        if ($brokenPlaceholders !== null && \count($brokenPlaceholders) > 0) {
            $html = \str_ireplace($brokenPlaceholders, self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($replacementCache['tmp'], $replacementCache['orig'], $html);
    }
457
458
    /**
     * Replace special characters with placeholders.
     *
     * The strings listed in $domReplaceHelper are replaced across the whole
     * input. In addition, when the input contains "http", every matched http(s)
     * URL first has its "[", "]", "{" and "}" characters swapped for the
     * placeholders from $domLinkReplaceHelper.
     *
     * @param string $html markup to prepare
     *
     * @return string markup with placeholders in place of the special characters
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // Cheap pre-check: the URL regex only runs when a link can be present.
        if (\strpos($html, 'http') !== false) {
            // Matches e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $urlPattern = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            \preg_match_all($urlPattern, $html, $matches);

            $links = $matches[1] ?? [];
            if (!empty($links)) {
                // One call over the array subject masks the brackets of every link.
                $maskedLinks = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $links
                );

                // Links go first, so they are swapped before the general pairs apply.
                $search = \array_merge($links, $search);
                $replace = \array_merge($maskedLinks, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
498
}
499