Completed
Push — master ( fefe4d...d45b97 )
by Lars
01:59
created

AbstractDomParser   A

Complexity

Total Complexity 37

Size/Duplication

Total Lines 470
Duplicated Lines 25.53 %

Coupling/Cohesion

Components 2
Dependencies 0

Test Coverage

Coverage 79.31%

Importance

Changes 0
Metric Value
wmc 37
lcom 2
cbo 0
dl 120
loc 470
ccs 92
cts 116
cp 0.7931
rs 9.44
c 0
b 0
f 0

25 Methods

Rating   Name   Duplication   Size   Complexity  
A __call() 0 10 2
__callStatic() 0 1 ?
A __clone() 0 4 1
__get() 0 1 ?
__toString() 0 1 ?
A clear() 0 4 1
createDOMDocument() 0 1 ?
A decodeHtmlEntity() 0 35 5
find() 0 1 ?
findMulti() 0 1 ?
findOne() 0 1 ?
A getDocument() 0 4 1
html() 0 1 ?
A innerHtml() 13 13 3
A innerXml() 13 13 3
loadHtml() 0 1 ?
loadHtmlFile() 0 1 ?
A save() 0 9 2
A set_callback() 0 4 1
A text() 0 4 1
A xml() 22 22 3
A getEncoding() 0 4 1
A html5FallbackForScriptTags() 0 21 3
A putReplacedBackToPreserveHtmlEntities() 37 37 4
B replaceToPreserveHtmlEntities() 35 35 6

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
10
     * @var string
11
     */
12
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';
13
14
    /**
15
     * @var string
16
     */
17
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____';
18
19
    /**
20
     * @var array
21
     */
22
    protected static $domBrokenReplaceHelper = [];
23
24
    /**
25
     * @var string[][]
26
     */
27
    protected static $domLinkReplaceHelper = [
28
        'orig' => ['[', ']', '{', '}'],
29
        'tmp'  => [
30
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
31
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
32
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
33
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
34
        ],
35
    ];
36
37
    /**
38
     * @var string[][]
39
     */
40
    protected static $domReplaceHelper = [
41
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
42
        'tmp'  => [
43
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
44
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
45
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
46
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
47
            '____SIMPLE_HTML_DOM__VOKU__AT____',
48
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
49
        ],
50
    ];
51
52
    /**
53
     * @var callable|null
54
     */
55
    protected static $callback;
56
57
    /**
58
     * @var string[]
59
     */
60
    protected static $functionAliases = [];
61
62
    /**
63
     * @var \DOMDocument
64
     */
65
    protected $document;
66
67
    /**
68
     * @var string
69
     */
70
    protected $encoding = 'UTF-8';
71
72
    /**
     * Forward a call to an unknown instance method to its registered alias.
     *
     * The requested name is lower-cased and looked up in the static alias
     * map; on a hit, the aliased method is invoked on this object with the
     * original arguments. Because of the lower-casing, only lower-case keys
     * in the map can ever match.
     *
     * @param string $name      name of the method that was called
     * @param array  $arguments arguments of the call
     *
     * @throws \BadMethodCallException if no alias is registered for $name
     *
     * @return mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        // Late static binding: when a subclass redeclares $functionAliases,
        // its own map must be consulted rather than the empty one declared
        // on this abstract class.
        if (isset(static::$functionAliases[$name])) {
            return \call_user_func_array([$this, static::$functionAliases[$name]], $arguments);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
88
89
    /**
90
     * @param string $name
91
     * @param array  $arguments
92
     *
93
     * @throws \BadMethodCallException
94
     * @throws \RuntimeException
95
     *
96
     * @return static
97
     */
98
    abstract public static function __callStatic($name, $arguments);
99
100
    /**
     * Give the cloned parser its own copy of the wrapped document, so the
     * clone and the original no longer share one DOMDocument object.
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
104
105
    /** @noinspection MagicMethodsValidityInspection */
106
107
    /**
108
     * @param string $name
109
     *
110
     * @return string|null
111
     */
112
    abstract public function __get($name);
113
114
    /**
115
     * @return string
116
     */
117
    abstract public function __toString();
118
119
    /**
     * No-op that exists only for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // there is nothing to release here; success is reported unconditionally
        return true;
    }
130
131
    /**
132
     * Create DOMDocument from HTML.
133
     *
134
     * @param string   $html
135
     * @param int|null $libXMLExtraOptions
136
     *
137
     * @return \DOMDocument
138
     */
139
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
140
141
    /**
     * Decode HTML entities and percent-encoded sequences in a string.
     *
     * When the optional \voku\helper\UTF8 class is available, the work is
     * delegated to UTF8::rawurldecode(). Otherwise a fallback built on
     * html_entity_decode() followed by rawurldecode() is used.
     *
     * @param string $content                  text to decode
     * @param bool   $multiDecodeNewHtmlEntity passed through to UTF8::rawurldecode(); in the
     *                                         fallback, true repeats the decoding until the
     *                                         text stops changing, false decodes exactly once
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Fallback: always decode once; keep going only in multi-decode mode
        // and only while a pass still changes the text.
        do {
            $beforePass = $content;

            $content = \rawurldecode(
                \html_entity_decode(
                    $content,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($multiDecodeNewHtmlEntity && $beforePass !== $content);

        return $content;
    }
182
183
    /**
184
     * Find list of nodes with a CSS selector.
185
     *
186
     * @param string   $selector
187
     * @param int|null $idx
188
     */
189
    abstract public function find(string $selector, $idx = null);
190
191
    /**
192
     * Find nodes with a CSS selector.
193
     *
194
     * @param string $selector
195
     */
196
    abstract public function findMulti(string $selector);
197
198
    /**
199
     * Find one node with a CSS selector.
200
     *
201
     * @param string $selector
202
     */
203
    abstract public function findOne(string $selector);
204
205
    /**
     * Expose the DOMDocument this parser wraps.
     *
     * @return \DOMDocument the instance held by the parser (not a copy)
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
212
213
    /**
214
     * Get dom node's outer html.
215
     *
216
     * @param bool $multiDecodeNewHtmlEntity
217
     *
218
     * @return string
219
     */
220
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;
221
222
    /**
     * Get the inner HTML of the document.
     *
     * Every child node of the document element is serialized with saveHTML()
     * and the pieces are joined; the result is then passed through
     * fixHtmlOutput() (defined outside this class body).
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $root = $this->document->documentElement;
        $fragments = [];

        // a document without a root element simply yields no fragments
        if ($root) {
            foreach ($root->childNodes as $child) {
                $fragments[] = $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput(\implode('', $fragments), $multiDecodeNewHtmlEntity);
    }
242
243
    /**
     * Get the inner XML of the document.
     *
     * Every child node of the document element is serialized with saveXML()
     * and the pieces are joined; the result is then passed through
     * fixHtmlOutput() (defined outside this class body).
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $root = $this->document->documentElement;
        $fragments = [];

        // a document without a root element simply yields no fragments
        if ($root) {
            foreach ($root->childNodes as $child) {
                $fragments[] = $this->document->saveXML($child);
            }
        }

        return $this->fixHtmlOutput(\implode('', $fragments), $multiDecodeNewHtmlEntity);
    }
263
264
    /**
265
     * Load HTML from string.
266
     *
267
     * @param string   $html
268
     * @param int|null $libXMLExtraOptions
269
     *
270
     * @return DomParserInterface
271
     */
272
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
273
274
    /**
275
     * Load HTML from file.
276
     *
277
     * @param string   $filePath
278
     * @param int|null $libXMLExtraOptions
279
     *
280
     * @throws \RuntimeException
281
     *
282
     * @return DomParserInterface
283
     */
284
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
285
286
    /**
     * Return the document's inner HTML and optionally write it to a file.
     *
     * @param string $filepath target file; an empty string (the default) skips writing
     *
     * @return string the inner HTML, whether or not a file was written
     */
    public function save(string $filepath = ''): string
    {
        $html = $this->innerHtml();

        if ($filepath === '') {
            return $html;
        }

        // written under an exclusive lock; the result of the write is not checked
        \file_put_contents($filepath, $html, \LOCK_EX);

        return $html;
    }
302
303
    /**
     * Store a callback in the static $callback property.
     *
     * The property is static, so the value is not tied to this one instance.
     *
     * @param callable $functionName the callback to remember
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        static::$callback = $callback;
    }
310
311
    /**
     * Get the plain text of the document.
     *
     * Reads the document's textContent and passes it through fixHtmlOutput()
     * (defined outside this class body).
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded unchanged to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
322
323
    /**
     * Serialize the whole document as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: post-process the XML with fixHtmlOutput();
     *                                       false: only decode entities and swap the
     *                                       placeholder tokens back
     * @param bool $removeXmlHeader          true: strip the leading "<?xml ... ?>" declaration
     * @param int  $options                  libxml options handed to DOMDocument::saveXML()
     *
     * @return string
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        // saveXML() returns false on failure. Normalize to a string so the
        // string-typed calls below (e.g. decodeHtmlEntity()) do not raise a
        // TypeError under strict_types.
        $xml = (string) $this->document->saveXML(null, $options);

        if ($removeXmlHeader) {
            // Remove only the XML declaration at the very start of the output.
            // An unanchored, unlimited pattern would also delete other
            // processing instructions such as "<?xml-stylesheet ... ?>".
            $xml = \ltrim((string) \preg_replace('/^\s*<\?xml\s.*?\?>/s', '', $xml, 1));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($xml);
    }
355
356
    /**
     * Report the character encoding configured on this parser.
     *
     * @return string value of the $encoding property
     */
    protected function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
365
366
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every non-empty <script> element so that each "</" inside its
     * body becomes "<\/". The markup is modified in place through the
     * reference parameter; nothing is returned.
     *
     * @param string $html markup to rewrite (by reference)
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $scriptPattern = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $escapeClosingTags = static function ($match) {
            // scripts with an empty body are handed back exactly as matched
            return empty($match['content'])
                ? $match[0]
                : '<script' . $match['attr'] . '>' . str_replace('</', '<\/', $match['content']) . '</script>';
        };

        // preg_replace_callback() yields null on error; in that case the
        // markup is left as it was
        $html = \preg_replace_callback($scriptPattern, $escapeClosingTags, $html) ?? $html;
    }
392
393
    /**
     * Swap the temporary placeholder tokens in $html back to their originals.
     *
     * Entries from $domBrokenReplaceHelper (if any) are restored first. Then
     * the link and entity placeholders are restored, the wrapper helper tags
     * are removed and the special-script helper tags are turned back into
     * script tags.
     *
     * @param string $html markup containing placeholder tokens
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // search/replace tables are assembled on the first call and then reused
        static $placeholderMap = null;

        if ($placeholderMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            // str_replace() pairs entries by position, so both tables must
            // list their entries in the same order
            $placeholderMap = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'        => '<' . $wrapperTag . '>',
                        'html_wrapper__end'          => '</' . $wrapperTag . '>',
                        'html_special_script__start' => '<' . $scriptTag,
                        'html_special_script__end'   => '</' . $scriptTag . '>',
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'        => '',
                        'html_wrapper__end'          => '',
                        'html_special_script__start' => '<script',
                        'html_special_script__end'   => '</script>',
                    ]
                ),
            ];
        }

        $broken = self::$domBrokenReplaceHelper;
        if (isset($broken['tmp']) && \count($broken['tmp']) > 0) {
            $html = \str_replace($broken['tmp'], $broken['orig'], $html);
        }

        return \str_replace($placeholderMap['tmp'], $placeholderMap['orig'], $html);
    }
435
436
    /**
     * Replace selected characters in $html with temporary placeholder tokens.
     *
     * The characters listed in $domReplaceHelper are always replaced. If the
     * markup contains URLs, the bracket characters listed in
     * $domLinkReplaceHelper are additionally replaced inside each URL found.
     *
     * @param string $html markup to protect
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // baseline: only the general character placeholders
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $urls = $matches[1];
                $maskedUrls = [];

                foreach ($urls as $urlKey => $url) {
                    $maskedUrls[$urlKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $url
                    );
                }

                // the URL pairs go first, so they are substituted before the
                // general character placeholders are applied
                $search = \array_merge($urls, $search);
                $replace = \array_merge($maskedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
476
}
477