Completed
Push — master ( 61605a...ee63fc )
by Lars
16:14 queued 14:45
created

AbstractDomParser   A

Complexity

Total Complexity 37

Size/Duplication

Total Lines 489
Duplicated Lines 9.82 %

Coupling/Cohesion

Components 2
Dependencies 0

Test Coverage

Coverage 80.17%

Importance

Changes 0
Metric Value
wmc 37
lcom 2
cbo 0
dl 48
loc 489
ccs 93
cts 116
cp 0.8017
rs 9.44
c 0
b 0
f 0

27 Methods

Rating   Name   Duplication   Size   Complexity  
A __call() 0 10 2
__callStatic() 0 1 ?
A __clone() 0 4 1
__get() 0 1 ?
__toString() 0 1 ?
A clear() 0 4 1
createDOMDocument() 0 1 ?
A decodeHtmlEntity() 0 35 5
find() 0 1 ?
findMulti() 0 1 ?
findMultiOrFalse() 0 1 ?
findOne() 0 1 ?
findOneOrFalse() 0 1 ?
A getDocument() 0 4 1
html() 0 1 ?
A innerHtml() 13 13 3
A innerXml() 13 13 3
loadHtml() 0 1 ?
loadHtmlFile() 0 1 ?
A save() 0 9 2
A set_callback() 0 4 1
A text() 0 4 1
A xml() 22 22 3
A getEncoding() 0 4 1
A putReplacedBackToPreserveHtmlEntities() 0 37 4
B replaceToPreserveHtmlEntities() 0 35 6
A html5FallbackForScriptTags() 0 21 3

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
 * Shared base for the DOM parsers of this package.
 *
 * Holds the wrapped \DOMDocument, the placeholder tables used to smuggle
 * characters / tags through libxml unharmed, and the serialisation helpers
 * (innerHtml(), innerXml(), xml(), text(), save()) that are common to all
 * concrete parsers.
 *
 * NOTE(review): several methods call $this->fixHtmlOutput(), which is not
 * declared in this class; it is presumably required by DomParserInterface or
 * provided by every concrete subclass - confirm.
 */
abstract class AbstractDomParser implements DomParserInterface
{
    /**
     * Tag name of the temporary wrapper element; "<name>" and "</name>" are
     * stripped again by putReplacedBackToPreserveHtmlEntities().
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Placeholder for broken html (not referenced inside this class;
     * presumably used by the concrete parsers).
     *
     * @var string
     */
    protected static $domHtmlBrokenHtmlHelper = '____simple_html_dom__voku__broken_html____';

    /**
     * Temporary tag name that putReplacedBackToPreserveHtmlEntities() turns
     * back into "script".
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_script____';

    /**
     * Optional 'tmp' / 'orig' replacement lists that are applied (when 'tmp' is
     * non-empty) by putReplacedBackToPreserveHtmlEntities().
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Characters that are swapped for placeholders inside http(s) links
     * ('orig' => 'tmp', position by position).
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Strings that are swapped for placeholders in the whole input
     * ('orig' => 'tmp', position by position).
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
        ],
    ];

    /**
     * Callback registered via set_callback().
     *
     * @var callable|null
     */
    protected static $callback;

    /**
     * Map of lower-cased alias name => real method name, consulted by __call().
     *
     * @var string[]
     */
    protected static $functionAliases = [];

    /**
     * @var \DOMDocument
     */
    protected $document;

    /**
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Dispatch calls to unknown methods through the alias table.
     *
     * The method name is lower-cased before the lookup, so aliases are
     * case-insensitive.
     *
     * NOTE(review): the lookup uses self::, i.e. the table declared in this
     * class; if subclasses re-declare $functionAliases and rely on this
     * method, static:: may be what is intended - confirm.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias with that name is registered
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        if (isset(self::$functionAliases[$name])) {
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }

    /**
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
    abstract public static function __callStatic($name, $arguments);

    /**
     * Give the copy its own DOMDocument instead of sharing the original one.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
    }

    /** @noinspection MagicMethodsValidityInspection */

    /**
     * @param string $name
     *
     * @return string|null
     */
    abstract public function __get($name);

    /**
     * @return string
     */
    abstract public function __toString();

    /**
     * does nothing (only for api-compatibility-reasons)
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }

    /**
     * Create DOMDocument from HTML.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return \DOMDocument
     */
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;

    /**
     * Decode html entities and url-encoding in the given content.
     *
     * If the optional class \voku\helper\UTF8 is available the work is
     * delegated to UTF8::rawurldecode(); otherwise the native
     * html_entity_decode() + rawurldecode() pair is used.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity true: (native fallback) keep decoding until the
     *                                         string no longer changes; false: decode once
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return \voku\helper\UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Native fallback: one pass, or repeated passes until a fixed point is reached.
        do {
            $contentBefore = $content;

            $content = \rawurldecode(
                \html_entity_decode(
                    $content,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($multiDecodeNewHtmlEntity && $contentBefore !== $content);

        return $content;
    }

    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string   $selector
     * @param int|null $idx
     */
    abstract public function find(string $selector, $idx = null);

    /**
     * Find nodes with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findMulti(string $selector);

    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findMultiOrFalse(string $selector);

    /**
     * Find one node with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findOne(string $selector);

    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findOneOrFalse(string $selector);

    /**
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }

    /**
     * Get dom node's outer html.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;

    /**
     * Get dom node's inner html.
     *
     * Serialises every child of the document element with saveHTML() and
     * passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput(
            $this->serializeDocumentElementChildren(false),
            $multiDecodeNewHtmlEntity
        );
    }

    /**
     * Get dom node's inner xml.
     *
     * Serialises every child of the document element with saveXML() and
     * passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput(
            $this->serializeDocumentElementChildren(true),
            $multiDecodeNewHtmlEntity
        );
    }

    /**
     * Load HTML from string.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return DomParserInterface
     */
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Load HTML from file.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException
     *
     * @return DomParserInterface
     */
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Save the html-dom as string.
     *
     * The inner html is always returned; it is additionally written to
     * $filepath (with an exclusive lock) when a non-empty path is given.
     * The result of the write is not checked.
     *
     * @param string $filepath
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $string = $this->innerHtml();

        if ($filepath !== '') {
            \file_put_contents($filepath, $string, \LOCK_EX);
        }

        return $string;
    }

    /**
     * Register the callback (stored in the static $callback property, via
     * late static binding).
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }

    /**
     * Get dom node's plain text.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput($this->document->textContent, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml                true: post-process via fixHtmlOutput();
     *                                       false: only decode entities and restore placeholders
     * @param bool $removeXmlHeader          strip "<?xml ... ?>" and leading whitespace
     * @param int  $options                  libxml options passed to DOMDocument::saveXML()
     *
     * @return string
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        // saveXML() returns false on failure; normalise to a string so the
        // string-typed calls below cannot raise a TypeError under strict_types.
        $xml = (string) $this->document->saveXML(null, $options);

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity)
        );
    }

    /**
     * Get the encoding to use.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites every "</" inside the content of a <script> element to "<\/".
     * Script elements without content are left untouched. $html is modified
     * in place and stays unchanged if the regular expression fails.
     *
     * @param string $html
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $htmlTmp = \preg_replace_callback(
            $regExSpecialScript,
            static function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // preg_replace_callback() yields null on error; keep the input in that case.
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }

    /**
     * Undo replaceToPreserveHtmlEntities(): turn all placeholders back into
     * the original characters, drop the temporary wrapper tags and restore
     * the temporary script tags.
     *
     * The combined replacement table is built once and cached for the
     * lifetime of the process.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            $tmp = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $orig = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            // str_ireplace() pairs the entries by position, so both lists
            // have to be extended in exactly the same order.
            $tmp['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $orig['html_wrapper__start'] = '';

            $tmp['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
            $orig['html_wrapper__end'] = '';

            $tmp['html_special_script__start'] = '<' . self::$domHtmlSpecialScriptHelper;
            $orig['html_special_script__start'] = '<script';

            $tmp['html_special_script__end'] = '</' . self::$domHtmlSpecialScriptHelper . '>';
            $orig['html_special_script__end'] = '</script>';

            $DOM_REPLACE__HELPER_CACHE = [
                'tmp'  => $tmp,
                'orig' => $orig,
            ];
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_ireplace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }

    /**
     * Replace characters that would not survive the DOM round trip with
     * placeholders.
     *
     * First the brackets inside every http(s) link are protected (whole link
     * => link with placeholders), afterwards the entries of $domReplaceHelper
     * are replaced everywhere in the input.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // Cheap pre-check before running the regex; case-insensitive so that
        // it agrees with the "/i" flag of the pattern below.
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';

            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $linksOld = $matches[1];
                $linksNew = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $linksOld
                );

                // The links have to come first: str_replace() works through
                // the pairs in order and the generic replacements would
                // otherwise alter the links before they can be matched.
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }

    /**
     * Concatenate the serialised children of the document element.
     *
     * @param bool $asXml true: serialise with saveXML(); false: with saveHTML()
     *
     * @return string empty string if the document has no document element
     */
    private function serializeDocumentElementChildren(bool $asXml): string
    {
        $markup = '';

        $root = $this->document->documentElement;
        if (!$root) {
            return $markup;
        }

        foreach ($root->childNodes as $node) {
            $markup .= $asXml
                ? $this->document->saveXML($node)
                : $this->document->saveHTML($node);
        }

        return $markup;
    }
}
496