Cleaner   A
last analyzed

Complexity

Total Complexity 22

Size/Duplication

Total Lines 118
Duplicated Lines 0 %

Test Coverage

Coverage 71.93%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 56
c 1
b 0
f 0
dl 0
loc 118
ccs 41
cts 57
cp 0.7193
rs 10
wmc 22

2 Methods

Rating   Name   Duplication   Size   Complexity  
A setUpRegexEncoding() 0 14 3
D clean() 0 90 19
1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Contracts\Dom\CleanerInterface;
8
use PHPHtmlParser\Exceptions\LogicalException;
9
use PHPHtmlParser\Options;
10
11
class Cleaner implements CleanerInterface
{
    /**
     * Cleans the html of any non-html information.
     *
     * When the cleanup option is disabled the input is returned untouched.
     * Otherwise the string is (in this order): gunzipped if it starts with the
     * gzip signature, converted to the working encoding, stripped of white
     * space between a closing quote and '>', has its line breaks replaced,
     * and has the doctype, comments, CDATA sections and (depending on the
     * options) script tags, style tags and smarty scripts removed.
     *
     * @param string  $str            the raw markup to clean
     * @param Options $options        parser options controlling which steps run
     * @param string  $defaultCharset encoding used when no encoding is enforced by the options
     *
     * @return string the cleaned markup
     *
     * @throws LogicalException when decoding, converting or any replacement step fails
     */
    public function clean(string $str, Options $options, string $defaultCharset): string
    {
        if (!$options->isCleanupInput()) {
            // skip entire cleanup step
            return $str;
        }

        // Check if the string is gzipped. The signature is compared byte-wise
        // against the first three bytes only, so binary input is never
        // interpreted as text and the rest of the document is not scanned.
        if (\strncmp($str, "\x1f\x8b\x08", 3) === 0) {
            $str = \gzdecode($str);
            if ($str === false) {
                throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');
            }
        }

        // we must handle character encoding before running any mb regex
        $str = $this->setUpRegexEncoding($str, $options, $defaultCharset);

        // remove white space before closing tags
        $str = $this->replaceOrFail("'\s+>", "'>", $str, 'clean single quotes');
        $str = $this->replaceOrFail('"\s+>', '">', $str, 'clean double quotes');

        // Clean out the \n\r. str_replace always returns a string for a string
        // subject, so no failure check is needed here.
        $replace = $options->isPreserveLineBreaks() ? '&#10;' : ' ';
        $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);

        // strip the doctype
        $str = $this->replaceOrFail('<!doctype(.*?)>', '', $str, 'strip the doctype');

        // strip out comments
        $str = $this->replaceOrFail('<!--(.*?)-->', '', $str, 'strip comments');

        // strip out cdata
        $str = $this->replaceOrFail('<!\[CDATA\[(.*?)\]\]>', '', $str, 'strip out cdata');

        // strip out <script> tags: first the ones carrying attributes, then the bare ones
        if ($options->isRemoveScripts()) {
            $str = $this->replaceOrFail('<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>', '', $str, 'remove scripts 1');
            $str = $this->replaceOrFail('<\s*script\s*>(.*?)<\s*/\s*script\s*>', '', $str, 'remove scripts 2');
        }

        // strip out <style> tags: first the ones carrying attributes, then the bare ones
        if ($options->isRemoveStyles()) {
            $str = $this->replaceOrFail('<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>', '', $str, 'strip out style tags 1');
            $str = $this->replaceOrFail('<\s*style\s*>(.*?)<\s*/\s*style\s*>', '', $str, 'strip out style tags 2');
        }

        // strip smarty scripts
        if ($options->isRemoveSmartyScripts()) {
            $str = $this->replaceOrFail('(\{\w)(.*?)(\})', '', $str, 'remove smarty scripts');
        }

        return $str;
    }

    /**
     * Runs a case-insensitive multibyte regex replacement and fails loudly on error.
     *
     * mb_eregi_replace can return false on error and null when the subject is
     * not valid for the current encoding; both are turned into an exception
     * so a non-string never leaks into the following steps.
     *
     * @param string $pattern     the mb regex pattern
     * @param string $replacement the replacement text
     * @param string $str         the subject string
     * @param string $action      short description of the step, used in the error message
     *
     * @throws LogicalException when the replacement does not produce a string
     */
    private function replaceOrFail(string $pattern, string $replacement, string $str, string $action): string
    {
        $result = \mb_eregi_replace($pattern, $replacement, $str);
        if (!\is_string($result)) {
            throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to ' . $action . '.');
        }

        return $result;
    }

    /**
     * Sets up the mb_regex_encoding and converts the text to that encoding.
     *
     * The encoding enforced through the options wins; the default charset is
     * only used when no encoding is enforced.
     *
     * @throws LogicalException when the regex encoding cannot be set or the conversion fails
     */
    private function setUpRegexEncoding(string $str, Options $options, string $defaultCharset): string
    {
        // they may want to enforce a given encoding
        $encoding = $options->getEnforceEncoding() ?? $defaultCharset;

        if (!\mb_regex_encoding($encoding)) {
            throw new LogicalException('Character encoding was not able to be changed to ' . $encoding . '.');
        }

        // Guard the conversion result so a failure surfaces as the documented
        // exception instead of a return-type error.
        $converted = \mb_convert_encoding($str, $encoding);
        if (!\is_string($converted)) {
            throw new LogicalException('mb_convert_encoding was not able to convert the string to ' . $encoding . '.');
        }

        return $converted;
    }
}
131