HtmlParser   B
last analyzed

Complexity

Total Complexity 48

Size/Duplication

Total Lines 266
Duplicated Lines 0 %

Test Coverage

Coverage 82.91%

Importance

Changes 3
Bugs 0 Features 0
Metric Value
eloc 122
c 3
b 0
f 0
dl 0
loc 266
ccs 97
cts 117
cp 0.8291
rs 8.5599
wmc 48

4 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 2 1
A registerCallbacksHandler() 0 6 2
F transform() 0 209 42
A __call() 0 13 3

How to fix   Complexity   

Complex Class

Complex classes like HtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HtmlParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters\Html;
11
12
use Matecat\SubFiltering\Commons\Pipeline;
13
use ReflectionException;
14
use ReflectionMethod;
15
use RuntimeException;
16
17
/**
18
 * Class HtmlParser
19
 *
20
 * Based on the code https://github.com/ericnorris/striptags
21
 * Rewritten/Improved and Changed for PHP
22
 *
23
 * @author  domenico [email protected] / [email protected]
24
 * @package SubFiltering
25
 *
26
 * @method _isTagValid( string $buffer )
27
 * @method _finalizeHTMLTag( string $buffer )
28
 * @method _fixWrongBuffer( string $buffer )
29
 * @method _finalizeScriptTag( string $buffer )
30
 * @method _finalizePlainText( string $plain_text_buffer )
31
 * @method _setSegmentContainsHtml()
32
 */
33
class HtmlParser {

    const STATE_PLAINTEXT = 0;
    const STATE_HTML      = 1;
    const STATE_COMMENT   = 2;
    const STATE_JS_CSS    = 3;

    /**
     * Optional pipeline. Inside this class it is only tested against null to
     * decide whether the _setSegmentContainsHtml() callback is invoked.
     *
     * @var Pipeline|null
     */
    private $pipeline;

    /**
     * Object that provides the `_*` callbacks reached through __call().
     *
     * @var CallbacksHandler|null
     */
    protected $callbacksHandler;

    /**
     * HtmlParser constructor.
     *
     * @param Pipeline|null $pipeline
     */
    public function __construct( Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }

    /**
     * Registers the object whose methods implement the parser callbacks.
     *
     * @param object|string $class must use the CallbacksHandler trait directly
     *
     * @throws RuntimeException when $class does not use the CallbacksHandler trait
     *                          (including when its traits cannot be determined)
     */
    public function registerCallbacksHandler( $class ) {
        // class_uses() returns false when the class cannot be resolved; handing
        // that to in_array() is a TypeError on PHP 8, so reject it explicitly.
        $traits = class_uses( $class );
        if ( $traits === false || !in_array( CallbacksHandler::class, $traits, true ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }
        $this->callbacksHandler = $class;
    }

    /**
     * Forwards calls to undefined methods (the `_*` callbacks) to the registered
     * callbacks handler, whatever their visibility.
     *
     * @param string $name      callback method name
     * @param array  $arguments arguments given at the call site
     *
     * @return mixed the callback result, or null when no handler is registered
     * @throws ReflectionException when the handler has no method called $name
     */
    public function __call( $name, $arguments ) {

        if ( $this->callbacksHandler === null ) {
            return null;
        }

        // Reflection allows protected/private handler methods to be used as callbacks.
        $reflector = new ReflectionMethod( $this->callbacksHandler, $name );

        // Since PHP 8.1 every method is invokable through reflection and
        // setAccessible() is a no-op, so it is only needed on older runtimes.
        if ( PHP_VERSION_ID < 80100 && !$reflector->isPublic() ) {
            $reflector->setAccessible( true );
        }

        // Forward the argument list untouched: zero-argument callbacks such as
        // _setSegmentContainsHtml() must not read a missing $arguments[0].
        return $reflector->invokeArgs( $this->callbacksHandler, $arguments );
    }

    /**
     * Scans $segment one character at a time with a four-state machine
     * (plain text, HTML tag, comment, script/style) and builds the result by
     * concatenating what the registered callbacks return for each chunk.
     *
     * @param string $segment
     *
     * @return string concatenation of the callback results; empty when no
     *                callbacks handler is registered, since __call() yields null
     */
    public function transform( $segment ) {

        $chars = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( $chars === false ) {
            // preg_split() fails on malformed UTF-8. Every delimiter handled
            // below is ASCII, so a byte-wise split yields the same chunks and
            // avoids silently dropping the whole segment.
            $bytes = (string)$segment;
            $chars = ( '' === $bytes ) ? [] : str_split( $bytes );
        }
        $lastIdx = count( $chars ) - 1;

        $state             = static::STATE_PLAINTEXT;
        $html_buffer       = '';
        $plain_text_buffer = '';
        $output            = '';

        foreach ( $chars as $idx => $char ) {

            if ( $state === static::STATE_PLAINTEXT ) {

                switch ( $char ) {
                    case '<':
                        // Possible tag start: flush the pending plain text first.
                        $state             = static::STATE_HTML;
                        $html_buffer       .= $char;
                        $output            .= $this->_finalizePlainText( $plain_text_buffer );
                        $plain_text_buffer = '';
                        break;

                    case '>':
                        // NOTE 2021-06-15: a lone greater-than sign outside any
                        // tag is routed through _fixWrongBuffer().
                        $plain_text_buffer .= $this->_fixWrongBuffer( $char );
                        break;

                    default:
                        $plain_text_buffer .= $char;
                        break;
                }

            } elseif ( $state === static::STATE_HTML ) {

                switch ( $char ) {
                    case '<':
                        // A second '<' means the first one did not open a tag:
                        // hand the buffered text to _fixWrongBuffer() and
                        // restart the tag buffer from this character.
                        $output      .= $this->_fixWrongBuffer( $html_buffer );
                        $html_buffer = $char;
                        break;

                    case '>':
                        if ( $this->startsRawTextElement( $html_buffer ) ) {
                            // <script>/<style>: keep buffering until the closing tag.
                            $html_buffer .= $char;
                            $state       = static::STATE_JS_CSS;
                            break;
                        }

                        // This closes the tag held in the buffer.
                        $state       = static::STATE_PLAINTEXT;
                        $html_buffer .= $char;

                        if ( $this->_isTagValid( $html_buffer ) ) {
                            $output .= $this->_finalizeHTMLTag( $html_buffer );
                        } else {
                            $output .= $this->_fixWrongBuffer( $html_buffer );
                        }

                        $this->flagSegmentIfTagValid( $html_buffer );

                        $html_buffer = '';
                        break;

                    case '"':
                    case '\'':
                        // Quotes get no special treatment; unlike the default
                        // branch they skip the end-of-input validity check.
                        $html_buffer .= $char;
                        break;

                    case '-':
                        if ( $html_buffer === '<!-' ) {
                            // "<!--" completed: the rest is a comment.
                            $state = static::STATE_COMMENT;
                        }

                        $html_buffer .= $char;
                        break;

                    case ' ': // 0x20, is a space
                    case "\n":
                        if ( $html_buffer === '<' ) {
                            // '<' followed by whitespace cannot open a tag.
                            $state       = static::STATE_PLAINTEXT; // but we work in XML text, so encode it
                            $output      .= $this->_fixWrongBuffer( '<' . $char );
                            $html_buffer = '';

                            // NOTE(review): runs on the already emptied buffer,
                            // exactly as the original code did.
                            $this->flagSegmentIfTagValid( $html_buffer );
                            break;
                        }

                        $html_buffer .= $char;
                        break;

                    default:
                        $html_buffer .= $char;

                        // NOTE 2021-06-16: when the input ends inside an
                        // unterminated buffer that is not a valid tag
                        // (e.g. '<3 %}'), turn the buffer into plain text.
                        if ( $idx === $lastIdx && !$this->_isTagValid( $html_buffer ) ) {
                            $state             = static::STATE_PLAINTEXT; // but we work in XML text, so encode it
                            $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
                            $html_buffer       = '';

                            // NOTE(review): runs on the already emptied buffer,
                            // exactly as the original code did.
                            $this->flagSegmentIfTagValid( $html_buffer );
                        }
                        break;
                }

            } elseif ( $state === static::STATE_COMMENT ) {

                $html_buffer .= $char;

                if ( $char === '>' && substr( $html_buffer, -3 ) === '-->' ) {
                    // close the comment
                    $state       = static::STATE_PLAINTEXT;
                    $output      .= $this->_finalizeScriptTag( $html_buffer );
                    $html_buffer = '';

                    // NOTE(review): runs on the already emptied buffer,
                    // exactly as the original code did.
                    $this->flagSegmentIfTagValid( $html_buffer );
                }

            } elseif ( $state === static::STATE_JS_CSS ) {

                $html_buffer .= $char;

                if ( $char === '>' && in_array( substr( $html_buffer, -6 ), [ 'cript>', 'style>' ], true ) ) {
                    // close the script/style element
                    $state       = static::STATE_PLAINTEXT;
                    $output      .= $this->_finalizeScriptTag( $html_buffer );
                    $html_buffer = '';

                    // NOTE(review): runs on the already emptied buffer,
                    // exactly as the original code did.
                    $this->flagSegmentIfTagValid( $html_buffer );
                }

            }
        }

        // Partial HTML left over: emit it as wrong HTML to preserve the string content.
        if ( '' !== $html_buffer ) {
            $this->flagSegmentIfTagValid( $html_buffer );
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // The string ends with plain text, so no state change flushed it.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }

    /**
     * Tells whether the tag buffer (without its closing '>') opens a
     * <script> or <style> element, with or without attributes.
     *
     * @param string $buffer
     *
     * @return bool
     */
    private function startsRawTextElement( $buffer ) {
        return preg_match( '/^<(?:script|style)(?:\s|\z)/', $buffer ) === 1;
    }

    /**
     * Invokes _setSegmentContainsHtml() when $buffer passes _isTagValid() and
     * a pipeline was supplied. _isTagValid() is always evaluated first.
     *
     * @param string $buffer
     */
    private function flagSegmentIfTagValid( $buffer ) {
        if ( $this->_isTagValid( $buffer ) && null !== $this->pipeline ) {
            $this->_setSegmentContainsHtml();
        }
    }

}
302