Passed
Push — master ( eead79...194803 )
by Domenico
02:03
created

HtmlParser::handleHtmlState()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 22
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 8

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 21
c 1
b 0
f 0
nc 8
nop 8
dl 0
loc 22
ccs 20
cts 20
cp 1
crap 8
rs 8.4444

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent when you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters\Html;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Commons\Pipeline;
14
use ReflectionException;
15
use ReflectionMethod;
16
use RuntimeException;
17
18
/**
19
 * HtmlParser
20
 *
21
 * A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments
22
 * in a given input string. It processes segments statefully, validates potential HTML tags,
23
 * and invokes handler callbacks for fragment finalization and error correction.
24
 *
25
 * Usage:
26
 *      - Register a callback handler (must consume CallbacksHandler trait).
27
 *      - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state.
28
 *
29
 * State Machine:
30
 *      - STATE_PLAINTEXT: Outside any tag, collecting plain text.
31
 *      - STATE_HTML:      Inside angle brackets `<...>`, potentially a tag.
32
 *      - STATE_COMMENT:   Inside a comment `<!-- ... -->`.
33
 *      - STATE_JS_CSS:    Inside <script> or <style> tags.
34
 *
35
 * Callbacks:
36
 *      The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization,
37
 *      HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection.
38
 *
39
 * @author  domenico [email protected] / [email protected]
40
 * @package Matecat\SubFiltering\Filters\Html
41
 *
42
 * @method _isTagValid( string $buffer )            Validate whether $buffer is a correct HTML tag.
43
 * @method _finalizeMarkupTag( string $buffer )        Handle completion of a valid HTML tag.
44
 * @method _fixWrongBuffer( string $buffer )         Correct and process abnormal tag-like input.
45
 * @method _finalizeScriptTag( string $buffer )      Finalize a <script>/<style> or comment content.
46
 * @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far.
47
 * @method _setSegmentContainsMarkup()                Set a flag on the parent pipeline that HTML has been found.
48
 */
49
class HtmlParser {
50
51
    /** Parser state: outside any tag, accumulating plain text. */
    public const STATE_PLAINTEXT = 0;

    /** Parser state: between '<' and '>', buffering a candidate tag. */
    public const STATE_HTML = 1;

    /** Parser state: inside a comment ('<!--' seen, closing '-->' not yet seen). */
    public const STATE_COMMENT = 2;

    /** Parser state: inside the body of a <script> or <style> element. */
    public const STATE_JS_CSS = 3;

    /**
     * Pipeline the parser is attached to, if any. The parser only flags
     * detected markup when this is not null.
     *
     * @var Pipeline|null
     */
    private ?Pipeline $pipeline;

    /**
     * Handler that supplies the callback implementations reached through __call().
     * It is set by registerCallbacksHandler() and stays uninitialized until then.
     *
     * @var AbstractHandler
     */
    protected AbstractHandler $callbacksHandler;
68
69
    /**
     * Builds a parser, optionally bound to a pipeline.
     *
     * Note that registerCallbacksHandler() later overwrites this value with the
     * pipeline returned by the registered handler.
     *
     * @param Pipeline|null $pipeline Pipeline to store, or null when there is none.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }
77
78
    /**
     * Registers the handler whose methods are invoked as parsing callbacks.
     *
     * The handler, or any class it inherits from, must use the CallbacksHandler trait.
     * On success the parser also adopts the pipeline returned by the handler's getPipeline().
     *
     * @param AbstractHandler $class Handler providing the callback implementations.
     *
     * @throws RuntimeException If neither the handler nor any of its ancestors uses the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        // class_uses() only reports traits declared directly on the class it is given,
        // so collect them for the handler itself and for every ancestor in its chain.
        $traits = class_uses( $class ) ?: [];
        foreach ( class_parents( $class ) ?: [] as $parentName ) {
            // Both arrays are keyed by trait name, so the union operator de-duplicates.
            $traits += class_uses( $parentName ) ?: [];
        }

        if ( !in_array( CallbacksHandler::class, $traits, true ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }
94
95
    /**
     * Forwards calls to undefined parser methods to the registered callbacks handler.
     *
     * The target method is resolved through reflection so that protected and private
     * handler methods can be reached as well. Only the first element of $arguments is
     * forwarded; when no argument was supplied, null is passed instead.
     *
     * @param string $name      Name of the handler method to invoke.
     * @param array  $arguments Call arguments; only index 0 is used.
     *
     * @return mixed Whatever the handler method returns.
     * @throws ReflectionException If the handler has no method called $name.
     */
    public function __call( string $name, array $arguments = [] ) {
        $handler = $this->callbacksHandler;
        $method  = new ReflectionMethod( $handler, $name );

        // Non-public callbacks must be unlocked before they can be invoked from here.
        if ( !$method->isPublic() ) {
            $method->setAccessible( true );
        }

        $firstArgument = $arguments[ 0 ] ?? null;

        return $method->invoke( $handler, $firstArgument );
    }
119
120
    /**
     * Runs the state machine over $segment, one Unicode character at a time.
     *
     * Each character is dispatched to the handler for the current state. Those handlers
     * buffer text, finalize completed fragments through the registered callbacks and
     * append the results to the output. Whatever is still buffered at the end of the
     * input is flushed: a leftover tag buffer goes through _fixWrongBuffer(), leftover
     * plain text through _finalizePlainText().
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string The processed segment; '' when the split produces no characters
     *                (empty input, or a failed split — preg_split() returns false on error).
     */
    public function transform( string $segment ): string {
        // The 'u' modifier makes the empty pattern split on code points rather than bytes.
        $characters = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( empty( $characters ) ) {
            return '';
        }

        $total             = count( $characters );
        $output            = '';
        $plain_text_buffer = '';
        $html_buffer       = '';
        $in_quote_char     = '';
        $state             = static::STATE_PLAINTEXT;

        foreach ( $characters as $position => $character ) {
            if ( $state === static::STATE_PLAINTEXT ) {
                $this->handlePlainTextState( $character, $state, $html_buffer, $plain_text_buffer, $output );
            } elseif ( $state === static::STATE_HTML ) {
                $this->handleHtmlState( $character, $position, $total, $state, $html_buffer, $plain_text_buffer, $output, $in_quote_char );
            } elseif ( $state === static::STATE_COMMENT ) {
                $this->handleCommentState( $character, $state, $html_buffer, $output );
            } elseif ( $state === static::STATE_JS_CSS ) {
                $this->handleJsCssState( $character, $state, $html_buffer, $output );
            }
        }

        // Input ended in the middle of a tag-like fragment: emit it through the
        // error-correction callback, still flagging markup when it validates as a tag.
        if ( !empty( $html_buffer ) ) {
            if ( $this->_isTagValid( $html_buffer ) && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // Flush the remaining plain text, if any.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }
177
178
    /**
     * Consumes one character while the parser is in STATE_PLAINTEXT.
     *
     * - '<' flushes the pending plain text to the output and switches to STATE_HTML.
     * - '>' is a stray bracket; it is added to the plain text after _fixWrongBuffer().
     * - Any other character is appended to the plain text buffer unchanged.
     *
     * @param string $char              Character being processed.
     * @param int    $state             Current parser state (updated in place).
     * @param string $html_buffer       Buffer for the tag being collected (updated in place).
     * @param string $plain_text_buffer Pending plain text (updated in place).
     * @param string $output            Accumulated result (updated in place).
     */
    private function handlePlainTextState( string $char, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output ): void {
        if ( $char === '<' ) {
            // A tag may be starting: everything collected so far is final plain text.
            $state             = static::STATE_HTML;
            $html_buffer       .= $char;
            $output            .= $this->_finalizePlainText( $plain_text_buffer );
            $plain_text_buffer = '';

            return;
        }

        if ( $char === '>' ) {
            // Closing bracket without an opening one: keep it as (corrected) text.
            $plain_text_buffer .= $this->_fixWrongBuffer( $char );

            return;
        }

        $plain_text_buffer .= $char;
    }
200
201
    /**
     * Consumes one character while the parser is in STATE_HTML.
     *
     * Pure dispatcher: it selects the specialised on*InHtml() method for the character
     * and hands over the pieces of parser state that method needs. All state is passed
     * by reference; none of it is stored on the object.
     *
     * @param string $char              Character being processed.
     * @param int    $idx               Zero-based position of $char in the input.
     * @param int    $charCount         Total number of characters in the input.
     * @param int    $state             Current parser state (updated in place).
     * @param string $html_buffer       Tag-like fragment collected so far (updated in place).
     * @param string $plain_text_buffer Pending plain text (updated in place).
     * @param string $output            Accumulated result (updated in place).
     * @param string $in_quote_char     Quote character currently open, or '' (updated in place).
     */
    private function handleHtmlState( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output, string &$in_quote_char ): void {
        if ( $char === '<' ) {
            $this->onLessThanInHtml( $char, $output, $html_buffer );
        } elseif ( $char === '>' ) {
            $this->onGreaterThanInHtml( $char, $state, $html_buffer, $output, $in_quote_char );
        } elseif ( $char === '"' || $char === '\'' ) {
            $this->onQuoteInHtml( $char, $html_buffer, $in_quote_char );
        } elseif ( $char === '-' ) {
            $this->onDashInHtml( $char, $state, $html_buffer );
        } elseif ( $char === ' ' || $char === "\n" ) {
            $this->onWhitespaceInHtml( $char, $state, $html_buffer, $output );
        } else {
            $this->onDefaultCharInHtml( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer );
        }
    }
230
231
    /**
     * Handles a '<' met while a tag-like fragment is already being collected.
     *
     * A second '<' means the fragment buffered so far was not a tag. It is emitted
     * through _fixWrongBuffer() and collection restarts from the new '<'.
     * See https://www.w3.org/TR/xml/#charsets
     *
     * @param string $char        The '<' character.
     * @param string $output      Accumulated result (updated in place).
     * @param string $html_buffer Fragment collected so far; reset to $char.
     */
    private function onLessThanInHtml( string $char, string &$output, string &$html_buffer ): void {
        $corrected   = $this->_fixWrongBuffer( $html_buffer );
        $output      .= $corrected;
        $html_buffer = $char;
    }
240
241
    /**
     * Handles the '>' that terminates the fragment being collected in STATE_HTML.
     *
     * If the fragment opens a <script>/<style> element the parser moves to STATE_JS_CSS
     * and keeps buffering. Otherwise the fragment is complete: it is emitted through
     * _finalizeMarkupTag() when _isTagValid() accepts it (also flagging markup on the
     * pipeline, if one is set), or through _fixWrongBuffer() when it does not.
     *
     * @param string $char          The '>' character.
     * @param int    $state         Current parser state (updated in place).
     * @param string $html_buffer   Fragment collected so far (updated in place).
     * @param string $output        Accumulated result (updated in place).
     * @param string $in_quote_char Open quote tracker; cleared when a regular tag ends.
     */
    private function onGreaterThanInHtml( string $char, int &$state, string &$html_buffer, string &$output, string &$in_quote_char ): void {
        // The check must look at the buffer before the '>' is appended.
        $opensScriptOrStyle = $this->isScriptOrStyleTag( $html_buffer );
        $html_buffer        .= $char;

        if ( $opensScriptOrStyle ) {
            $state = static::STATE_JS_CSS;

            return;
        }

        $state         = static::STATE_PLAINTEXT;
        $in_quote_char = '';

        if ( !$this->_isTagValid( $html_buffer ) ) {
            // Not a real tag: let the handler correct it.
            $output      .= $this->_fixWrongBuffer( $html_buffer );
            $html_buffer = '';

            return;
        }

        $output .= $this->_finalizeMarkupTag( $html_buffer );
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
        $html_buffer = '';
    }
268
269
    /**
     * Handles a single or double quote met in STATE_HTML.
     *
     * Tracks which quote character, if any, is currently open: the first quote opens,
     * the same quote closes, and a different quote inside an open one leaves the
     * tracker untouched. The character is always appended to the buffer.
     *
     * @param string $char          The quote character.
     * @param string $html_buffer   Fragment collected so far (updated in place).
     * @param string $in_quote_char Quote character currently open, or '' (updated in place).
     */
    private function onQuoteInHtml( string $char, string &$html_buffer, string &$in_quote_char ): void {
        if ( $in_quote_char == '' ) {
            // No quote open yet: this one opens.
            $in_quote_char = $char;
        } elseif ( $in_quote_char == $char ) {
            // Matching quote: close it.
            $in_quote_char = '';
        }

        $html_buffer .= $char;
    }
281
282
    /**
     * Handles a '-' met in STATE_HTML.
     *
     * When the buffer holds exactly '<!-', this dash completes the comment opener
     * '<!--' and the parser switches to STATE_COMMENT. The dash is appended either way.
     *
     * @param string $char        The '-' character.
     * @param int    $state       Current parser state (updated in place).
     * @param string $html_buffer Fragment collected so far (updated in place).
     */
    private function onDashInHtml( string $char, int &$state, string &$html_buffer ): void {
        $completesCommentOpener = ( '<!-' === $html_buffer );
        $html_buffer            .= $char;

        if ( $completesCommentOpener ) {
            $state = static::STATE_COMMENT;
        }
    }
292
293
    /**
     * Handles a space or newline met in STATE_HTML.
     *
     * Whitespace right after the opening '<' means this is not a tag: '<' plus the
     * whitespace is emitted through _fixWrongBuffer(), the parser returns to
     * STATE_PLAINTEXT and, when a pipeline is set, the segment is flagged as containing
     * markup. Anywhere else the whitespace is simply appended to the buffer.
     *
     * @param string $char        The whitespace character.
     * @param int    $state       Current parser state (updated in place).
     * @param string $html_buffer Fragment collected so far (updated in place).
     * @param string $output      Accumulated result (updated in place).
     */
    private function onWhitespaceInHtml( string $char, int &$state, string &$html_buffer, string &$output ): void {
        if ( $html_buffer !== '<' ) {
            // Ordinary whitespace inside a tag, e.g. between attributes.
            $html_buffer .= $char;

            return;
        }

        $state       = static::STATE_PLAINTEXT;
        $output      .= $this->_fixWrongBuffer( '<' . $char );
        $html_buffer = '';

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
310
311
    /**
     * Handles any character in STATE_HTML that has no dedicated handler.
     *
     * The character is appended to the buffer. If it is the last character of the input
     * and the buffer does not validate as a tag, the buffer is moved to the plain text
     * buffer after _fixWrongBuffer() and the parser returns to STATE_PLAINTEXT.
     *
     * @param string $char              Character being processed.
     * @param int    $idx               Zero-based position of $char in the input.
     * @param int    $charCount         Total number of characters in the input.
     * @param int    $state             Current parser state (updated in place).
     * @param string $html_buffer       Fragment collected so far (updated in place).
     * @param string $plain_text_buffer Pending plain text (updated in place).
     */
    private function onDefaultCharInHtml( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer ): void {
        $html_buffer .= $char;

        $isLastCharacter = ( $idx === ( $charCount - 1 ) );
        if ( !$isLastCharacter ) {
            return;
        }

        // Validation is attempted only once the input is exhausted.
        if ( $this->_isTagValid( $html_buffer ) ) {
            return;
        }

        $state             = static::STATE_PLAINTEXT;
        $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
        $html_buffer       = '';
    }
323
324
    /**
     * Consumes one character while the parser is in STATE_COMMENT.
     *
     * Characters are buffered until the buffer ends with '-->'. The whole comment is
     * then emitted through _finalizeScriptTag(), the parser returns to STATE_PLAINTEXT
     * and, when a pipeline is set, the segment is flagged as containing markup.
     *
     * @param string $char        Character being processed.
     * @param int    $state       Current parser state (updated in place).
     * @param string $html_buffer Comment text collected so far (updated in place).
     * @param string $output      Accumulated result (updated in place).
     */
    private function handleCommentState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        // Only a '>' can complete the terminator, so skip the substr() otherwise.
        if ( $char !== '>' ) {
            return;
        }
        if ( substr( $html_buffer, -3 ) !== '-->' ) {
            return;
        }

        $state       = static::STATE_PLAINTEXT;
        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
339
340
    /**
     * Consumes one character while the parser is in STATE_JS_CSS.
     *
     * Characters are buffered until the buffer ends with the closing tag that matches
     * the element that opened the block ('</script>' for a buffer starting with
     * '<script', '</style>' otherwise). Requiring the exact, matching closing tag keeps
     * a '<style>', '<script>' or '</noscript>' appearing inside the body from ending the
     * block early. Once closed, the whole block is emitted through _finalizeScriptTag(),
     * the parser returns to STATE_PLAINTEXT and, when a pipeline is set, the segment is
     * flagged as containing markup.
     *
     * @param string $char        Character being processed.
     * @param int    $state       Current parser state (updated in place).
     * @param string $html_buffer Block collected so far, opening tag included (updated in place).
     * @param string $output      Accumulated result (updated in place).
     */
    private function handleJsCssState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        // A closing tag can only be completed by '>'.
        if ( $char !== '>' ) {
            return;
        }

        // The buffer still starts with the opening tag, which tells us which closer to expect.
        $closingTag = ( 0 === strncmp( $html_buffer, '<script', 7 ) ) ? '</script>' : '</style>';
        if ( substr( $html_buffer, -strlen( $closingTag ) ) !== $closingTag ) {
            return;
        }

        $state = static::STATE_PLAINTEXT;
        // NOTE(review): the result is discarded; the call is kept in case the handler's
        // validator has side effects — confirm against the CallbacksHandler implementation.
        $this->_isTagValid( $html_buffer );
        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
358
359
    /**
     * Tells whether the buffered fragment is the opening of a <script> or <style> tag.
     *
     * The buffer is expected without its terminating '>'. It matches when it starts
     * with '<script' or '<style' and the tag name is followed by whitespace (attributes
     * follow) or by the end of the buffer. The match is case-sensitive, in line with the
     * case-sensitive closing-tag detection used for STATE_JS_CSS.
     *
     * @param string $html_buffer Fragment collected so far, e.g. '<style type="text/css"'.
     *
     * @return bool True when the fragment opens a script or style element.
     */
    private function isScriptOrStyleTag( string $html_buffer ): bool {
        // Anchoring on the tag name avoids comparing a fixed-length prefix, which could
        // never equal the 7-character '<style ' and so missed <style> tags with attributes.
        return 1 === preg_match( '/^<(?:script|style)(?:\s|\z)/', $html_buffer );
    }
366
367
}
368