HtmlParser::handleHtmlState()   B
last analyzed

Complexity

Conditions 8
Paths 8

Size

Total Lines 22
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 8

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 21
c 1
b 0
f 0
nc 8
nop 8
dl 0
loc 22
ccs 20
cts 20
cp 1
crap 8
rs 8.4444

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters\Html;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Commons\Pipeline;
14
use ReflectionException;
15
use ReflectionMethod;
16
use RuntimeException;
17
18
/**
19
 * HtmlParser
20
 *
21
 * A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments
22
 * in a given input string. It processes segments statefully, validates potential HTML tags,
23
 * and invokes handler callbacks for fragment finalization and error correction.
24
 *
25
 * Usage:
26
 *      - Register a callback handler via registerCallbacksHandler() (its class must use the CallbacksHandler trait).
27
 *      - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state.
28
 *
29
 * State Machine:
30
 *      - STATE_PLAINTEXT: Outside any tag, collecting plain text.
31
 *      - STATE_HTML:      Inside angle brackets `<...>`, potentially a tag.
32
 *      - STATE_COMMENT:   Inside a comment `<!-- ... -->`.
33
 *      - STATE_JS_CSS:    Inside <script> or <style> tags.
34
 *
35
 * Callbacks:
36
 *      The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization,
37
 *      HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection.
38
 *
39
 * @author  domenico [email protected] / [email protected]
40
 * @package Matecat\SubFiltering\Filters\Html
41
 *
42
 * @method _isTagValid( string $buffer )            Validate whether $buffer is a correct HTML tag.
43
 * @method _finalizeMarkupTag( string $buffer )        Handle completion of a valid HTML tag.
44
 * @method _fixWrongBuffer( string $buffer )         Correct and process abnormal tag-like input.
45
 * @method _finalizeScriptTag( string $buffer )      Finalize a <script>/<style> or comment content.
46
 * @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far.
47
 * @method _setSegmentContainsMarkup()                Set a flag on the parent pipeline that HTML has been found.
48
 */
49
class HtmlParser {
50
51
    /** Parser states for input processing (plaintext, HTML tag, comment, or script/style). */
52
    const STATE_PLAINTEXT = 0;
53
    const STATE_HTML      = 1;
54
    const STATE_COMMENT   = 2;
55
    const STATE_JS_CSS    = 3;
56
57
    /**
58
     * Processing pipeline; used for HTML presence flagging.
59
     * @var Pipeline|null
60
     */
61
    private ?Pipeline $pipeline;
62
63
    /**
64
     * The handler object providing callback implementations (must use CallbacksHandler trait).
65
     * @var AbstractHandler
66
     */
67
    protected AbstractHandler $callbacksHandler;
68
69
    /**
     * Creates a parser, optionally remembering a processing pipeline.
     *
     * The value stored here is overwritten with the handler's own pipeline
     * once registerCallbacksHandler() is called.
     *
     * @param Pipeline|null $pipeline Pipeline to keep, or null when none is available.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }
77
78
    /**
     * Registers the handler whose callbacks are invoked during parsing.
     *
     * The handler's class, or any class it inherits from, must use the
     * CallbacksHandler trait. The whole inheritance chain is inspected, so a trait
     * declared on a distant ancestor is accepted as well. After a successful
     * registration the parser adopts the pipeline returned by the handler's
     * getPipeline().
     *
     * @param AbstractHandler $class Handler providing the parsing callbacks.
     *
     * @throws RuntimeException If no class in the handler's lineage uses the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        // class_uses() lists only the traits declared directly on one class, so every
        // class of the lineage (the handler's own class first) has to be examined.
        $lineage = array_merge( [ get_class( $class ) ], array_values( class_parents( $class ) ?: [] ) );

        $traitFound = false;
        foreach ( $lineage as $className ) {
            if ( in_array( CallbacksHandler::class, class_uses( $className ) ?: [], true ) ) {
                $traitFound = true;
                break;
            }
        }

        if ( !$traitFound ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }
94
95
    /**
     * Forwards calls to undefined parser methods to the registered callbacks handler.
     *
     * The target method is resolved through reflection so that protected and private
     * handler methods can be reached too. Only the first element of $arguments is
     * passed on; when the list is empty, null is passed instead.
     *
     * @param string $name      Name of the handler method to run.
     * @param array  $arguments Call arguments; everything after the first element is ignored.
     *
     * @return mixed Whatever the handler method returns.
     * @throws ReflectionException If the handler has no method called $name.
     */
    public function __call( string $name, array $arguments = [] ) {
        $handler = $this->callbacksHandler;
        $method  = new ReflectionMethod( $handler, $name );

        // Non-public callbacks must be unlocked before reflection may invoke them.
        if ( !$method->isPublic() ) {
            $method->setAccessible( true );
        }

        $firstArgument = $arguments[ 0 ] ?? null;

        return $method->invoke( $handler, $firstArgument );
    }
119
120
    /**
     * Runs the state machine over a segment and returns the rebuilt string.
     *
     * The input is walked one Unicode code point at a time. Depending on the current
     * state, each character is handed to the plain-text, HTML, comment or script/style
     * handler, which update the shared buffers by reference and append finished
     * fragments to the result. Whatever is still buffered once the input ends is
     * flushed: an unfinished tag goes through _fixWrongBuffer(), leftover text through
     * _finalizePlainText().
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string The processed segment; an empty string when there is nothing to iterate over.
     */
    public function transform( string $segment ): string {
        // The /u modifier makes preg_split() cut on code points rather than bytes.
        $chars = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( !$chars ) {
            // Empty input, or preg_split() returned false.
            return '';
        }

        $total      = count( $chars );
        $result     = '';
        $tagBuffer  = '';
        $textBuffer = '';
        $quoteChar  = '';
        $state      = static::STATE_PLAINTEXT;

        for ( $position = 0; $position < $total; $position++ ) {
            $current = $chars[ $position ];

            switch ( $state ) {
                case static::STATE_PLAINTEXT:
                    $this->handlePlainTextState( $current, $state, $tagBuffer, $textBuffer, $result );
                    break;
                case static::STATE_HTML:
                    $this->handleHtmlState( $current, $position, $total, $state, $tagBuffer, $textBuffer, $result, $quoteChar );
                    break;
                case static::STATE_COMMENT:
                    $this->handleCommentState( $current, $state, $tagBuffer, $result );
                    break;
                case static::STATE_JS_CSS:
                    $this->handleJsCssState( $current, $state, $tagBuffer, $result );
                    break;
            }
        }

        // Input ended in the middle of a tag-like fragment: keep its content through the
        // error-correction callback, flagging markup only when it validates as a tag.
        if ( !empty( $tagBuffer ) ) {
            $validatesAsTag = $this->_isTagValid( $tagBuffer );
            if ( $validatesAsTag && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
            $result .= $this->_fixWrongBuffer( $tagBuffer );
        }

        // Flush the plain text that was still being collected.
        if ( $textBuffer !== '' ) {
            $result .= $this->_finalizePlainText( $textBuffer );
        }

        return $result;
    }
177
178
    /**
     * Processes one character while the parser is in STATE_PLAINTEXT.
     *
     * - '<' switches to STATE_HTML, starts the tag buffer and flushes the text collected so far.
     * - '>' is passed through _fixWrongBuffer() and the result is kept as text.
     * - Any other character is appended to the text buffer unchanged.
     *
     * @param string $char              Character being processed.
     * @param int    $state             Current parser state (updated by reference).
     * @param string $html_buffer       Tag buffer (updated by reference).
     * @param string $plain_text_buffer Pending plain text (updated by reference).
     * @param string $output            Result accumulated so far (updated by reference).
     */
    private function handlePlainTextState( string $char, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output ): void {
        if ( $char === '<' ) {
            // A tag may be starting: hand over the text gathered so far.
            $state             = static::STATE_HTML;
            $html_buffer       .= $char;
            $output            .= $this->_finalizePlainText( $plain_text_buffer );
            $plain_text_buffer = '';

            return;
        }

        if ( $char === '>' ) {
            // A stray '>' outside any tag: let the error-correction callback rewrite it.
            $plain_text_buffer .= $this->_fixWrongBuffer( $char );

            return;
        }

        $plain_text_buffer .= $char;
    }
200
201
    /**
     * Processes one character while the parser is in STATE_HTML.
     *
     * Pure dispatcher: it inspects the character and delegates to the matching on*InHtml()
     * helper. The parser state and buffers are local to transform() and are passed in by
     * reference, so the helpers modify them in place.
     *
     * @param string $char              Character being processed.
     * @param int    $idx               Position of the character in the input.
     * @param int    $charCount         Total number of characters in the input.
     * @param int    $state             Current parser state (updated by reference).
     * @param string $html_buffer       Tag buffer (updated by reference).
     * @param string $plain_text_buffer Pending plain text (updated by reference).
     * @param string $output            Result accumulated so far (updated by reference).
     * @param string $in_quote_char     Currently open quote character, or '' (updated by reference).
     */
    private function handleHtmlState( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output, string &$in_quote_char ): void {
        if ( $char === '<' ) {
            $this->onLessThanInHtml( $char, $output, $html_buffer );

            return;
        }

        if ( $char === '>' ) {
            $this->onGreaterThanInHtml( $char, $state, $html_buffer, $output, $in_quote_char );

            return;
        }

        if ( $char === '"' || $char === '\'' ) {
            $this->onQuoteInHtml( $char, $html_buffer, $in_quote_char );

            return;
        }

        if ( $char === '-' ) {
            $this->onDashInHtml( $char, $state, $html_buffer );

            return;
        }

        if ( $char === ' ' || $char === "\n" ) {
            $this->onWhitespaceInHtml( $char, $state, $html_buffer, $output );

            return;
        }

        $this->onDefaultCharInHtml( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer );
    }
230
231
    /**
     * Handles a '<' met while a tag is already being buffered.
     *
     * A second '<' means the buffered fragment was not a tag after all: it is passed to
     * _fixWrongBuffer() and emitted, and buffering restarts from the new '<'.
     * See https://www.w3.org/TR/xml/#charsets
     *
     * @param string $char        The '<' character.
     * @param string $output      Result accumulated so far (updated by reference).
     * @param string $html_buffer Tag buffer (updated by reference).
     */
    private function onLessThanInHtml( string $char, string &$output, string &$html_buffer ): void {
        $rejectedFragment = $html_buffer;
        $html_buffer      = $char;
        $output           .= $this->_fixWrongBuffer( $rejectedFragment );
    }
240
241
    /**
     * Handles a '>' met while a tag is being buffered.
     *
     * When the buffer opens a <script> or <style> element, the parser moves to
     * STATE_JS_CSS and keeps buffering. Otherwise the tag is complete: the parser
     * returns to STATE_PLAINTEXT, the quote tracker is reset, and the tag is emitted
     * through _finalizeMarkupTag() when valid or _fixWrongBuffer() when not.
     *
     * @param string $char          The '>' character.
     * @param int    $state         Current parser state (updated by reference).
     * @param string $html_buffer   Tag buffer (updated by reference).
     * @param string $output        Result accumulated so far (updated by reference).
     * @param string $in_quote_char Currently open quote character, or '' (updated by reference).
     */
    private function onGreaterThanInHtml( string $char, int &$state, string &$html_buffer, string &$output, string &$in_quote_char ): void {
        // The check must look at the buffer as it was before the '>' is appended.
        $opensScriptOrStyle = $this->isScriptOrStyleTag( $html_buffer );
        $html_buffer        .= $char;

        if ( $opensScriptOrStyle ) {
            $state = static::STATE_JS_CSS;

            return;
        }

        $state         = static::STATE_PLAINTEXT;
        $in_quote_char = '';
        $completedTag  = $html_buffer;
        $html_buffer   = '';

        if ( !$this->_isTagValid( $completedTag ) ) {
            // Not a well-formed tag: let the error-correction callback decide what to emit.
            $output .= $this->_fixWrongBuffer( $completedTag );

            return;
        }

        $output .= $this->_finalizeMarkupTag( $completedTag );
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
268
269
    /**
     * Handles a single or double quote met while a tag is being buffered.
     *
     * The quote is always appended to the tag buffer. The quote tracker is opened when
     * no quote is active, closed when the same quote character comes back, and left
     * untouched when the other kind of quote appears inside an open one.
     *
     * @param string $char          The quote character.
     * @param string $html_buffer   Tag buffer (updated by reference).
     * @param string $in_quote_char Currently open quote character, or '' (updated by reference).
     */
    private function onQuoteInHtml( string $char, string &$html_buffer, string &$in_quote_char ): void {
        $html_buffer .= $char;

        if ( $in_quote_char === '' ) {
            $in_quote_char = $char; // entering a quoted value

            return;
        }

        if ( $in_quote_char === $char ) {
            $in_quote_char = ''; // leaving the quoted value
        }
    }
281
282
    /**
     * Handles a '-' met while a tag is being buffered.
     *
     * The dash is always appended to the tag buffer. When the buffer held exactly '<!-'
     * beforehand, the dash completes the comment opener '<!--' and the parser moves to
     * STATE_COMMENT.
     *
     * @param string $char        The '-' character.
     * @param int    $state       Current parser state (updated by reference).
     * @param string $html_buffer Tag buffer (updated by reference).
     */
    private function onDashInHtml( string $char, int &$state, string &$html_buffer ): void {
        $completesCommentOpener = ( '<!-' === $html_buffer );
        $html_buffer            .= $char;

        if ( $completesCommentOpener ) {
            $state = static::STATE_COMMENT;
        }
    }
292
293
    /**
     * Handles a space or newline met while a tag is being buffered.
     *
     * Whitespace directly after the opening '<' cannot start a tag: the pair is passed to
     * _fixWrongBuffer(), the parser falls back to STATE_PLAINTEXT, and the markup flag is
     * raised when a pipeline is available. In every other position the whitespace is
     * appended to the tag buffer.
     *
     * @param string $char        The whitespace character.
     * @param int    $state       Current parser state (updated by reference).
     * @param string $html_buffer Tag buffer (updated by reference).
     * @param string $output      Result accumulated so far (updated by reference).
     */
    private function onWhitespaceInHtml( string $char, int &$state, string &$html_buffer, string &$output ): void {
        if ( $html_buffer !== '<' ) {
            // Ordinary whitespace inside a tag, e.g. between attributes.
            $html_buffer .= $char;

            return;
        }

        $html_buffer = '';
        $state       = static::STATE_PLAINTEXT;
        $output      .= $this->_fixWrongBuffer( '<' . $char );

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
310
311
    /**
     * Handles any character without a dedicated handler while a tag is being buffered.
     *
     * The character is appended to the tag buffer. When it is the last character of the
     * input and the buffer does not validate as a tag, the buffer is passed through
     * _fixWrongBuffer(), moved to the plain-text buffer, and the parser returns to
     * STATE_PLAINTEXT.
     *
     * @param string $char              Character being processed.
     * @param int    $idx               Position of the character in the input.
     * @param int    $charCount         Total number of characters in the input.
     * @param int    $state             Current parser state (updated by reference).
     * @param string $html_buffer       Tag buffer (updated by reference).
     * @param string $plain_text_buffer Pending plain text (updated by reference).
     */
    private function onDefaultCharInHtml( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer ): void {
        $html_buffer .= $char;

        $isLastChar = ( $charCount - 1 ) === $idx;
        if ( !$isLastChar ) {
            return;
        }

        // Validation is attempted only once the input is exhausted.
        if ( $this->_isTagValid( $html_buffer ) ) {
            return;
        }

        $state             = static::STATE_PLAINTEXT;
        $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
        $html_buffer       = '';
    }
323
324
    /**
     * Processes one character while the parser is in STATE_COMMENT.
     *
     * Every character is buffered. When a '>' makes the buffer end with '-->', the
     * comment is complete: it is emitted through _finalizeScriptTag(), the parser
     * returns to STATE_PLAINTEXT, and the markup flag is raised when a pipeline is
     * available.
     *
     * @param string $char        Character being processed.
     * @param int    $state       Current parser state (updated by reference).
     * @param string $html_buffer Comment buffer (updated by reference).
     * @param string $output      Result accumulated so far (updated by reference).
     */
    private function handleCommentState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        if ( $char !== '>' ) {
            return;
        }

        if ( substr( $html_buffer, -3 ) !== '-->' ) {
            // A '>' inside the comment body, not the terminator.
            return;
        }

        $finishedComment = $html_buffer;
        $html_buffer     = '';
        $state           = static::STATE_PLAINTEXT;
        $output          .= $this->_finalizeScriptTag( $finishedComment );

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
339
340
    /**
     * Processes one character while the parser is in STATE_JS_CSS.
     *
     * Every character is buffered. The block ends only when a '>' makes the buffer end
     * with a real closing tag, '</script>' or '</style>'. Matching the full closing tag
     * (rather than just the trailing 'cript>' / 'style>') keeps block content such as
     * the CSS selector '.my-style>li' or a '<noscript>' string literal from ending the
     * block too early.
     *
     * On close, the buffered block is emitted through _finalizeScriptTag(), the parser
     * returns to STATE_PLAINTEXT, and the markup flag is raised when a pipeline is
     * available.
     *
     * @param string $char        Character being processed.
     * @param int    $state       Current parser state (updated by reference).
     * @param string $html_buffer Script/style buffer (updated by reference).
     * @param string $output      Result accumulated so far (updated by reference).
     */
    private function handleJsCssState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        if ( $char !== '>' ) {
            return;
        }

        $closesScript = substr( $html_buffer, -9 ) === '</script>';
        $closesStyle  = substr( $html_buffer, -8 ) === '</style>';
        if ( !$closesScript && !$closesStyle ) {
            // A '>' belonging to the block's content.
            return;
        }

        $state = static::STATE_PLAINTEXT;

        // The return value is ignored on purpose, as before.
        // NOTE(review): presumably kept for side effects of the handler's validation — confirm.
        $this->_isTagValid( $html_buffer );

        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';

        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }
358
359
    /**
     * Tells whether the buffered fragment opens a <script> or <style> element.
     *
     * The buffer is expected to hold the tag text collected before its closing '>'.
     * It matches when it starts with '<script' or '<style' and the name is followed
     * either by the end of the buffer or by whitespace (i.e. attributes follow).
     * Longer names such as '<styles' or '<scripted' are rejected. The comparison is
     * case-sensitive, in line with the case-sensitive closing-tag detection.
     *
     * @param string $html_buffer Tag text buffered so far, without the closing '>'.
     *
     * @return bool True when the buffer opens a script or style element.
     */
    private function isScriptOrStyleTag( string $html_buffer ): bool {
        // The previous fixed-width prefix comparison could never match '<style '
        // (7 characters against an 8-character slice), so '<style type="...">' was missed.
        return 1 === preg_match( '/^<(?:script|style)(?:\s|\z)/', $html_buffer );
    }
366
367
}
368