HtmlParser::transform()   B
last analyzed

Complexity

Conditions 11
Paths 37

Size

Total Lines 45
Code Lines 30

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 11

Importance

Changes 4
Bugs 0 Features 0
Metric Value
cc 11
eloc 30
c 4
b 0
f 0
nc 37
nop 1
dl 0
loc 45
ccs 30
cts 30
cp 1
crap 11
rs 7.3166

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters\Html;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Commons\Pipeline;
14
use ReflectionException;
15
use ReflectionMethod;
16
use RuntimeException;
17
18
/**
19
 * HtmlParser
20
 *
21
 * A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments
22
 * in a given input string. It processes segments statefully, validates potential HTML tags,
23
 * and invokes handler callbacks for fragment finalization and error correction.
24
 *
25
 * Usage:
26
 *      - Register a callback handler (must consume CallbacksHandler trait).
27
 *      - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state.
28
 *
29
 * State Machine:
30
 *      - STATE_PLAINTEXT: Outside any tag, collecting plain text.
31
 *      - STATE_HTML:      Inside angle brackets `<...>`, potentially a tag.
32
 *      - STATE_COMMENT:   Inside a comment `<!-- ... -->`.
33
 *      - STATE_JS_CSS:    Inside <script> or <style> tags.
34
 *
35
 * Callbacks:
36
 *      The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization,
37
 *      HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection.
38
 *
39
 * @author  domenico [email protected] / [email protected]
40
 * @package Matecat\SubFiltering\Filters\Html
41
 *
42
 * @method _isTagValid( string $buffer )            Validate whether $buffer is a correct HTML tag.
43
 * @method _finalizeMarkupTag( string $buffer )        Handle completion of a valid HTML tag.
44
 * @method _fixWrongBuffer( string $buffer )         Correct and process abnormal tag-like input.
45
 * @method _finalizeScriptTag( string $buffer )      Finalize a <script>/<style> or comment content.
46
 * @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far.
47
 * @method _setSegmentContainsMarkup()                Set a flag on the parent pipeline that HTML has been found.
48
 */
49
class HtmlParser {

    /** Parser states for input processing (plaintext, HTML tag, comment, or script/style). */
    const STATE_PLAINTEXT = 0;
    const STATE_HTML      = 1;
    const STATE_COMMENT   = 2;
    const STATE_JS_CSS    = 3;

    /**
     * Processing pipeline. Within this class it is only compared against null to decide
     * whether the handler's "segment contains markup" flag callback is invoked.
     *
     * @var Pipeline|null
     */
    private ?Pipeline $pipeline;

    /**
     * The handler object providing callback implementations (must use CallbacksHandler trait).
     * Unset until registerCallbacksHandler() has been called.
     *
     * @var AbstractHandler
     */
    protected AbstractHandler $callbacksHandler;

    /**
     * HtmlParser constructor.
     *
     * @param Pipeline|null $pipeline Optional pipeline; replaced by the handler's pipeline
     *                                once registerCallbacksHandler() is called.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }

    /**
     * Registers a handler for callbacks invoked during parsing.
     *
     * The handler, or any class it extends, must use the CallbacksHandler trait (checked at runtime).
     * On success the parser's pipeline is replaced with the one returned by the handler's getPipeline().
     *
     * @param AbstractHandler $class Handler implementing required callbacks.
     *
     * @throws RuntimeException If the handler does not use the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        if ( !$this->usesCallbacksHandlerTrait( $class ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }

    /**
     * Tells whether the handler's class, or any of its ancestors, uses the CallbacksHandler trait.
     *
     * class_uses() reports only the traits declared directly on the class it is given, so every
     * class in the inheritance chain is inspected individually.
     *
     * @param AbstractHandler $handler Handler instance to inspect.
     *
     * @return bool True when the trait is found somewhere in the class lineage.
     */
    private function usesCallbacksHandlerTrait( AbstractHandler $handler ): bool {
        $ancestors = class_parents( $handler );
        $lineage   = array_merge( [ get_class( $handler ) ], is_array( $ancestors ) ? array_values( $ancestors ) : [] );

        foreach ( $lineage as $className ) {
            $traits = class_uses( $className );
            if ( is_array( $traits ) && in_array( CallbacksHandler::class, $traits, true ) ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Magic invoker for protected/private methods on the registered callbacks handler.
     *
     * Any call to a method this class does not define (the `_isTagValid()`, `_finalize*()`,
     * `_fixWrongBuffer()` and `_setSegmentContainsMarkup()` calls below) lands here and is forwarded
     * to the handler through reflection, so the handler can keep those callbacks non-public.
     *
     * Only the first element of $arguments is forwarded; when no argument was given, null is passed.
     *
     * @param string   $name      Method name to invoke.
     * @param string[] $arguments Single-element arguments array for handler callback.
     *
     * @return mixed             Return value from the handler's method.
     * @throws ReflectionException If a method cannot be found/reflected.
     */
    public function __call( string $name, array $arguments = [] ) {

        // Create a ReflectionMethod instance for the method being called on the callback handler
        $reflector = new ReflectionMethod( $this->callbacksHandler, $name );

        // If the method is not public, make it accessible
        if ( !$reflector->isPublic() ) {
            $reflector->setAccessible( true );
        }

        // Invoke the method on the callback handler with the (single) provided argument
        return $reflector->invoke( $this->callbacksHandler, $arguments[ 0 ] ?? null );
    }

    /**
     * Parses and transforms an input string segment, differentiating between
     * plain text, HTML tags, comments, and <script>/<style> blocks.
     *
     * The segment is walked one code point at a time through a small state machine. Each completed
     * fragment is handed to the matching handler callback and the callback's return value is
     * appended to the output.
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string         The processed segment. An empty string is returned when the input is
     *                        empty or cannot be split (preg_split() returns false, e.g. on malformed UTF-8).
     */
    public function transform( string $segment ): string {
        // Split input into Unicode codepoints for accurate char-by-char iteration.
        $originalSplit = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( empty( $originalSplit ) ) {
            return '';
        }

        $state             = static::STATE_PLAINTEXT;
        $html_buffer       = '';
        $plain_text_buffer = '';
        $in_quote_char     = '';
        $output            = '';
        $charCount         = count( $originalSplit );

        foreach ( $originalSplit as $idx => $char ) {
            switch ( $state ) {
                case static::STATE_PLAINTEXT:
                    $this->handlePlainTextState( $char, $state, $html_buffer, $plain_text_buffer, $output );
                    break;
                case static::STATE_HTML:
                    $this->handleHtmlState( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer, $output, $in_quote_char );
                    break;
                case static::STATE_COMMENT:
                    $this->handleCommentState( $char, $state, $html_buffer, $output );
                    break;
                case static::STATE_JS_CSS:
                    $this->handleJsCssState( $char, $state, $html_buffer, $output );
                    break;
            }
        }

        // HTML Partial at the end (unterminated tag, comment or script/style block):
        // treat as invalid and preserve the string content.
        if ( !empty( $html_buffer ) ) {
            if ( $this->_isTagValid( $html_buffer ) && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // Any trailing plain text: finalize it.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }

    /**
     * Handles character processing when in the STATE_PLAINTEXT.
     *
     * @param string $char              Current code point.
     * @param int    $state             Parser state; switched to STATE_HTML on '<'.
     * @param string $html_buffer       Tag buffer; receives the opening '<'.
     * @param string $plain_text_buffer Pending plain text; flushed to $output on '<'.
     * @param string $output            Accumulated result.
     */
    private function handlePlainTextState( string $char, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output ): void {
        switch ( $char ) {
            case '<':
                // Potential new tag starts; finalize plain text so far.
                $state             = static::STATE_HTML;
                $html_buffer       .= $char;
                $output            .= $this->_finalizePlainText( $plain_text_buffer );
                $plain_text_buffer = '';
                break;
            case '>':
                // Unescaped '>' in plaintext; treat as literal via error handling.
                $plain_text_buffer .= $this->_fixWrongBuffer( $char );
                break;
            default:
                // Collect as plain text.
                $plain_text_buffer .= $char;
                break;
        }
    }

    /**
     * Handles character processing when in the STATE_HTML.
     *
     * Acts as a dispatcher based on the character. All parser state (state, buffers, output,
     * quote marker) is received by reference and mutated in place by the delegated handlers.
     *
     * @param string $char              Current code point.
     * @param int    $idx               Zero-based index of $char in the input.
     * @param int    $charCount         Total number of code points in the input.
     * @param int    $state             Parser state.
     * @param string $html_buffer       Tag text collected since the opening '<'.
     * @param string $plain_text_buffer Pending plain text.
     * @param string $output            Accumulated result.
     * @param string $in_quote_char     Quote character currently open inside the tag, or ''.
     */
    private function handleHtmlState( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output, string &$in_quote_char ): void {
        switch ( $char ) {
            case '<':
                $this->onLessThanInHtml( $char, $output, $html_buffer );
                break;
            case '>':
                $this->onGreaterThanInHtml( $char, $state, $html_buffer, $output, $in_quote_char );
                break;
            case '"':
            case '\'':
                $this->onQuoteInHtml( $char, $html_buffer, $in_quote_char );
                break;
            case '-':
                $this->onDashInHtml( $char, $state, $html_buffer );
                break;
            case ' ':
            case "\n":
                $this->onWhitespaceInHtml( $char, $state, $html_buffer, $output );
                break;
            default:
                $this->onDefaultCharInHtml( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer );
                break;
        }
    }

    /**
     * Handles the '<' character in the HTML state.
     *
     * The buffer collected so far is passed to the error-correction callback and a new
     * candidate tag is started from the current '<'.
     */
    private function onLessThanInHtml( string $char, string &$output, string &$html_buffer ): void {
        // If we found a second less than symbol, the first one IS NOT a tag.
        // See https://www.w3.org/TR/xml/#charsets
        $output      .= $this->_fixWrongBuffer( $html_buffer );
        $html_buffer = $char;
    }

    /**
     * Handles the '>' character in the HTML state.
     *
     * An opening <script>/<style> tag switches to STATE_JS_CSS and keeps buffering. Any other
     * buffer is validated: valid tags go to _finalizeMarkupTag(), the rest to _fixWrongBuffer().
     *
     * NOTE(review): $in_quote_char is reset here but never consulted, so a '>' inside a quoted
     * attribute value still terminates the tag — confirm whether that is intended.
     */
    private function onGreaterThanInHtml( string $char, int &$state, string &$html_buffer, string &$output, string &$in_quote_char ): void {
        // End of current tag. Special-case for <script> or <style> blocks.
        if ( $this->isScriptOrStyleTag( $html_buffer ) ) {
            $html_buffer .= $char;
            $state       = static::STATE_JS_CSS;

            return;
        }

        $in_quote_char = '';
        $state         = static::STATE_PLAINTEXT;
        $html_buffer   .= $char;

        // Validate and finalize HTML tag. Invalid tags are corrected/errors handled.
        if ( $this->_isTagValid( $html_buffer ) ) {
            $output .= $this->_finalizeMarkupTag( $html_buffer );
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
        } else {
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }
        $html_buffer = '';
    }

    /**
     * Handles quote characters ('"' or "'") in the HTML state.
     *
     * Tracks which quote character, if any, is currently open; the character itself is
     * always appended to the tag buffer.
     */
    private function onQuoteInHtml( string $char, string &$html_buffer, string &$in_quote_char ): void {
        if ( $char === $in_quote_char ) {
            $in_quote_char = ''; // Exiting quote
        } elseif ( $in_quote_char === '' ) {
            $in_quote_char = $char; // Entering quote
        }
        $html_buffer .= $char;
    }

    /**
     * Handles the '-' character in the HTML state.
     *
     * Switches to STATE_COMMENT when this dash completes the comment opener '<!--'.
     */
    private function onDashInHtml( string $char, int &$state, string &$html_buffer ): void {
        // Detect HTML comment opening ('<!--').
        if ( $html_buffer === '<!-' ) {
            $state = static::STATE_COMMENT;
        }
        $html_buffer .= $char;
    }

    /**
     * Handles whitespace characters (space or newline) in the HTML state.
     *
     * Whitespace right after '<' means the '<' was not a tag opener: both characters are sent
     * to the error-correction callback and the parser returns to plain text.
     */
    private function onWhitespaceInHtml( string $char, int &$state, string &$html_buffer, string &$output ): void {
        // Space or newline immediately after '<' is invalid.
        if ( $html_buffer === '<' ) {
            $state       = static::STATE_PLAINTEXT;
            $output      .= $this->_fixWrongBuffer( '<' . $char );
            $html_buffer = '';
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }

            return;
        }
        $html_buffer .= $char;
    }

    /**
     * Handles any other default character in the HTML state.
     *
     * The character is buffered. If it is the last character of the input and the buffer is not
     * a valid tag, the corrected buffer is moved into the plain text buffer.
     */
    private function onDefaultCharInHtml( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer ): void {
        $html_buffer .= $char;
        // End of input: treat buffer as plain text if not a valid tag.
        if ( $idx === ( $charCount - 1 ) && !$this->_isTagValid( $html_buffer ) ) {
            $state             = static::STATE_PLAINTEXT; // Error: not a valid tag
            $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
            $html_buffer       = '';
        }
    }

    /**
     * Handles character processing when in the STATE_COMMENT.
     *
     * Buffers every character until the comment terminator '-->' is seen, then hands the whole
     * comment to _finalizeScriptTag() and returns to plain text.
     */
    private function handleCommentState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;
        // Check for the end of a comment: '-->'
        if ( $char === '>' && substr( $html_buffer, -3 ) === '-->' ) {
            $state       = static::STATE_PLAINTEXT;
            $output      .= $this->_finalizeScriptTag( $html_buffer );
            $html_buffer = '';
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
        }
    }

    /**
     * Handles character processing when in the STATE_JS_CSS.
     *
     * Buffers every character until a literal closing tag ('</script>' or '</style>') ends the
     * buffer, then hands the whole block to _finalizeScriptTag() and returns to plain text.
     * A bare 'script>' / 'style>' suffix (e.g. a '<style>' string literal inside a script, or
     * '</noscript>') does not end the block.
     */
    private function handleJsCssState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        if ( $char !== '>' ) {
            return;
        }

        // Only a real end tag closes the block.
        $closesBlock = substr( $html_buffer, -9 ) === '</script>' || substr( $html_buffer, -8 ) === '</style>';
        if ( !$closesBlock ) {
            return;
        }

        $state = static::STATE_PLAINTEXT;
        // Return value intentionally unused; call kept because the handler may rely on it — TODO confirm.
        $this->_isTagValid( $html_buffer );
        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }

    /**
     * Checks if the buffered HTML is the beginning of a script or style tag.
     *
     * The buffer is inspected before the closing '>' is appended, so it matches when it is exactly
     * '<script' / '<style' or when the tag name is followed by whitespace (attributes). Names that
     * merely start with those words, such as '<scripts' or '<stylesheet', do not match.
     * The comparison is case-sensitive.
     *
     * @param string $html_buffer Tag text collected so far, starting with '<'.
     *
     * @return bool
     */
    private function isScriptOrStyleTag( string $html_buffer ): bool {
        return 1 === preg_match( '/^<(?:script|style)(?:\s|\z)/', $html_buffer );
    }

}
368