Passed
Push — master ( eead79...194803 )
by Domenico
02:03
created

HtmlParser::onDashInHtml()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
c 1
b 0
f 0
nc 2
nop 3
dl 0
loc 6
ccs 4
cts 4
cp 1
crap 2
rs 10
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters\Html;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Commons\Pipeline;
14
use ReflectionException;
15
use ReflectionMethod;
16
use RuntimeException;
17
18
/**
 * HtmlParser
 *
 * A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments
 * in a given input string. It processes segments statefully, validates potential HTML tags,
 * and invokes handler callbacks for fragment finalization and error correction.
 *
 * Usage:
 *      - Register a callback handler (must consume CallbacksHandler trait).
 *      - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state.
 *
 * State Machine:
 *      - STATE_PLAINTEXT: Outside any tag, collecting plain text.
 *      - STATE_HTML:      Inside angle brackets `<...>`, potentially a tag.
 *      - STATE_COMMENT:   Inside a comment `<!-- ... -->`.
 *      - STATE_JS_CSS:    Inside <script> or <style> tags.
 *
 * Callbacks:
 *      The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization,
 *      HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection.
 *      The callbacks listed below are not defined on this class: they are forwarded to the registered
 *      handler through __call().
 *
 * @author  domenico [email protected] / [email protected]
 * @package Matecat\SubFiltering\Filters\Html
 *
 * @method _isTagValid( string $buffer )            Validate whether $buffer is a correct HTML tag.
 * @method _finalizeMarkupTag( string $buffer )        Handle completion of a valid HTML tag.
 * @method _fixWrongBuffer( string $buffer )         Correct and process abnormal tag-like input.
 * @method _finalizeScriptTag( string $buffer )      Finalize a <script>/<style> or comment content.
 * @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far.
 * @method _setSegmentContainsMarkup()                Set a flag on the parent pipeline that HTML has been found.
 */
class HtmlParser {

    /** Parser states for input processing (plaintext, HTML tag, comment, or script/style). */
    public const STATE_PLAINTEXT = 0;
    public const STATE_HTML      = 1;
    public const STATE_COMMENT   = 2;
    public const STATE_JS_CSS    = 3;

    /**
     * Processing pipeline; used for HTML presence flagging.
     * When null, the parser never calls _setSegmentContainsMarkup().
     * @var Pipeline|null
     */
    private ?Pipeline $pipeline;

    /**
     * The handler object providing callback implementations (must use CallbacksHandler trait).
     * It is only set by registerCallbacksHandler(), so a handler must be registered before transform() is called.
     * @var AbstractHandler
     */
    protected AbstractHandler $callbacksHandler;

    /**
     * HtmlParser constructor.
     *
     * @param Pipeline|null $pipeline Initial pipeline; replaced by the handler's own pipeline
     *                                once registerCallbacksHandler() is called.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }

    /**
     * Registers a handler for callbacks invoked during parsing.
     * The handler must use the CallbacksHandler trait (ensured at runtime). The trait may be declared
     * on the handler's own class or on any of its ancestors.
     *
     * @param AbstractHandler $class Handler implementing required callbacks.
     *
     * @throws RuntimeException If the handler does not use the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        // class_uses() only reports traits declared directly on the class it is given,
        // so collect them for the handler class and for every ancestor.
        $usedTraits = class_uses( $class ) ?: [];
        foreach ( class_parents( $class ) ?: [] as $ancestor ) {
            $usedTraits += class_uses( $ancestor ) ?: [];
        }

        if ( !in_array( CallbacksHandler::class, $usedTraits, true ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        // From now on, the handler's pipeline replaces the one given to the constructor.
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }

    /**
     * Magic invoker for protected/private methods on the registered callbacks handler.
     * This enables the parser to call non-public handler methods at runtime,
     * supporting encapsulation of callback logic.
     *
     * Only the first element of $arguments is forwarded (null when absent).
     *
     * @param string   $name      Method name to invoke.
     * @param string[] $arguments Single-element arguments array for handler callback.
     *
     * @return mixed             Return value from the handler's method.
     * @throws ReflectionException If a method cannot be found/reflected.
     */
    public function __call( string $name, array $arguments = [] ) {

        // Reflect the requested method on the callbacks handler.
        $reflector = new ReflectionMethod( $this->callbacksHandler, $name );

        // Non-public callbacks must be made accessible before they can be invoked from here.
        if ( !$reflector->isPublic() ) {
            $reflector->setAccessible( true );
        }

        return $reflector->invoke( $this->callbacksHandler, $arguments[ 0 ] ?? null );
    }

    /**
     * Parses and transforms an input string segment, differentiating between
     * plain text, HTML tags, comments, and <script>/<style> blocks.
     * Sanitizes invalid tags, finalizes detected segments via callbacks, and
     * collects a normalized string (with external handler support).
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string         The processed segment, with tags and text handled appropriately.
     *                        An empty string is returned when the segment cannot be split into characters.
     */
    public function transform( string $segment ): string {
        // Split input into Unicode codepoints for accurate char-by-char iteration.
        // An empty segment yields an empty array and a failed split yields false: both end up here.
        $originalSplit = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( empty( $originalSplit ) ) {
            return '';
        }

        $state             = static::STATE_PLAINTEXT;
        $html_buffer       = '';
        $plain_text_buffer = '';
        $in_quote_char     = '';
        $output            = '';
        $charCount         = count( $originalSplit );

        // Each state handler receives the shared buffers by reference and updates them in place.
        foreach ( $originalSplit as $idx => $char ) {
            switch ( $state ) {
                case static::STATE_PLAINTEXT:
                    $this->handlePlainTextState( $char, $state, $html_buffer, $plain_text_buffer, $output );
                    break;
                case static::STATE_HTML:
                    $this->handleHtmlState( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer, $output, $in_quote_char );
                    break;
                case static::STATE_COMMENT:
                    $this->handleCommentState( $char, $state, $html_buffer, $output );
                    break;
                case static::STATE_JS_CSS:
                    $this->handleJsCssState( $char, $state, $html_buffer, $output );
                    break;
            }
        }

        // HTML Partial at the end, treat as invalid and preserve the string content.
        // A non-empty buffer always starts with '<', so comparing against '' is sufficient.
        if ( '' !== $html_buffer ) {
            if ( $this->_isTagValid( $html_buffer ) && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // Any trailing plain text: finalize it.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }

    /**
     * Handles character processing when in the STATE_PLAINTEXT.
     *
     * @param string $char              Current character.
     * @param int    $state             Parser state; switched to STATE_HTML on '<'.
     * @param string $html_buffer       Tag buffer; receives the opening '<'.
     * @param string $plain_text_buffer Plain text collected so far.
     * @param string $output            Accumulated result.
     */
    private function handlePlainTextState( string $char, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output ): void {
        switch ( $char ) {
            case '<':
                // Potential new tag starts; finalize plain text so far.
                $state             = static::STATE_HTML;
                $html_buffer       .= $char;
                $output            .= $this->_finalizePlainText( $plain_text_buffer );
                $plain_text_buffer = '';
                break;
            case '>':
                // Unescaped '>' in plaintext; treat as literal via error handling.
                $plain_text_buffer .= $this->_fixWrongBuffer( $char );
                break;
            default:
                // Collect as plain text.
                $plain_text_buffer .= $char;
                break;
        }
    }

    /**
     * Handles character processing when in the STATE_HTML.
     * This method acts as a dispatcher based on the character: the parser state variables
     * (state, buffers, output, current quote char) are received by reference and passed on
     * to the specific on*InHtml() handler.
     *
     * @param string $char              Current character.
     * @param int    $idx               Zero-based position of $char in the split input.
     * @param int    $charCount         Total number of characters in the split input.
     * @param int    $state             Parser state.
     * @param string $html_buffer       Tag buffer collected since the opening '<'.
     * @param string $plain_text_buffer Plain text collected so far.
     * @param string $output            Accumulated result.
     * @param string $in_quote_char     Quote character currently open, or '' when outside quotes.
     */
    private function handleHtmlState( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output, string &$in_quote_char ): void {
        switch ( $char ) {
            case '<':
                $this->onLessThanInHtml( $char, $output, $html_buffer );
                break;
            case '>':
                $this->onGreaterThanInHtml( $char, $state, $html_buffer, $output, $in_quote_char );
                break;
            case '"':
            case '\'':
                $this->onQuoteInHtml( $char, $html_buffer, $in_quote_char );
                break;
            case '-':
                $this->onDashInHtml( $char, $state, $html_buffer );
                break;
            case ' ':
            case "\n":
                $this->onWhitespaceInHtml( $char, $state, $html_buffer, $output );
                break;
            default:
                $this->onDefaultCharInHtml( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer );
                break;
        }
    }

    /**
     * Handles the '<' character in the HTML state.
     */
    private function onLessThanInHtml( string $char, string &$output, string &$html_buffer ): void {
        // If we found a second less than symbol, the first one IS NOT a tag.
        // See https://www.w3.org/TR/xml/#charsets
        $output      .= $this->_fixWrongBuffer( $html_buffer );
        // Restart the tag buffer from the new '<'; the state stays STATE_HTML.
        $html_buffer = $char;
    }

    /**
     * Handles the '>' character in the HTML state.
     */
    private function onGreaterThanInHtml( string $char, int &$state, string &$html_buffer, string &$output, string &$in_quote_char ): void {
        // End of current tag. Special-case for <script> or <style> blocks:
        // keep buffering until the matching closing tag is seen.
        if ( $this->isScriptOrStyleTag( $html_buffer ) ) {
            $html_buffer .= $char;
            $state       = static::STATE_JS_CSS;

            return;
        }

        $in_quote_char = '';
        $state         = static::STATE_PLAINTEXT;
        $html_buffer   .= $char;

        // Validate and finalize HTML tag. Invalid tags are corrected/errors handled.
        if ( $this->_isTagValid( $html_buffer ) ) {
            $output .= $this->_finalizeMarkupTag( $html_buffer );
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
        } else {
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }
        $html_buffer = '';
    }

    /**
     * Handles quote characters ('"' or "'") in the HTML state.
     */
    private function onQuoteInHtml( string $char, string &$html_buffer, string &$in_quote_char ): void {
        // Track entry/exit into quoted attributes.
        if ( $char === $in_quote_char ) {
            $in_quote_char = ''; // Exiting quote
        } elseif ( $in_quote_char === '' ) {
            $in_quote_char = $char; // Entering quote
        }
        // A different quote char inside an open quote is just content: tracking is left untouched.
        $html_buffer .= $char;
    }

    /**
     * Handles the '-' character in the HTML state.
     */
    private function onDashInHtml( string $char, int &$state, string &$html_buffer ): void {
        // Detect HTML comment opening ('<!--'): this dash is the second one after '<!'.
        if ( $html_buffer === '<!-' ) {
            $state = static::STATE_COMMENT;
        }
        $html_buffer .= $char;
    }

    /**
     * Handles whitespace characters in the HTML state.
     */
    private function onWhitespaceInHtml( string $char, int &$state, string &$html_buffer, string &$output ): void {
        // Space or newline immediately after '<' is invalid.
        if ( $html_buffer === '<' ) {
            $state       = static::STATE_PLAINTEXT;
            $output      .= $this->_fixWrongBuffer( '<' . $char );
            $html_buffer = '';
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }

            return;
        }
        $html_buffer .= $char;
    }

    /**
     * Handles any other default character in the HTML state.
     */
    private function onDefaultCharInHtml( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer ): void {
        $html_buffer .= $char;
        // End of input: treat buffer as plain text if not a valid tag.
        if ( $idx === ( $charCount - 1 ) && !$this->_isTagValid( $html_buffer ) ) {
            $state             = static::STATE_PLAINTEXT; // Error: not a valid tag
            $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
            $html_buffer       = '';
        }
    }

    /**
     * Handles character processing when in the STATE_COMMENT.
     */
    private function handleCommentState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;
        // Check for the end of a comment: '-->'
        if ( $char === '>' && substr( $html_buffer, -3 ) === '-->' ) {
            $state       = static::STATE_PLAINTEXT;
            // Comments are finalized through the same callback used for script/style blocks.
            $output      .= $this->_finalizeScriptTag( $html_buffer );
            $html_buffer = '';
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
        }
    }

    /**
     * Handles character processing when in the STATE_JS_CSS.
     */
    private function handleJsCssState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;
        // Detect close: e.g., '</script>' or '</style>'.
        // Only the last six characters are compared, so any buffer ending in 'cript>' or 'style>' closes the block.
        if ( $char === '>' ) {
            if ( in_array( substr( $html_buffer, -6 ), [ 'cript>', 'style>' ], true ) ) {
                $state = static::STATE_PLAINTEXT;
                // NOTE(review): the result is not used here; the call is kept unchanged in case the
                // handler's validation has side effects — confirm against the handler implementation.
                $this->_isTagValid( $html_buffer );
                $output      .= $this->_finalizeScriptTag( $html_buffer );
                $html_buffer = '';
                if ( null !== $this->pipeline ) {
                    $this->_setSegmentContainsMarkup();
                }
            }
        }
    }

    /**
     * Checks if the buffered HTML is the beginning of a script or style tag.
     *
     * The buffer is the tag text collected before its closing '>'. It matches when it is exactly
     * '<script' / '<style', or when the tag name is followed by whitespace (attributes).
     * The comparison is case-sensitive, like the closing-tag detection in handleJsCssState().
     *
     * @param string $html_buffer Tag text collected so far, without the closing '>'.
     *
     * @return bool True when the buffer opens a <script> or <style> block.
     */
    private function isScriptOrStyleTag( string $html_buffer ): bool {
        // The name must be followed by whitespace or by the end of the buffer, so that
        // longer names sharing the prefix (e.g. '<scripts') are not matched.
        return 1 === preg_match( '/^<(?:script|style)(?:\s|\z)/', $html_buffer );
    }

}
368