1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Created by PhpStorm. |
4
|
|
|
* @author domenico [email protected] / [email protected] |
5
|
|
|
* Date: 05/11/18 |
6
|
|
|
* Time: 15.30 |
7
|
|
|
* |
8
|
|
|
*/ |
9
|
|
|
|
10
|
|
|
namespace Matecat\SubFiltering\Filters\Html; |
11
|
|
|
|
12
|
|
|
use Matecat\SubFiltering\Commons\AbstractHandler; |
13
|
|
|
use Matecat\SubFiltering\Commons\Pipeline; |
14
|
|
|
use ReflectionException; |
15
|
|
|
use ReflectionMethod; |
16
|
|
|
use RuntimeException; |
17
|
|
|
|
18
|
|
|
/** |
19
|
|
|
* HtmlParser |
20
|
|
|
* |
21
|
|
|
* A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments |
22
|
|
|
* in a given input string. It processes segments statefully, validates potential HTML tags, |
23
|
|
|
* and invokes handler callbacks for fragment finalization and error correction. |
24
|
|
|
* |
25
|
|
|
* Usage: |
26
|
|
|
* - Register a callback handler (must consume CallbacksHandler trait). |
27
|
|
|
* - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state. |
28
|
|
|
* |
29
|
|
|
* State Machine: |
30
|
|
|
* - STATE_PLAINTEXT: Outside any tag, collecting plain text. |
31
|
|
|
* - STATE_HTML: Inside angle brackets `<...>`, potentially a tag. |
32
|
|
|
* - STATE_COMMENT: Inside a comment `<!-- ... -->`. |
33
|
|
|
* - STATE_JS_CSS: Inside <script> or <style> tags. |
34
|
|
|
* |
35
|
|
|
* Callbacks: |
36
|
|
|
* The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization, |
37
|
|
|
* HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection. |
38
|
|
|
* |
39
|
|
|
* @author domenico [email protected] / [email protected] |
40
|
|
|
* @package Matecat\SubFiltering\Filters\Html |
41
|
|
|
* |
42
|
|
|
* @method _isTagValid( string $buffer ) Validate whether $buffer is a correct HTML tag. |
43
|
|
|
* @method _finalizeMarkupTag( string $buffer ) Handle completion of a valid HTML tag. |
44
|
|
|
* @method _fixWrongBuffer( string $buffer ) Correct and process abnormal tag-like input. |
45
|
|
|
* @method _finalizeScriptTag( string $buffer ) Finalize a <script>/<style> or comment content. |
46
|
|
|
* @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far. |
47
|
|
|
* @method _setSegmentContainsMarkup() Set a flag on the parent pipeline that HTML has been found. |
48
|
|
|
*/ |
49
|
|
|
class HtmlParser {

    /** Parser states for input processing (plaintext, HTML tag, comment, or script/style). */
    const STATE_PLAINTEXT = 0;
    const STATE_HTML      = 1;
    const STATE_COMMENT   = 2;
    const STATE_JS_CSS    = 3;

    /**
     * Processing pipeline. Markup-detection flags are only raised when it is not null.
     * It is overwritten with the handler's own pipeline by registerCallbacksHandler().
     *
     * @var Pipeline|null
     */
    private ?Pipeline $pipeline;

    /**
     * The handler object providing the callback implementations (must use the CallbacksHandler trait).
     * Uninitialized until registerCallbacksHandler() has been called.
     *
     * @var AbstractHandler
     */
    protected AbstractHandler $callbacksHandler;

    /**
     * HtmlParser constructor.
     *
     * @param Pipeline|null $pipeline Optional pipeline; replaced once a callbacks handler is registered.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }

    /**
     * Registers the handler whose callbacks are invoked during parsing and adopts its pipeline.
     *
     * The CallbacksHandler trait may be used by the handler class itself or by any of its
     * ancestors: the whole inheritance chain is inspected, not only the direct parent.
     *
     * @param AbstractHandler $class Handler implementing the required callbacks.
     *
     * @throws RuntimeException If neither the handler nor any ancestor uses the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        // class_uses() returns a name => name map, so the union operator merges without duplicates.
        $traits = class_uses( $class ) ?: [];
        foreach ( class_parents( $class ) ?: [] as $ancestor ) {
            $traits += class_uses( $ancestor ) ?: [];
        }

        if ( !in_array( CallbacksHandler::class, $traits, true ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }

    /**
     * Forwards calls to undefined methods (the `_xxx` callbacks) to the registered callbacks handler.
     * Non-public handler methods are made accessible through reflection before being invoked.
     *
     * Only the first element of $arguments is forwarded; when no argument is given, null is passed.
     *
     * @param string   $name      Method name to invoke on the handler.
     * @param string[] $arguments Single-element arguments array for the handler callback.
     *
     * @return mixed Return value from the handler's method.
     * @throws ReflectionException If the method cannot be found/reflected on the handler.
     */
    public function __call( string $name, array $arguments = [] ) {

        $reflector = new ReflectionMethod( $this->callbacksHandler, $name );

        // Callbacks are usually protected/private on the handler: open them up for invocation.
        if ( !$reflector->isPublic() ) {
            $reflector->setAccessible( true );
        }

        return $reflector->invoke( $this->callbacksHandler, $arguments[ 0 ] ?? null );
    }

    /**
     * Parses an input segment character by character, distinguishing plain text, HTML tags,
     * comments and <script>/<style> blocks, and builds the output from the handler callbacks.
     *
     * An empty string is returned when the segment cannot be split into characters
     * (empty input, or preg_split() failing and returning false).
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string The processed segment, with tags and text handled by the callbacks.
     */
    public function transform( string $segment ): string {
        // Split input into Unicode codepoints for accurate char-by-char iteration.
        $originalSplit = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );
        if ( empty( $originalSplit ) ) {
            return '';
        }

        $state             = static::STATE_PLAINTEXT;
        $html_buffer       = '';
        $plain_text_buffer = '';
        $in_quote_char     = '';
        $output            = '';
        $charCount         = count( $originalSplit );

        foreach ( $originalSplit as $idx => $char ) {
            switch ( $state ) {
                case static::STATE_PLAINTEXT:
                    $this->handlePlainTextState( $char, $state, $html_buffer, $plain_text_buffer, $output );
                    break;
                case static::STATE_HTML:
                    $this->handleHtmlState( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer, $output, $in_quote_char );
                    break;
                case static::STATE_COMMENT:
                    $this->handleCommentState( $char, $state, $html_buffer, $output );
                    break;
                case static::STATE_JS_CSS:
                    $this->handleJsCssState( $char, $state, $html_buffer, $output );
                    break;
            }
        }

        // Unterminated markup at the end of input (open tag, comment or script/style block):
        // hand the raw buffer to the error-correction callback so its content is preserved.
        if ( '' !== $html_buffer ) {
            if ( $this->_isTagValid( $html_buffer ) && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // Any trailing plain text: finalize it.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }

    /**
     * Handles one character while in STATE_PLAINTEXT.
     *
     * @param string $char              Current character.
     * @param int    $state             Parser state (updated by reference).
     * @param string $html_buffer       Markup buffer (updated by reference).
     * @param string $plain_text_buffer Plain text collected so far (updated by reference).
     * @param string $output            Accumulated output (updated by reference).
     */
    private function handlePlainTextState( string $char, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output ): void {
        switch ( $char ) {
            case '<':
                // Potential new tag starts; finalize the plain text collected so far.
                $state             = static::STATE_HTML;
                $html_buffer       .= $char;
                $output            .= $this->_finalizePlainText( $plain_text_buffer );
                $plain_text_buffer = '';
                break;
            case '>':
                // Unescaped '>' in plain text; route it through the error-correction callback.
                $plain_text_buffer .= $this->_fixWrongBuffer( $char );
                break;
            default:
                // Collect as plain text.
                $plain_text_buffer .= $char;
                break;
        }
    }

    /**
     * Handles one character while in STATE_HTML by dispatching to the on*InHtml() helper
     * matching the character. All parser state is passed in by reference, not held on the object.
     *
     * @param string $char              Current character.
     * @param int    $idx               Index of the current character in the split input.
     * @param int    $charCount         Total number of characters in the split input.
     * @param int    $state             Parser state (updated by reference).
     * @param string $html_buffer       Markup buffer (updated by reference).
     * @param string $plain_text_buffer Plain text buffer (updated by reference).
     * @param string $output            Accumulated output (updated by reference).
     * @param string $in_quote_char     Currently open quote character, or '' (updated by reference).
     */
    private function handleHtmlState( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer, string &$output, string &$in_quote_char ): void {
        switch ( $char ) {
            case '<':
                $this->onLessThanInHtml( $char, $output, $html_buffer );
                break;
            case '>':
                $this->onGreaterThanInHtml( $char, $state, $html_buffer, $output, $in_quote_char );
                break;
            case '"':
            case '\'':
                $this->onQuoteInHtml( $char, $html_buffer, $in_quote_char );
                break;
            case '-':
                $this->onDashInHtml( $char, $state, $html_buffer );
                break;
            case ' ':
            case "\n":
                $this->onWhitespaceInHtml( $char, $state, $html_buffer, $output );
                break;
            default:
                $this->onDefaultCharInHtml( $char, $idx, $charCount, $state, $html_buffer, $plain_text_buffer );
                break;
        }
    }

    /**
     * Handles a '<' met while already inside a potential tag.
     *
     * A second '<' means the first one did not open a tag: the buffered text is passed to the
     * error-correction callback and a new buffer is started from the current character.
     * See https://www.w3.org/TR/xml/#charsets
     */
    private function onLessThanInHtml( string $char, string &$output, string &$html_buffer ): void {
        $output      .= $this->_fixWrongBuffer( $html_buffer );
        $html_buffer = $char;
    }

    /**
     * Handles a '>' in the HTML state, i.e. the end of the current tag.
     *
     * An opening <script>/<style> tag switches to STATE_JS_CSS and keeps buffering. Any other
     * buffer is validated, then finalized as markup or passed to the error-correction callback.
     * Note that $in_quote_char is only reset here: a '>' inside a quoted attribute value
     * still closes the tag.
     */
    private function onGreaterThanInHtml( string $char, int &$state, string &$html_buffer, string &$output, string &$in_quote_char ): void {
        if ( $this->isScriptOrStyleTag( $html_buffer ) ) {
            $html_buffer .= $char;
            $state       = static::STATE_JS_CSS;

            return;
        }

        $in_quote_char = '';
        $state         = static::STATE_PLAINTEXT;
        $html_buffer   .= $char;

        if ( $this->_isTagValid( $html_buffer ) ) {
            $output .= $this->_finalizeMarkupTag( $html_buffer );
            if ( null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }
        } else {
            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        $html_buffer = '';
    }

    /**
     * Handles quote characters ('"' or "'") in the HTML state, tracking which quote is open.
     * A quote of the other kind met while one is already open is buffered without changing the tracker.
     */
    private function onQuoteInHtml( string $char, string &$html_buffer, string &$in_quote_char ): void {
        if ( $char === $in_quote_char ) {
            $in_quote_char = ''; // Exiting quote
        } elseif ( $in_quote_char === '' ) {
            $in_quote_char = $char; // Entering quote
        }

        $html_buffer .= $char;
    }

    /**
     * Handles the '-' character in the HTML state: '<!-' followed by '-' opens a comment.
     */
    private function onDashInHtml( string $char, int &$state, string &$html_buffer ): void {
        if ( $html_buffer === '<!-' ) {
            $state = static::STATE_COMMENT;
        }

        $html_buffer .= $char;
    }

    /**
     * Handles a space or newline in the HTML state.
     *
     * Whitespace immediately after '<' cannot start a tag: '<' plus the whitespace is passed to
     * the error-correction callback, the parser returns to plain text, and the markup flag is
     * raised when a pipeline is available. Otherwise the whitespace is simply buffered.
     */
    private function onWhitespaceInHtml( string $char, int &$state, string &$html_buffer, string &$output ): void {
        if ( $html_buffer !== '<' ) {
            $html_buffer .= $char;

            return;
        }

        $state       = static::STATE_PLAINTEXT;
        $output      .= $this->_fixWrongBuffer( '<' . $char );
        $html_buffer = '';
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }

    /**
     * Handles any other character in the HTML state by buffering it.
     *
     * On the very last input character, a buffer that is not a valid tag is passed to the
     * error-correction callback and moved to the plain text buffer.
     */
    private function onDefaultCharInHtml( string $char, int $idx, int $charCount, int &$state, string &$html_buffer, string &$plain_text_buffer ): void {
        $html_buffer .= $char;

        $isLastChar = $idx === ( $charCount - 1 );
        if ( $isLastChar && !$this->_isTagValid( $html_buffer ) ) {
            $state             = static::STATE_PLAINTEXT; // Error: not a valid tag
            $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
            $html_buffer       = '';
        }
    }

    /**
     * Handles one character while in STATE_COMMENT; the comment ends on '-->'.
     * The whole comment is finalized through the script-tag callback.
     */
    private function handleCommentState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        if ( $char !== '>' || substr( $html_buffer, -3 ) !== '-->' ) {
            return;
        }

        $state       = static::STATE_PLAINTEXT;
        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }

    /**
     * Handles one character while in STATE_JS_CSS; the block ends on '</script>' or '</style>'.
     *
     * Only a real closing tag terminates the block, so a '<script>', '<noscript>' or an escaped
     * '<\/script>' occurring inside the script/style body does not end it prematurely.
     */
    private function handleJsCssState( string $char, int &$state, string &$html_buffer, string &$output ): void {
        $html_buffer .= $char;

        if ( $char !== '>' ) {
            return;
        }

        $closesBlock = substr( $html_buffer, -9 ) === '</script>' || substr( $html_buffer, -8 ) === '</style>';
        if ( !$closesBlock ) {
            return;
        }

        $state = static::STATE_PLAINTEXT;
        // NOTE(review): the result is discarded; kept in case the handler's validator has
        // side effects that later callbacks rely on — confirm against the handler.
        $this->_isTagValid( $html_buffer );
        $output      .= $this->_finalizeScriptTag( $html_buffer );
        $html_buffer = '';
        if ( null !== $this->pipeline ) {
            $this->_setSegmentContainsMarkup();
        }
    }

    /**
     * Checks whether the buffered markup (without its closing '>') opens a script or style tag.
     *
     * The tag name must be exactly 'script' or 'style' (case-sensitive) and be followed either by
     * the end of the buffer or by whitespace, so tags carrying attributes are recognized while
     * longer names such as '<styles' are not.
     *
     * @param string $html_buffer Buffered tag text starting at '<'.
     *
     * @return bool True when the buffer opens a <script> or <style> element.
     */
    private function isScriptOrStyleTag( string $html_buffer ): bool {
        return 1 === preg_match( '/^<(?:script|style)(?:\s|\z)/', $html_buffer );
    }

}
368
|
|
|
|