|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Created by PhpStorm. |
|
4
|
|
|
* @author domenico [email protected] / [email protected] |
|
5
|
|
|
* Date: 05/11/18 |
|
6
|
|
|
* Time: 15.30 |
|
7
|
|
|
* |
|
8
|
|
|
*/ |
|
9
|
|
|
|
|
10
|
|
|
namespace Matecat\SubFiltering\Filters\Html; |
|
11
|
|
|
|
|
12
|
|
|
use Matecat\SubFiltering\Commons\AbstractHandler; |
|
13
|
|
|
use Matecat\SubFiltering\Commons\Pipeline; |
|
14
|
|
|
use ReflectionException; |
|
15
|
|
|
use ReflectionMethod; |
|
16
|
|
|
use RuntimeException; |
|
17
|
|
|
|
|
18
|
|
|
/** |
|
19
|
|
|
* HtmlParser |
|
20
|
|
|
* |
|
21
|
|
|
* A robust HTML/text parsing utility that distinguishes between plaintext, HTML, comments, and script/style segments |
|
22
|
|
|
* in a given input string. It processes segments statefully, validates potential HTML tags, |
|
23
|
|
|
* and invokes handler callbacks for fragment finalization and error correction. |
|
24
|
|
|
* |
|
25
|
|
|
* Usage: |
|
26
|
|
|
* - Register a callback handler (must consume CallbacksHandler trait). |
|
27
|
|
|
* - Call transform() to process a segment and convert its contents into a safe, normalized, and well-formed state. |
|
28
|
|
|
* |
|
29
|
|
|
* State Machine: |
|
30
|
|
|
* - STATE_PLAINTEXT: Outside any tag, collecting plain text. |
|
31
|
|
|
* - STATE_HTML: Inside angle brackets `<...>`, potentially a tag. |
|
32
|
|
|
* - STATE_COMMENT: Inside a comment `<!-- ... -->`. |
|
33
|
|
|
* - STATE_JS_CSS: Inside <script> or <style> tags. |
|
34
|
|
|
* |
|
35
|
|
|
* Callbacks: |
|
36
|
|
|
* The handler passed in registerCallbacksHandler must implement tag validation, plain text finalization, |
|
37
|
|
|
* HTML tag finalization, error correction, comment/script handling, and flagging for HTML content detection. |
|
38
|
|
|
* |
|
39
|
|
|
* @author domenico [email protected] / [email protected] |
|
40
|
|
|
* @package Matecat\SubFiltering\Filters\Html |
|
41
|
|
|
* |
|
42
|
|
|
* @method _isTagValid( string $buffer ) Validate whether $buffer is a correct HTML tag. |
|
43
|
|
|
* @method _finalizeMarkupTag( string $buffer ) Handle completion of a valid HTML tag. |
|
44
|
|
|
* @method _fixWrongBuffer( string $buffer ) Correct and process abnormal tag-like input. |
|
45
|
|
|
* @method _finalizeScriptTag( string $buffer ) Finalize a <script>/<style> or comment content. |
|
46
|
|
|
* @method _finalizePlainText( string $plain_buffer ) Finalize plain text collected so far. |
|
47
|
|
|
* @method _setSegmentContainsMarkup() Set a flag on the parent pipeline that HTML has been found. |
|
48
|
|
|
*/ |
|
49
|
|
|
class HtmlParser {

    /** Parser states for input processing (plaintext, HTML tag, comment, or script/style). */
    const STATE_PLAINTEXT = 0;
    const STATE_HTML      = 1;
    const STATE_COMMENT   = 2;
    const STATE_JS_CSS    = 3;

    /**
     * Processing pipeline; when not null, the parser asks the handler to flag detected markup.
     * It is replaced by the handler's own pipeline in registerCallbacksHandler().
     *
     * @var Pipeline|null
     */
    private ?Pipeline $pipeline;

    /**
     * The handler object providing callback implementations (must use CallbacksHandler trait).
     *
     * @var AbstractHandler
     */
    protected AbstractHandler $callbacksHandler;

    /**
     * HtmlParser constructor.
     *
     * @param Pipeline|null $pipeline Optional pipeline; overwritten once a handler is registered.
     */
    public function __construct( ?Pipeline $pipeline = null ) {
        $this->pipeline = $pipeline;
    }

    /**
     * Registers the handler whose callbacks are invoked during parsing and adopts its pipeline.
     *
     * The handler class, or any class it inherits from, must use the CallbacksHandler trait.
     *
     * @param AbstractHandler $class Handler implementing required callbacks.
     *
     * @throws RuntimeException If the handler does not use the CallbacksHandler trait.
     */
    public function registerCallbacksHandler( AbstractHandler $class ) {
        // The throw stays in this method so that __METHOD__ keeps naming the public entry point.
        if ( !$this->usesCallbacksHandlerTrait( $class ) ) {
            throw new RuntimeException( "Class passed to " . __METHOD__ . " must use " . CallbacksHandler::class . " trait." );
        }

        $this->callbacksHandler = $class;
        $this->pipeline         = $this->callbacksHandler->getPipeline();
    }

    /**
     * Tells whether the handler's class or one of its ancestors uses the CallbacksHandler trait.
     *
     * The whole inheritance chain is inspected, not only the direct parent, so a handler that
     * receives the trait from a grandparent class is accepted as well.
     *
     * @param AbstractHandler $handler Handler instance to inspect.
     *
     * @return bool True when the trait is found somewhere in the class lineage.
     */
    private function usesCallbacksHandlerTrait( AbstractHandler $handler ): bool {
        $lineage = array_merge( [ get_class( $handler ) ], array_values( class_parents( $handler ) ?: [] ) );

        foreach ( $lineage as $className ) {
            if ( in_array( CallbacksHandler::class, class_uses( $className ) ?: [], true ) ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Magic invoker that forwards unknown method calls to the registered callbacks handler,
     * including its protected/private methods (resolved through reflection).
     *
     * Only the first element of $arguments is forwarded (null when absent).
     *
     * @param string   $name      Method name to invoke.
     * @param string[] $arguments Single-element arguments array for handler callback.
     *
     * @return mixed Return value from the handler's method.
     * @throws ReflectionException If a method cannot be found/reflected.
     */
    public function __call( string $name, array $arguments = [] ) {

        $reflector = new ReflectionMethod( $this->callbacksHandler, $name );

        // Since PHP 8.1 every reflected method is invokable and setAccessible() has no effect
        // (it is deprecated as of PHP 8.5), so it is only needed on older runtimes.
        if ( PHP_VERSION_ID < 80100 && !$reflector->isPublic() ) {
            $reflector->setAccessible( true );
        }

        return $reflector->invoke( $this->callbacksHandler, $arguments[ 0 ] ?? null );
    }

    /**
     * Tells whether a tag buffer (without its closing '>') opens a <script> or <style> element.
     *
     * The buffer must be exactly the tag name or the tag name followed by a space, so that
     * look-alike names such as "<styles" or "<scripts" are not matched.
     *
     * @param string $buffer Tag text collected so far, starting with '<'.
     *
     * @return bool
     */
    private function opensScriptOrStyleBlock( string $buffer ): bool {
        foreach ( [ '<script', '<style' ] as $opener ) {
            $withSpace = $opener . ' ';
            if ( $buffer === $opener || strncmp( $buffer, $withSpace, strlen( $withSpace ) ) === 0 ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Parses and transforms an input string segment, differentiating between
     * plain text, HTML tags, comments, and <script>/<style> blocks.
     * Each completed fragment is handed to the registered handler's callbacks and the
     * values they return are concatenated into the result.
     *
     * @param string $segment The input string to parse and transform.
     *
     * @return string The processed segment, with tags and text handled appropriately.
     */
    public function transform( string $segment ): string {

        // Split input into Unicode codepoints for accurate char-by-char iteration.
        $originalSplit = preg_split( '//u', $segment, -1, PREG_SPLIT_NO_EMPTY );

        if ( false === $originalSplit ) {
            // preg_split() fails on malformed UTF-8. Every delimiter the state machine looks at
            // is ASCII, so iterating bytes keeps the content instead of silently dropping it.
            $originalSplit = ( '' === $segment ) ? [] : str_split( $segment );
        }

        $lastIdx = count( $originalSplit ) - 1;

        $state             = static::STATE_PLAINTEXT;
        $html_buffer       = '';
        $plain_text_buffer = '';
        // NOTE: the open-quote character is tracked while inside a tag, but it is not consulted
        // when deciding whether '<' or '>' terminates the tag.
        $in_quote_char = '';
        $output        = '';

        foreach ( $originalSplit as $idx => $char ) {

            if ( $state === static::STATE_PLAINTEXT ) {
                switch ( $char ) {
                    case '<':
                        // Potential new tag starts; finalize plain text so far.
                        $state             = static::STATE_HTML;
                        $html_buffer       .= $char;
                        $output            .= $this->_finalizePlainText( $plain_text_buffer );
                        $plain_text_buffer = '';
                        break;

                    case '>':
                        // Unescaped '>' in plaintext; treat as literal via error handling.
                        $plain_text_buffer .= $this->_fixWrongBuffer( $char );
                        break;

                    default:
                        // Collect as plain text.
                        $plain_text_buffer .= $char;
                        break;
                }
            } elseif ( $state === static::STATE_HTML ) {
                // Inside potential HTML tag
                switch ( $char ) {
                    case '<':
                        // If we found a second less than symbol, the first one IS NOT a tag,
                        // treat the html_buffer as plain text and attach to the output.
                        // For more info see https://www.w3.org/TR/xml/#charsets
                        $output      .= $this->_fixWrongBuffer( $html_buffer );
                        $html_buffer = $char;
                        break;

                    case '>':
                        // End of current tag. Special-case for <script> or <style> blocks:
                        // keep buffering until their closing tag is found.
                        if ( $this->opensScriptOrStyleBlock( $html_buffer ) ) {
                            $html_buffer .= $char;
                            $state       = static::STATE_JS_CSS;
                            break;
                        }

                        // This is closing the tag in html_buffer
                        $in_quote_char = '';
                        $state         = static::STATE_PLAINTEXT;
                        $html_buffer   .= $char;

                        // Validate and finalize HTML tag. Invalid tags are corrected/errors handled.
                        if ( $this->_isTagValid( $html_buffer ) ) {
                            $output .= $this->_finalizeMarkupTag( $html_buffer );
                            if ( null !== $this->pipeline ) {
                                // Mark the segment as containing HTML if required.
                                $this->_setSegmentContainsMarkup();
                            }
                        } else {
                            $output .= $this->_fixWrongBuffer( $html_buffer );
                        }
                        $html_buffer = '';
                        break;

                    case '"':
                    case '\'':
                        // Track entry/exit into quoted attributes inside tag (<tag attr="...">).
                        // Catch both single and double quotes
                        if ( $char === $in_quote_char ) {
                            $in_quote_char = '';
                        } elseif ( '' === $in_quote_char ) {
                            $in_quote_char = $char;
                        }

                        $html_buffer .= $char;
                        break;

                    case '-':
                        // Detect HTML comment opening ('<!--').
                        if ( $html_buffer === '<!-' ) {
                            $state = static::STATE_COMMENT;
                        }

                        $html_buffer .= $char;
                        break;

                    case ' ':  // Space or
                    case "\n": // newline (double-quoted: '\n' would be a literal backslash + n)
                        if ( $html_buffer === '<' ) {
                            // Lone '<' followed by whitespace: not a tag, emit it as text
                            // together with the whitespace character that followed it.
                            $state       = static::STATE_PLAINTEXT; // But we work in XML text, so encode it
                            $output      .= $this->_fixWrongBuffer( '<' . $char );
                            $html_buffer = '';

                            if ( null !== $this->pipeline ) {
                                $this->_setSegmentContainsMarkup();
                            }

                            break;
                        }

                        $html_buffer .= $char;
                        break;

                    default:
                        $html_buffer .= $char;

                        // End of input: treat buffer as plain text if not a valid tag.
                        if ( $idx === $lastIdx && !$this->_isTagValid( $html_buffer ) ) {
                            $state             = static::STATE_PLAINTEXT; // Error: not a valid tag
                            $plain_text_buffer .= $this->_fixWrongBuffer( $html_buffer );
                            $html_buffer       = '';
                        }

                        break;
                }
            } elseif ( $state === static::STATE_COMMENT ) {
                // In an HTML comment block
                $html_buffer .= $char;

                // Check for the end of a comment: '-->'
                if ( $char === '>' && substr( $html_buffer, -3 ) === '-->' ) {
                    // Close the comment
                    $state       = static::STATE_PLAINTEXT;
                    $output      .= $this->_finalizeScriptTag( $html_buffer );
                    $html_buffer = '';

                    if ( null !== $this->pipeline ) {
                        $this->_setSegmentContainsMarkup();
                    }
                }
            } elseif ( $state === static::STATE_JS_CSS ) {
                // In a <script> or <style> tag block (until closing tag)
                $html_buffer .= $char;

                // Detect close: e.g., '</script>' or '</style>'
                if ( $char === '>' && in_array( substr( $html_buffer, -6 ), [ 'cript>', 'style>' ], true ) ) {
                    // Close the script/style block
                    $state = static::STATE_PLAINTEXT;
                    // The handler's validation callback is still invoked; its result is not used here.
                    $this->_isTagValid( $html_buffer );
                    $output      .= $this->_finalizeScriptTag( $html_buffer );
                    $html_buffer = '';

                    if ( null !== $this->pipeline ) {
                        $this->_setSegmentContainsMarkup();
                    }
                }
            }
        }

        // HTML partial at the end, treat as invalid and preserve the string content
        if ( '' !== $html_buffer ) {

            if ( $this->_isTagValid( $html_buffer ) && null !== $this->pipeline ) {
                $this->_setSegmentContainsMarkup();
            }

            $output .= $this->_fixWrongBuffer( $html_buffer );
        }

        // Any trailing plain text: finalize it.
        if ( '' !== $plain_text_buffer ) {
            $output .= $this->_finalizePlainText( $plain_text_buffer );
        }

        return $output;
    }
}
|
320
|
|
|
|