|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Created by PhpStorm. |
|
4
|
|
|
* @author domenico [email protected] / [email protected] |
|
5
|
|
|
* Date: 05/11/18 |
|
6
|
|
|
* Time: 15.30 |
|
7
|
|
|
* |
|
8
|
|
|
*/ |
|
9
|
|
|
|
|
10
|
|
|
namespace Matecat\SubFiltering\Filters; |
|
11
|
|
|
|
|
12
|
|
|
use Matecat\SubFiltering\Commons\AbstractHandler; |
|
13
|
|
|
use Matecat\SubFiltering\Enum\ConstantEnum; |
|
14
|
|
|
use Matecat\SubFiltering\Enum\CTypeEnum; |
|
15
|
|
|
use Matecat\SubFiltering\Filters\Html\CallbacksHandler; |
|
16
|
|
|
use Matecat\SubFiltering\Filters\Html\HtmlParser; |
|
17
|
|
|
|
|
18
|
|
|
/**
 * Class MarkupToPh
 *
 * This class converts markup tags (HTML or XML) within a string into placeholder tags (<ph>).
 * It uses an HtmlParser with a set of callbacks to process different parts of the content.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class MarkupToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Set to true by _isTagValid() when the validated buffer matches a known HTML5 element;
     * read and reset to false by _finalizeTag(), where it selects the ctype of the placeholder.
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content. Returns the buffer unchanged.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The original buffer.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles and finalizes a markup tag.
     *
     * The tag is passed once through html_entity_decode() so that exactly one level of entity
     * encoding is removed from its attribute values, then it is converted into a <ph> placeholder.
     * For example, `href="...?a=1&amp;amp;b=2"` becomes `href="...?a=1&amp;b=2"`.
     *
     * @param string $buffer The markup tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // Decode attributes by locking < and > first.
        // The angle brackets of the tag arrive already decoded, while the rest of the string
        // (the attribute values) still carries one extra level of entity encoding.
        // Ex:
        // incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
        // this should be:  <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
        //
        $buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
        $buffer = html_entity_decode( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );
        $buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );

        return $this->_finalizeTag( $buffer );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     * The original tag is entity-encoded, base64 encoded and stored in the 'equiv-text' attribute.
     * The 'ctype' attribute is CTypeEnum::HTML when the last call to _isTagValid() recognized an
     * HTML5 element, CTypeEnum::XML otherwise.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        // Consume the flag: it must not leak into the next tag that gets finalized.
        $isHTML       = $this->isHTML;
        $this->isHTML = false;

        $id        = $this->getPipeline()->getNextId();
        $ctype     = $isHTML ? CTypeEnum::HTML : CTypeEnum::XML;
        // The charset is passed explicitly so the encoding mirrors the html_entity_decode() call
        // in _finalizeMarkupTag() and does not depend on the default_charset ini setting.
        $equivText = base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' ) );

        return '<ph id="' . $id . '" ctype="' . $ctype . '" equiv-text="base64:' . $equivText . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The fixed string with '<' and '>' replaced by '&lt;' and '&gt;'.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Validates a given tag string based on specific criteria for HTML5 and XML tags.
     *
     * The method determines whether a given tag string is valid by:
     * 1. Ensuring there are no placeholder markers (ConstantEnum::LTPLACEHOLDER, ConstantEnum::GTPLACEHOLDER).
     * 2. Matching against a list of HTML5 element names and a permissive attribute structure.
     * 3. Falling back to a stricter validation for generic XML tag structures.
     *
     * Side effect: sets $this->isHTML to true when step 2 matches.
     *
     * @param string $buffer The string representation of a tag to be validated.
     *
     * @return bool Returns true if the tag is considered valid; false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // This is a safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, it means the tag's
        // content is not yet restored, so we must not treat it as a valid, final tag.
        // For example, an original string like '<a href="<x/>">' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        // HTML5 check.
        //
        // - Matches opening, closing, and self-closing tags whose name is one of the listed HTML5 elements.
        // - Attributes are optional; an attribute may have no value, or a value in double quotes,
        //   in single quotes, or unquoted.
        // - Case-insensitive and UTF-8 aware (modifiers "ui").
        //
        // Examples of tags matched by the regex:
        // - `<div class="example" data-info="123">`
        // - `<img src="image.png" alt="Image" />`
        // - `<button onclick="alert('Click!')">`
        //
        // @see https://regex101.com/r/o546zS/2
        if ( preg_match( '#</?(?:a|abbr|address|area|article|aside|audio|b|base|bdi|bdo|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dialog|div|dl|dt|em|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|head|header|hr|html|i|iframe|img|input|ins|kbd|label|legend|li|link|main|map|mark|menu|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|picture|pre|progress|q|rb|rp|rt|rtc|ruby|s|samp|script|section|select|slot|small|source|span|strong|style|sub|summary|sup|table|tbody|td|template|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr)(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            // Remember the match so that _finalizeTag() emits ctype HTML for this tag.
            $this->isHTML = true;

            return true;
        }

        // Generic XML check, stricter than the HTML5 one.
        //
        // - The tag may optionally start with a '/' character.
        // - The tag name must NOT start with a number or a hyphen.
        // - The tag name can only contain ASCII alphanumeric characters, hyphens (-), dots (.),
        //   underscores (_) and colons (:), and must have at least one character.
        // - Attributes must be defined with an equal sign and a quoted value (single or double quotes).
        // - Unicode letters in element and attribute names are not allowed.
        //
        // For more details, see the XML specification: https://www.w3.org/TR/xml/#NT-Attribute
        //
        // @see https://regex101.com/r/hsk9KU/4
        if ( preg_match( '#</?(?![0-9\-]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+=(?:"[^"]*"|\'[^\']*\'))*\s*/?>#ui', $buffer ) ) {
            return true;
        }

        return false;
    }

    /**
     * Main transformation method.
     *
     * It turns the '&lt;' and '&gt;' entities of the segment back into literal angle brackets,
     * instantiates an HtmlParser, registers this class as the callback handler,
     * and lets the parser process the segment to convert markup tags to placeholders.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {

        // Restore &lt; and &gt; to literal < and > so the parser can recognize the tags.
        $segment = str_replace( [ '&lt;', '&gt;' ], [ '<', '>' ], $segment );

        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}