1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Created by PhpStorm. |
4
|
|
|
* @author domenico [email protected] / [email protected] |
5
|
|
|
* Date: 05/11/18 |
6
|
|
|
* Time: 15.30 |
7
|
|
|
* |
8
|
|
|
*/ |
9
|
|
|
|
10
|
|
|
namespace Matecat\SubFiltering\Filters; |
11
|
|
|
|
12
|
|
|
use Matecat\SubFiltering\Commons\AbstractHandler; |
13
|
|
|
use Matecat\SubFiltering\Enum\ConstantEnum; |
14
|
|
|
use Matecat\SubFiltering\Enum\CTypeEnum; |
15
|
|
|
use Matecat\SubFiltering\Filters\Html\CallbacksHandler; |
16
|
|
|
use Matecat\SubFiltering\Filters\Html\HtmlParser; |
17
|
|
|
|
18
|
|
|
/**
 * Class MarkupToPh
 *
 * Converts markup tags (HTML or XML) found inside a string into placeholder tags (<ph>).
 * The heavy lifting is done by HtmlParser, which scans the string and calls back into
 * this class (through the CallbacksHandler trait) for every piece it recognizes:
 * plain text, candidate tags, script tags and buffers wrongly taken for tags.
 *
 * @author  domenico
 * @package SubFiltering
 */
class MarkupToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Set to true by _isTagValid() when the last validated tag matched the HTML5 pattern.
     * It is read and reset by _finalizeTag() to choose the ctype of the generated placeholder.
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The buffer, unchanged.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Finalizes a markup tag by decoding its entities and converting it into a placeholder.
     *
     * Entities inside the tag (typically in attribute values) are decoded once, so that,
     * for example, `href="...?a=1&amp;b=2"` becomes `href="...?a=1&b=2"`. The angle brackets
     * already present in the buffer are swapped for temporary markers while decoding and
     * put back afterwards.
     *
     * @param string $buffer The tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // Lock the existing '<' and '>' behind temporary markers, decode the remaining
        // entities, then restore the brackets. At this point the brackets of the tag are
        // already literal, while the rest of the tag (its attributes) is still encoded.
        $buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
        $buffer = html_entity_decode( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );
        $buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );

        return $this->_finalizeTag( $buffer );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     *
     * The tag is entity-encoded and then base64-encoded into the 'equiv-text' attribute.
     * The placeholder id is taken from the pipeline, and the ctype is HTML or XML depending
     * on the flag left by the last _isTagValid() call; that flag is reset here.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        // Consume the flag so it cannot leak into the next tag.
        $isHTML       = $this->isHTML;
        $this->isHTML = false;

        return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . ( $isHTML ? CTypeEnum::HTML : CTypeEnum::XML ) . '" equiv-text="base64:' . base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1 ) ) . '"/>';
    }

    /**
     * Fixes a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * Every '<' becomes '&lt;' and every '>' becomes '&gt;', so the buffer is emitted as
     * text instead of markup.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The buffer with escaped angle brackets.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * Unlike _finalizeMarkupTag(), the buffer is passed on without decoding its entities.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Tells whether the given buffer should be treated as a real tag.
     *
     * The checks are applied in this order:
     *  1. Buffers containing the internal angle-bracket placeholders
     *     (ConstantEnum::LTPLACEHOLDER / ConstantEnum::GTPLACEHOLDER) are rejected.
     *  2. Buffers matching the HTML5 tag pattern are accepted and flagged as HTML.
     *  3. Buffers matching the stricter generic XML tag pattern are accepted.
     * Anything else is rejected.
     *
     * Side effect: sets $this->isHTML to true when the HTML5 pattern matches.
     *
     * @param string $buffer The string representation of a tag to be validated.
     *
     * @return bool True if the tag is considered valid; false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // Safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, the tag content has not
        // been restored yet, so it must not be treated as a valid, final tag.
        // For example, an original string like '<a href="<x/>">' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        // HTML5 check.
        //
        // The pattern matches an opening, closing or self-closing tag whose name is one of the
        // listed HTML5 element names, followed by any number of attributes. An attribute may
        // have no value, or a value in double quotes, in single quotes, or unquoted.
        // The match is case-insensitive and not anchored, so the tag may appear anywhere
        // in the buffer.
        //
        // Examples of matched tags:
        //  - <div class="example" data-info="123">
        //  - <img src="image.png" alt="Image" />
        //  - <button onclick="alert('Click!')">
        //
        // @see https://regex101.com/r/o546zS/2
        if ( preg_match( '#</?(?:a|abbr|address|area|article|aside|audio|b|base|bdi|bdo|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dialog|div|dl|dt|em|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|head|header|hr|html|i|iframe|img|input|ins|kbd|label|legend|li|link|main|map|mark|menu|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|picture|pre|progress|q|rb|rp|rt|rtc|ruby|s|samp|script|section|select|slot|small|source|span|strong|style|sub|summary|sup|table|tbody|td|template|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr)(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            // Remember the match so _finalizeTag() can mark the placeholder as HTML.
            $this->isHTML = true;

            return true;
        }

        // Generic XML check, stricter than the HTML5 one.
        //
        //  - The tag may optionally start with a '/' character.
        //  - The tag name must not start with a digit or a hyphen.
        //  - The tag name is made of letters, digits, hyphens (-), dots (.), underscores (_)
        //    and colons (:), and has at least one character.
        //  - Every attribute must have an equal sign and a quoted value (single or double quotes).
        //  - The tag may be self-closing.
        //
        // For the attribute grammar see https://www.w3.org/TR/xml/#NT-Attribute
        //
        // @see https://regex101.com/r/hsk9KU/4
        if ( preg_match( '#</?(?![0-9\-]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+=(?:"[^"]*"|\'[^\']*\'))*\s*/?>#ui', $buffer ) ) {
            return true;
        }

        return false;
    }

    /**
     * Main transformation method.
     *
     * Turns the encoded angle brackets (&lt; and &gt;) of the segment back into literal
     * '<' and '>', then runs an HtmlParser with this object registered as its callbacks
     * handler, so that the tags found in the segment are converted to placeholders.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {

        // Restore < and > so the parser can recognize the tags.
        $segment = str_replace( [ '&lt;', '&gt;' ], [ '<', '>' ], $segment );

        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}