|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Created by PhpStorm. |
|
4
|
|
|
* @author domenico [email protected] / [email protected] |
|
5
|
|
|
* Date: 05/11/18 |
|
6
|
|
|
* Time: 15.30 |
|
7
|
|
|
* |
|
8
|
|
|
*/ |
|
9
|
|
|
|
|
10
|
|
|
namespace Matecat\SubFiltering\Filters; |
|
11
|
|
|
|
|
12
|
|
|
use Matecat\SubFiltering\Commons\AbstractHandler; |
|
13
|
|
|
use Matecat\SubFiltering\Enum\ConstantEnum; |
|
14
|
|
|
use Matecat\SubFiltering\Enum\CTypeEnum; |
|
15
|
|
|
use Matecat\SubFiltering\Filters\Html\CallbacksHandler; |
|
16
|
|
|
use Matecat\SubFiltering\Filters\Html\HtmlParser; |
|
17
|
|
|
|
|
18
|
|
|
/**
 * Class XmlToPh
 *
 * Converts XML/HTML tags found inside a string into placeholder tags (<ph>).
 * It relies on an HtmlParser, to which this object is registered as the callbacks handler;
 * the methods below process the different parts of the content (plain text, markup tags,
 * script tags, and buffers that only looked like tags).
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 */
class XmlToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Whether a validated tag matched the HTML5 tag/attribute pattern.
     *
     * Selects the `ctype` written on generated placeholders (HTML when true, XML otherwise).
     *
     * NOTE(review): within this class the flag is only ever switched to true (in _isTagValid())
     * and never reset, so every placeholder produced afterwards by the same instance is
     * labelled HTML as well. Confirm this is the intended behaviour.
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The buffer, unchanged.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Finalizes a markup tag: decodes the entities it contains, then turns it into a placeholder.
     *
     * The literal '<' and '>' characters are swapped with sentinels before decoding and restored
     * afterwards, so only the rest of the tag (typically its attribute values) is decoded.
     *
     * Example:
     *   incoming: <a href="/users/settings?test=123&amp;foobar=1" target="_blank">
     *   decoded:  <a href="/users/settings?test=123&foobar=1" target="_blank">
     * _finalizeTag() then encodes the decoded tag again, so the ampersand ends up encoded only once.
     *
     * @param string $buffer The markup tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // Lock the literal angle brackets behind sentinels.
        $locked = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );

        // Decode XML entities; ENT_NOQUOTES leaves quote entities as they are.
        $decoded = html_entity_decode( $locked, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );

        // Put the angle brackets back.
        $restored = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $decoded );

        return $this->_finalizeTag( $restored );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     *
     * The placeholder id is taken from the pipeline, the `ctype` depends on $isHTML, and the
     * tag itself is stored in the `equiv-text` attribute: entity-encoded first, then base64 encoded.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        $id        = $this->getPipeline()->getNextId();
        $ctype     = $this->isHTML ? CTypeEnum::HTML : CTypeEnum::XML;
        $equivText = base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1 ) );

        return '<ph id="' . $id . '" ctype="' . $ctype . '" equiv-text="base64:' . $equivText . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * Every '<' becomes '&lt;' and every '>' becomes '&gt;', so the returned text can no longer
     * be read as markup.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The buffer with its angle brackets escaped.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * Unlike _finalizeMarkupTag(), the buffer is passed to _finalizeTag() as is, without any
     * entity decoding.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Validates if a given string is a legitimate XML or HTML-like tag.
     *
     * This avoids the pitfalls of simpler tools like `strip_tags`, which can fail with strings
     * such as "3 < 4". The validation is done in steps:
     *  1. The buffer must not contain internal placeholders, which would indicate a partially
     *     processed string and not a single, complete tag.
     *  2. A regular expression checks for a valid tag structure (name, attributes, brackets).
     *  3. For a valid tag, a second regular expression checks whether it is an HTML5 tag with
     *     global attributes; if so, $isHTML is set to true.
     *
     * @param string $buffer The string to validate.
     *
     * @return bool True if the buffer is a valid tag, false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // Safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, the tag's content is not yet
        // restored, so it must not be treated as a valid, final tag.
        // For example, an original string like '<a href="<x/>">' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        /*
         * General XML/HTML tag structure. Accepted tags are:
         *  - optionally starting with /
         *  - NOT starting with a number
         *  - containing at least one of [a-zA-Z0-9\-\._:]
         *  - followed by optional attributes, with double-quoted, single-quoted or unquoted values
         *  - closed by an optional / and >
         *
         * Unicode letters are not accepted in attribute names.
         * @see https://regex101.com/r/fZGsUT/1
         */
        if ( !preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            return false;
        }

        /*
         * HTML5 tag matcher with global attributes.
         *
         *  - Tag names: the HTML5 elements listed in the pattern (structural, text, inline, form,
         *    multimedia, table, script, interactive and miscellaneous tags).
         *  - Opening, closing and self-closing tags are supported.
         *  - Attributes (zero or more):
         *      - standard global attributes: id, class, style, title, lang, dir, hidden, draggable, etc.
         *      - data attributes: data-* with Unicode, emoji, or complex characters
         *      - ARIA attributes: role, aria-*
         *      - event handlers: on*, e.g., onclick, onmouseover
         *      - deprecated XML attributes: xml:lang, xml:base
         *  - Attribute values may be double-quoted, single-quoted or unquoted.
         *
         * Examples of matched tags:
         *   <h1 id="title1" class="main" data-élément-αριθμός="321">
         *   <img src="image.png" alt="Photo" data-info="📸"/>
         *   <div hidden data-属性名123="有效">
         *   <button onclick="alert('Click!')">
         *
         * A match only marks the content as HTML; the tag has already been accepted as valid by
         * the structural check above.
         */
        if ( preg_match( '#<\s*/?\s*(?:html|head|body|header|footer|main|section|article|nav|aside|h1|h2|h3|h4|h5|h6|p|hr|pre|blockquote|ol|ul|li|dl|dt|dd|figure|figcaption|div|a|em|strong|small|s|cite|q|dfn|abbr|ruby|rt|rp|data|time|code|var|samp|kbd|sub|sup|i|b|u|mark|bdi|bdo|span|br|wbr|form|label|input|button|select|datalist|optgroup|option|textarea|output|fieldset|legend|meter|progress|img|audio|video|source|track|picture|map|area|iframe|embed|object|param|table|caption|colgroup|col|tbody|thead|tfoot|tr|td|th|script|noscript|template|canvas|link|style|meta|base|title|details|summary|dialog|menu|menuitem|slot|portal)\b(?:\s+(?:accesskey|class|contenteditable|data-[^\s=]+|dir|draggable|enterkeyhint|hidden|id|inert|inputmode|lang|popover|spellcheck|style|tabindex|title|translate|xml:lang|xml:base|role|aria-[^\s=]+|on\w+)(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?\s*>#ui', $buffer ) ) {
            $this->isHTML = true;
        }

        return true;
    }

    /**
     * Main transformation method.
     *
     * It instantiates an HtmlParser, registers this object as its callbacks handler, and lets the
     * parser process the input segment so that tags are converted to placeholders.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {
        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}