MarkupToPh::_fixWrongBuffer()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 4
ccs 3
cts 3
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Enum\ConstantEnum;
14
use Matecat\SubFiltering\Enum\CTypeEnum;
15
use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
16
use Matecat\SubFiltering\Filters\Html\HtmlParser;
17
18
/**
 * Class MarkupToPh
 *
 * Converts markup tags (HTML or generic XML) found in a string into placeholder tags (<ph>).
 * The segment is scanned by an HtmlParser, which is handed this object as its callbacks handler;
 * the protected methods below are the callbacks that decide what happens to each piece of input.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class MarkupToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Set to true by _isTagValid() when the last validated buffer matched the HTML5 pattern.
     * _finalizeTag() reads it to choose the placeholder ctype and then resets it to false.
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content. Plain text needs no conversion, so it is returned untouched.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The same buffer, unchanged.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles and finalizes a markup tag.
     *
     * The tag arrives with its angle brackets already literal, but with the rest of its content
     * (typically attribute values) still entity-encoded. One level of encoding is removed here
     * before the tag is turned into a placeholder. For example, an attribute like
     * `href="...?a=1&amp;amp;b=2"` becomes `href="...?a=1&amp;b=2"`.
     *
     * @param string $buffer The markup tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // Ex:
        // incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
        // this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
        //
        // html_entity_decode() only rewrites entity references; literal '<' and '>' characters pass
        // through untouched. They therefore do not need to be swapped for sentinel strings around the
        // call, and avoiding the sentinels means a tag that legitimately contains the text '#_lt_#' or
        // '#_gt_#' is no longer corrupted into '<' / '>'.
        // ENT_NOQUOTES leaves quote entities encoded, so attribute delimiters are not affected.
        $decoded = html_entity_decode( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );

        return $this->_finalizeTag( $decoded );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     *
     * The tag is entity-encoded (XML 1 entity set, quotes left as they are), base64 encoded and
     * stored in the 'equiv-text' attribute. The 'ctype' attribute is CTypeEnum::HTML when the
     * isHTML flag is set, CTypeEnum::XML otherwise. The flag is always cleared by this call.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        // Consume the flag: read it once, then reset it so it cannot leak to the next tag.
        $ctype        = $this->isHTML ? CTypeEnum::HTML : CTypeEnum::XML;
        $this->isHTML = false;

        // The encoding is passed explicitly so the result does not depend on the
        // `default_charset` ini setting and stays consistent with the decoding
        // performed in _finalizeMarkupTag().
        $encodedTag = base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' ) );

        return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . $ctype . '" equiv-text="base64:' . $encodedTag . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The buffer with every '<' replaced by '&lt;' and every '>' by '&gt;'.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ "<", ">" ], [ "&lt;", "&gt;" ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     * No entity decoding is applied: the buffer goes straight to _finalizeTag().
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Decides whether a buffer should be treated as a real tag.
     *
     * The checks are applied in this order:
     * 1. Buffers containing placeholder markers (ConstantEnum::LTPLACEHOLDER / ConstantEnum::GTPLACEHOLDER) are rejected.
     * 2. Buffers matching the HTML5 tag pattern are accepted and the isHTML flag is raised.
     * 3. Buffers matching the stricter, generic XML tag pattern are accepted (isHTML is not touched).
     * Anything else is rejected.
     *
     * Both patterns are unanchored, so a match anywhere inside the buffer is enough.
     *
     * @param string $buffer The string representation of a tag to be validated.
     *
     * @return bool Returns true if the tag is considered valid; false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // This is a safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, it means the tag's
        // content is not yet restored, so we must not treat it as a valid, final tag.
        // For example, an original string like '&lt;a href="<x/>"&gt;' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        // HTML5 check.
        //
        // The pattern accepts an opening, closing or self-closing tag whose name is one of the
        // listed HTML5 element names, followed by any number of attributes. An attribute is a name
        // made of letters, digits, ':', '-', '.', '_' with an optional value that may be
        // double-quoted, single-quoted or unquoted. Matching is case-insensitive.
        //
        // Examples of tags matched:
        // - `<div class="example" data-info="123">`
        // - `<img src="image.png" alt="Image" />`
        // - `<button onclick="alert('Click!')">`
        //
        // @see https://regex101.com/r/o546zS/2
        if ( preg_match( '#</?(?:a|abbr|address|area|article|aside|audio|b|base|bdi|bdo|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dialog|div|dl|dt|em|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|head|header|hr|html|i|iframe|img|input|ins|kbd|label|legend|li|link|main|map|mark|menu|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|picture|pre|progress|q|rb|rp|rt|rtc|ruby|s|samp|script|section|select|slot|small|source|span|strong|style|sub|summary|sup|table|tbody|td|template|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr)(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            // Remember that this tag is HTML; _finalizeTag() uses and then clears this flag.
            $this->isHTML = true;

            return true;
        }

        // Generic XML check, stricter than the HTML5 one.
        //
        // - The tag may optionally start with a '/' character.
        // - The tag name must NOT start with a digit or a hyphen.
        // - The tag name has at least one character, taken from letters, digits, '-', '.', '_' and ':'.
        // - Every attribute must have a value, introduced by '=' and enclosed in single or double quotes
        //   (valueless and unquoted attributes are not accepted, unlike in the HTML5 pattern).
        // - Optional whitespace and an optional '/' may precede the closing '>'.
        //
        // Only ASCII letters are listed in the name character classes.
        // For more details, see the XML specification: https://www.w3.org/TR/xml/#NT-Attribute
        //
        // @see https://regex101.com/r/hsk9KU/4
        if ( preg_match( '#</?(?![0-9\-]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+=(?:"[^"]*"|\'[^\']*\'))*\s*/?>#ui', $buffer ) ) {
            return true;
        }

        return false;
    }

    /**
     * Main transformation method.
     *
     * It turns the encoded angle brackets of the segment back into literal ones, instantiates an
     * HtmlParser, registers this object as its callbacks handler and lets the parser process the segment.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {

        // Restore '<' and '>' (the replacements are applied in this order: first &lt;, then &gt;).
        $segment = str_replace( [ "&lt;", "&gt;" ], [ "<", ">" ], $segment );

        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}