Passed
Pull Request — master (#51)
by Domenico
01:51
created

XmlToPh   A

Complexity

Total Complexity 12

Size/Duplication

Total Lines 181
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 22
c 1
b 0
f 0
dl 0
loc 181
ccs 26
cts 26
cp 1
rs 10
wmc 12

7 Methods

Rating   Name   Duplication   Size   Complexity  
A _isTagValid() 0 70 5
A _finalizeMarkupTag() 0 12 1
A _finalizeTag() 0 2 2
A _finalizePlainText() 0 2 1
A _fixWrongBuffer() 0 4 1
A transform() 0 5 1
A _finalizeScriptTag() 0 2 1
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Enum\ConstantEnum;
14
use Matecat\SubFiltering\Enum\CTypeEnum;
15
use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
16
use Matecat\SubFiltering\Filters\Html\HtmlParser;
17
18
/**
 * Class XmlToPh
 *
 * This class converts XML/HTML tags within a string into placeholder tags (<ph>).
 * It uses an HtmlParser with a set of callbacks (the protected methods below) to process
 * the different parts of the content.
 *
 * Every generated placeholder carries:
 *  - an `id` taken from the pipeline,
 *  - a `ctype` of CTypeEnum::HTML or CTypeEnum::XML,
 *  - the original tag, entity-escaped and base64 encoded, in `equiv-text`.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class XmlToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Whether the buffer most recently checked by _isTagValid() matched the HTML5 pattern.
     * Read by _finalizeTag() to choose the placeholder `ctype`.
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content. Returns the buffer unchanged.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The original buffer.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles and finalizes a markup tag.
     *
     * The tag is entity-decoded once before being turned into a placeholder, so that
     * double-encoded attribute values lose one level of encoding. For example, an attribute like
     * `href="...?a=1&amp;amp;b=2"` becomes `href="...?a=1&amp;b=2"`.
     *
     * @param string $buffer The tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // The literal '<' and '>' are swapped for marker tokens while decoding and restored afterwards.
        // Ex:
        // incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
        // this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
        $buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
        $buffer = html_entity_decode( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );
        $buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );

        return $this->_finalizeTag( $buffer );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     * The original tag is stored in the 'equiv-text' attribute, entity-escaped and base64 encoded.
     * The `ctype` attribute is CTypeEnum::HTML when $this->isHTML is true, CTypeEnum::XML otherwise.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        $ctype = $this->isHTML ? CTypeEnum::HTML : CTypeEnum::XML;

        // The charset is passed explicitly (as in _finalizeMarkupTag) so the result does not depend
        // on the `default_charset` ini setting.
        $equivText = base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' ) );

        return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . $ctype . '" equiv-text="base64:' . $equivText . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The fixed string with '<' and '>' replaced by '&lt;' and '&gt;'.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * NOTE(review): this method does not call _isTagValid(), so the `ctype` of the placeholder
     * is whatever $this->isHTML currently holds - confirm this is the intended classification.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Validates if a given string is a legitimate XML or HTML-like tag.
     *
     * The validation is done in two steps:
     * 1. The buffer must not contain the internal less-than/greater-than placeholders, which would
     *    indicate a partially processed string and not a single, complete tag.
     * 2. A regular expression checks for a valid tag structure (name, attributes, brackets).
     *
     * As a side effect, $this->isHTML is recomputed on every call: it is true only when the buffer
     * is accepted AND also matches the stricter HTML5 pattern, false in every other case.
     *
     * @param string $buffer The string to validate.
     *
     * @return bool True if the buffer is a valid tag, false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // The flag must describe the current buffer only. Without this reset, one HTML tag would
        // make every following tag handled by this instance be labelled as HTML.
        $this->isHTML = false;

        // This is a safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, it means the tag's
        // content is not yet restored, so we must not treat it as a valid, final tag.
        // For example, an original string like '&lt;a href="<x/>"&gt;' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        /*
         * Generic XML/HTML tag structure (case-insensitive, not anchored to the buffer boundaries):
         * - '<' followed by an optional '/'
         * - a tag name made of [a-z0-9\-._:] that does NOT start with a digit
         * - zero or more attributes: whitespace, a name made of [:a-z0-9\-._], and an optional
         *   '=' followed by a double-quoted, single-quoted or unquoted value
         * - optional whitespace, an optional '/', then '>'
         *
         * Attribute names are limited to the ASCII characters listed above.
         * @see https://regex101.com/r/fZGsUT/1
         */
        if ( !preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            return false;
        }

        /*
         * HTML5 classification (case-insensitive, not anchored to the buffer boundaries).
         *
         * The buffer is labelled as HTML when it contains an opening, closing or self-closing tag whose:
         * - name is one of the HTML5 element names listed in the pattern, and
         * - attributes (if any) are ALL taken from this list:
         *   accesskey, class, contenteditable, data-*, dir, draggable, enterkeyhint, hidden, id, inert,
         *   inputmode, lang, popover, spellcheck, style, tabindex, title, translate, xml:lang, xml:base,
         *   role, aria-*, on* (event handlers).
         *   The part after `data-` / `aria-` may contain any character except whitespace and '=',
         *   so Unicode names are accepted there.
         * - attribute values are double-quoted, single-quoted or unquoted.
         *
         * Examples that match:
         * <h1 id="title1" class="main" data-élément-αριθμός="321">
         * <div hidden data-属性名123="有效">
         * <button onclick="alert('Click!')">
         * </a>
         *
         * Examples that are valid tags but do NOT match (they are labelled as XML), because they carry
         * an attribute outside the list above:
         * <a href="/home">
         * <img src="image.png" alt="Photo"/>
         */
        $this->isHTML = preg_match( '#<\s*/?\s*(?:html|head|body|header|footer|main|section|article|nav|aside|h1|h2|h3|h4|h5|h6|p|hr|pre|blockquote|ol|ul|li|dl|dt|dd|figure|figcaption|div|a|em|strong|small|s|cite|q|dfn|abbr|ruby|rt|rp|data|time|code|var|samp|kbd|sub|sup|i|b|u|mark|bdi|bdo|span|br|wbr|form|label|input|button|select|datalist|optgroup|option|textarea|output|fieldset|legend|meter|progress|img|audio|video|source|track|picture|map|area|iframe|embed|object|param|table|caption|colgroup|col|tbody|thead|tfoot|tr|td|th|script|noscript|template|canvas|link|style|meta|base|title|details|summary|dialog|menu|menuitem|slot|portal)\b(?:\s+(?:accesskey|class|contenteditable|data-[^\s=]+|dir|draggable|enterkeyhint|hidden|id|inert|inputmode|lang|popover|spellcheck|style|tabindex|title|translate|xml:lang|xml:base|role|aria-[^\s=]+|on\w+)(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?\s*>#ui', $buffer ) === 1;

        return true;
    }

    /**
     * Main transformation method.
     *
     * It instantiates an HtmlParser, registers this class as the callback handler,
     * and processes the input segment to convert tags to placeholders.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {
        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}