Passed
Pull Request — master (#51)
by Domenico
02:05
created

XmlToPh   A

Complexity

Total Complexity 10

Size/Duplication

Total Lines 140
Duplicated Lines 0 %

Test Coverage

Coverage 95.83%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 19
c 1
b 0
f 0
dl 0
loc 140
ccs 23
cts 24
cp 0.9583
rs 10
wmc 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A _isTagValid() 0 31 4
A _finalizeTag() 0 2 1
A _finalizePlainText() 0 2 1
A _fixWrongBuffer() 0 4 1
A _finalizeHTMLTag() 0 12 1
A transform() 0 5 1
A _finalizeScriptTag() 0 2 1
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author domenico [email protected] / [email protected]
5
 * Date: 05/11/18
6
 * Time: 15.30
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Filters;
11
12
use Matecat\SubFiltering\Commons\AbstractHandler;
13
use Matecat\SubFiltering\Enum\ConstantEnum;
14
use Matecat\SubFiltering\Enum\CTypeEnum;
15
use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
16
use Matecat\SubFiltering\Filters\Html\HtmlParser;
17
18
/**
 * Class XmlToPh
 *
 * Converts XML/HTML-like tags found inside a string into placeholder tags (<ph>).
 * Parsing is delegated to an HtmlParser; this class is registered on it as the callbacks
 * handler and supplies the per-fragment conversions below.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class XmlToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Handles a plain text fragment.
     *
     * Plain text needs no conversion in this filter, so it is handed back untouched.
     *
     * @param string $buffer The plain text fragment.
     *
     * @return string The very same fragment.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles a fragment recognised as an HTML tag and turns it into a <ph> placeholder.
     *
     * Entities inside the tag (typically in attribute values) are decoded once before the tag is stored,
     * because the tag delimiters arrive already decoded while the rest of the tag is still encoded.
     * Example:
     *   incoming: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
     *   decoded:  <a href="/users/settings?test=123&amp;foobar=1" target="_blank">
     *
     * @param string $buffer The HTML tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeHTMLTag( string $buffer ): string {
        $brackets  = [ '<', '>' ];
        $sentinels = [ '#_lt_#', '#_gt_#' ];

        // Lock the literal angle brackets behind sentinels, decode the entities of the rest of the tag
        // (quotes are left as they are), then put the brackets back.
        // NOTE(review): a buffer that already contains the literal sentinel text would have it turned
        // into a bracket by the last step.
        $locked   = str_replace( $brackets, $sentinels, $buffer );
        $decoded  = html_entity_decode( $locked, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );
        $unlocked = str_replace( $sentinels, $brackets, $decoded );

        return $this->_finalizeTag( $unlocked );
    }

    /**
     * Converts a tag string into a <ph> placeholder.
     *
     * The tag is XML-escaped (quotes are left alone), base64 encoded and stored in the 'equiv-text'
     * attribute with a 'base64:' prefix. The placeholder id is taken from the pipeline.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        $id = $this->getPipeline()->getNextId();

        // The charset is pinned to UTF-8, like the decoding done in _finalizeHTMLTag(), so the escaping
        // does not depend on the default_charset ini setting.
        $escaped = htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );

        return '<ph id="' . $id . '" ctype="' . CTypeEnum::XML . '" equiv-text="base64:' . base64_encode( $escaped ) . '"/>';
    }

    /**
     * "Fixes" a buffer that was wrongly taken for a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The buffer with every '<' turned into '&lt;' and every '>' into '&gt;'.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Handles a <script> tag fragment by converting it into a placeholder.
     *
     * No entity decoding is applied here: the buffer goes straight to _finalizeTag().
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Tells whether a buffer looks like a legitimate XML/HTML-like tag.
     *
     * Two conditions must hold:
     * 1. the buffer matches the tag-shaped pattern below;
     * 2. the buffer contains neither ConstantEnum::LTPLACEHOLDER nor ConstantEnum::GTPLACEHOLDER.
     *    Such markers inside something that looks like a tag mean the content is only partially
     *    processed, so it must not be converted into a <ph>.
     *
     * @param string $buffer The string to validate.
     *
     * @return bool True if the buffer is accepted as a tag, false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        /*
         * Shape accepted by the pattern:
         * - '<', optionally followed by '/'
         * - a name made of [a-z0-9\-._:] (case-insensitive) that does NOT start with a digit
         * - zero or more attributes: a name, optionally followed by '=' and a double-quoted,
         *   single-quoted or unquoted value
         * - optional whitespace, an optional '/', then '>'
         *
         * Unicode letters are not accepted in tag or attribute names.
         * The pattern is not anchored, so it succeeds when such a shape occurs anywhere in the buffer.
         * @see https://regex101.com/r/fZGsUT/1
         */
        if ( !preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            return false;
        }

        // Tag-shaped, but still carrying an internal less-than/greater-than marker: reject it.
        return strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) === false
                && strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) === false;

    }

    /**
     * Main transformation method.
     *
     * Creates an HtmlParser, registers this object as its callbacks handler and lets the parser
     * process the segment.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment, as returned by the parser.
     */
    public function transform( string $segment ): string {
        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}