XmlToPh - Code Metrics - Inspection of "Updated to php-7.4" - matecat/subfiltering - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#51)

by Domenico

created 2025-09-08 16:08 UTC

XmlToPh A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	181
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
eloc	22
c	1
b	0
f	0
dl	0
loc	181
ccs	26
cts	26
cp	1
rs	10
wmc	12

7 Methods

Rating	Name	Size	Complexity
A	_isTagValid()	70	5
A	_finalizeMarkupTag()	12	1
A	_finalizeTag()	2	2
A	_finalizePlainText()	2	1
A	_fixWrongBuffer()	4	1
A	transform()	5	1
A	_finalizeScriptTag()	2	1

<?php
/**
 * Created by PhpStorm.
 * @author domenico [email protected] / [email protected]
 * Date: 05/11/18
 * Time: 15.30
 *
 */

namespace Matecat\SubFiltering\Filters;

use Matecat\SubFiltering\Commons\AbstractHandler;
use Matecat\SubFiltering\Enum\ConstantEnum;
use Matecat\SubFiltering\Enum\CTypeEnum;
use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
use Matecat\SubFiltering\Filters\Html\HtmlParser;

/**
 * Class XmlToPh
 *
 * This class converts XML/HTML tags within a string into placeholder tags (<ph>).
 * It uses an HtmlParser with a set of callbacks to process different parts of the content.
 * Every generated <ph> carries the original tag, base64 encoded, in its 'equiv-text' attribute
 * and a 'ctype' telling whether the tag was recognized as HTML or as generic XML.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class XmlToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Whether the tag most recently checked by _isTagValid() matched the HTML5 tag pattern.
     * It selects the 'ctype' written by _finalizeTag().
     */
    protected bool $isHTML = false;

    /**
     * Handles plain text content. Returns the buffer unchanged.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The original buffer.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles and finalizes a markup tag.
     *
     * The tag's entities are decoded once before the tag is turned into a placeholder, while the
     * literal '<' and '>' characters of the tag are swapped with sentinels during the decoding and
     * restored afterwards. For example, an attribute like `href="...?a=1&amp;amp;b=2"`
     * becomes `href="...?a=1&amp;b=2"`.
     *
     * @param string $buffer The tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeMarkupTag( string $buffer ): string {
        // Decode attributes by locking < and > first.
        // Here we get lt and gt already decoded, but not the other parts of the string.
        // Ex:
        // incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
        // this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
        $buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
        $buffer = html_entity_decode( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );
        $buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );

        return $this->_finalizeTag( $buffer );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     * The original tag is entity-encoded and stored, base64 encoded, in the 'equiv-text' attribute.
     * The 'ctype' attribute is CTypeEnum::HTML when $this->isHTML is set, CTypeEnum::XML otherwise.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        $ctype = $this->isHTML ? CTypeEnum::HTML : CTypeEnum::XML;

        // The charset is passed explicitly so the result does not depend on the `default_charset`
        // ini setting and stays consistent with the UTF-8 decoding done in _finalizeMarkupTag().
        $encodedTag = base64_encode( htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' ) );

        return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . $ctype . '" equiv-text="base64:' . $encodedTag . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The fixed string with escaped angle brackets.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        $buffer = str_replace( "<", "&lt;", $buffer );

        return str_replace( ">", "&gt;", $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Validates if a given string is a legitimate XML or HTML-like tag.
     *
     * This method avoids the common pitfalls of simpler tools like `strip_tags`, which can fail
     * with strings such as "3 < 4". The validation has two steps:
     * 1. The string must not contain internal placeholders, which would indicate a partially
     *    processed string and not a single, complete tag.
     * 2. A regular expression checks for a valid tag structure (name, attributes, brackets).
     *
     * As a side effect, $this->isHTML is updated on every call: it is true only when the buffer
     * is a valid tag that also matches the HTML5 pattern, false in every other case.
     *
     * @param string $buffer The string to validate.
     *
     * @return bool True if the buffer is a valid tag, false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        // The HTML/XML classification belongs to the single tag being validated: drop the result
        // left by a previously validated tag, otherwise every tag following the first HTML one
        // would be labelled as HTML too.
        $this->isHTML = false;

        // This is a safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, it means the tag's
        // content is not yet restored, so we must not treat it as a valid, final tag.
        // For example, an original string like '&lt;a href="<x/>"&gt;' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
            return false;
        }

        /*
         * General structure of an XML/HTML tag (case-insensitive, UTF-8 mode, not anchored):
         * - '<' followed by an optional '/'
         * - a tag name made of [a-z0-9\-._:], NOT starting with a number
         * - zero or more attributes whose names are made of [:a-z0-9\-._] (no Unicode letters),
         *   each with an optional double-quoted, single-quoted or unquoted value
         * - an optional '/' and the closing '>'
         *
         * @see https://regex101.com/r/fZGsUT/1
         */
        if ( !preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
            return false;
        }

        /*
         * HTML5 tag matcher.
         *
         * The tag is classified as HTML when:
         * - its name is one of the HTML5 element names listed in the pattern (opening, closing
         *   and self-closing forms are accepted), and
         * - every attribute it carries is one of the listed global attributes:
         *   - standard global attributes: id, class, style, title, lang, dir, hidden, draggable, etc.
         *   - data attributes: data-*, where the name may contain Unicode, emoji or complex characters
         *   - ARIA attributes: role, aria-*
         *   - event handlers: on*, e.g., onclick, onmouseover
         *   - XML attributes: xml:lang, xml:base
         *   with values in double quotes, single quotes, or unquoted.
         *
         * Examples matched by the pattern:
         * <h1 id="title1" class="main" data-élément-αριθμός="321">
         * <div hidden data-属性名123="有效">
         * <button onclick="alert('Click!')">
         *
         * A tag carrying any attribute outside that list (e.g. <a href="...">) does not match and
         * keeps the XML classification.
         */
        $this->isHTML = preg_match( '#<\s*/?\s*(?:html|head|body|header|footer|main|section|article|nav|aside|h1|h2|h3|h4|h5|h6|p|hr|pre|blockquote|ol|ul|li|dl|dt|dd|figure|figcaption|div|a|em|strong|small|s|cite|q|dfn|abbr|ruby|rt|rp|data|time|code|var|samp|kbd|sub|sup|i|b|u|mark|bdi|bdo|span|br|wbr|form|label|input|button|select|datalist|optgroup|option|textarea|output|fieldset|legend|meter|progress|img|audio|video|source|track|picture|map|area|iframe|embed|object|param|table|caption|colgroup|col|tbody|thead|tfoot|tr|td|th|script|noscript|template|canvas|link|style|meta|base|title|details|summary|dialog|menu|menuitem|slot|portal)\b(?:\s+(?:accesskey|class|contenteditable|data-[^\s=]+|dir|draggable|enterkeyhint|hidden|id|inert|inputmode|lang|popover|spellcheck|style|tabindex|title|translate|xml:lang|xml:base|role|aria-[^\s=]+|on\w+)(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?\s*>#ui', $buffer ) === 1;

        return true;
    }

    /**
     * Main transformation method.
     *
     * It instantiates an HtmlParser, registers this class as the callback handler,
     * and processes the input segment to convert tags to placeholders.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment.
     */
    public function transform( string $segment ): string {
        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}

1		<?php
2		/**
3		* Created by PhpStorm.
4		* @author domenico [email protected] / [email protected]
5		* Date: 05/11/18
6		* Time: 15.30
7		*
8		*/
9
10		namespace Matecat\SubFiltering\Filters;
11
12		use Matecat\SubFiltering\Commons\AbstractHandler;
13		use Matecat\SubFiltering\Enum\ConstantEnum;
14		use Matecat\SubFiltering\Enum\CTypeEnum;
15		use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
16		use Matecat\SubFiltering\Filters\Html\HtmlParser;
17
18		/**
19		* Class HtmlToPh
20		*
21		* This class converts HTML tags within a string into placeholder tags (<ph>).
22		* It uses an HtmlParser with a set of callbacks to process different parts of the HTML content.
23		*
24		* @author domenico [email protected] / [email protected]
25		* @package SubFiltering
26		*
27		*/
28		class XmlToPh extends AbstractHandler {
29
30		use CallbacksHandler;
31
32		protected bool $isHTML = false;
33
34		/**
35		* Handles plain text content. Returns the buffer unchanged.
36		*
37		* @param string $buffer The plain text buffer.
38		*
39		* @return string The original buffer.
40		*/
41	95	protected function _finalizePlainText( string $buffer ): string {
42	95	return $buffer;
43		}
44
45		/**
46		* Handles and finalizes an HTML tag.
47		*
48		* This method decodes HTML entities within the tag's attributes while preserving the '<' and '>' characters of the tag itself.
49		* This is necessary to correctly handle encoded attribute values. For example, an attribute like `href="...?a=1&amp;amp;b=2"`
50		* becomes `href="...?a=1&amp;b=2"`.
51		*
52		* @param string $buffer The HTML tag string.
53		*
54		* @return string The generated <ph> placeholder tag.
55		*/
56	19	protected function _finalizeMarkupTag( string $buffer ): string {
57		// Decode attributes by locking < and > first
58		// Because a HTML tag has it's attributes encoded and here we get lt and gt decoded but not other parts of the string
59		// Ex:
60		// incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
61		// this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
62		//
63	19	$buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
64	19	$buffer = html_entity_decode( $buffer, ENT_NOQUOTES | 16 /* ENT_XML1 */, 'UTF-8' );
65	19	$buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );
66
67	19	return $this->_finalizeTag( $buffer );
68
69		}
70
71		/**
72		* Converts a generic tag string into a <ph> placeholder.
73		* The original tag is stored in the 'equiv-text' attribute, base64 encoded.
74		*
75		* @param string $buffer The tag string to convert.
76		*
77		* @return string The resulting <ph> tag.
78		*/
79	21	protected function _finalizeTag( string $buffer ): string {
80	21	return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . ( $this->isHTML ? CTypeEnum::HTML : CTypeEnum::XML ) . '" equiv-text="base64:' . base64_encode( htmlentities( $buffer, ENT_NOQUOTES | 16 /* ENT_XML1 */ ) ) . '"/>';
81		}
82
83		/**
84		* "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
85		*
86		* @param string $buffer The string buffer.
87		*
88		* @return string The fixed string with escaped angle brackets.
89		*/
90	12	protected function _fixWrongBuffer( string $buffer ): string {
91	12	$buffer = str_replace( "<", "&lt;", $buffer );
92
93	12	return str_replace( ">", "&gt;", $buffer );
94		}
95
96		/**
97		* Finalizes a <script> tag by converting it into a placeholder.
98		*
99		* @param string $buffer The script tag string.
100		*
101		* @return string The generated <ph> placeholder tag.
102		*/
103	3	protected function _finalizeScriptTag( string $buffer ): string {
104	3	return $this->_finalizeTag( $buffer );
105		}
106
107		/**
108		* Validates if a given string is a legitimate XML or HTML-like tag.
109		*
110		* This method provides a robust way to identify tags, avoiding the common pitfalls of
111		* simpler tools like `strip_tags` which can fail with strings such as "3 < 4". It uses a
112		* two-step validation process:
113		* 1. A regular expression checks for a valid tag structure (name, attributes, brackets).
114		* 2. A check ensures the string doesn't contain internal placeholders, which would indicate
115		* it's a partially processed string and not a single, complete tag.
116		*
117		* @param string $buffer The string to validate.
118		*
119		* @return bool True if the buffer is a valid tag, false otherwise.
120		*/
121	25	protected function _isTagValid( string $buffer ): bool {
122
123		// This is a safeguard against misinterpreting partially processed strings.
124		// During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
125		// If such placeholders exist within what looks like a tag, it means the tag's
126		// content is not yet restored, so we must not treat it as a valid, final tag.
127		// For example, an original string like '&lt;a href="<x/>"&gt;' could become
128		// '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
129	25	if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
130	2	return false;
131		}
132
133		/*
134		* accept tags start with:
135		* - starting with / ( optional )
136		* - NOT starting with a number
137		* - containing [a-zA-Z0-9\-\._] at least 1
138		* - ending with a letter a-zA-Z0-9 or a quote "' or /
139		*
140		* Not accept Unicode letters in attributes
141		* @see https://regex101.com/r/fZGsUT/1
142		*/
143		// This regex validates the general structure of an XML/HTML tag.
144		// It checks for a valid tag name (not starting with a number), optional attributes
145		// (with quoted or unquoted values), and correct opening/closing brackets.
146	23	if ( preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
147
148		/**
149		* HTML5 Tag Matcher and Global Attribute Parser
150		*
151		* This module provides a comprehensive approach to matching and validating HTML5 elements
152		* with a focus on global attributes, including `data-*` attributes with complex Unicode names.
153		*
154		* Features:
155		* 1. Matches all valid HTML5 tags, including structural, text, inline, form, multimedia, table,
156		* script, interactive, and miscellaneous tags.
157		* 2. Supports opening tags, closing tags, and self-closing tags.
158		* 3. Supports global attributes:
159		* - Standard global attributes: id, class, style, title, lang, dir, hidden, draggable, etc.
160		* - Data attributes: data-* with Unicode, emoji, or complex characters.
161		* - ARIA attributes: role, aria-*.
162		* - Event handlers: on*, e.g., onclick, onmouseover.
163		* - Deprecated XML attributes: xml:lang, xml:base.
164		* 4. Handles attribute values in double quotes, single quotes, or unquoted.
165		* 5. Example usage includes parsing headings (`h1`-`h6`) with multiple global attributes,
166		* as well as other HTML5 elements with complex `data-*` attributes.
167		*
168		* Regex Summary:
169		* - Tag matching: matches all HTML5 tags listed in the specification.
170		* - Attribute matching: matches zero or more global attributes including complex `data-*` names.
171		* - Robust to multiple attributes, whitespace, self-closing tags, and Unicode characters.
172		*
173		* Example HTML matched by the regex:
174		* <h1 id="title1" class="main" data-élément-αριθμός="321">Heading</h1>
175		* <img src="image.png" alt="Photo" data-info="📸"/>
176		* <div hidden data-属性名123="有效">Content</div>
177		* <button onclick="alert('Click!')">Click me</button>
178		*
179		* Notes:
180		* - This regex is intended for validation and parsing in contexts that allow Unicode and extended characters.
181		* - For `dataset` access in JavaScript, `getAttribute` is recommended for attributes with non-ASCII names.
182		*/
183	22	if ( preg_match( '#<\s*/?\s*(?:html|head|body|header|footer|main|section|article|nav|aside|h1|h2|h3|h4|h5|h6|p|hr|pre|blockquote|ol|ul|li|dl|dt|dd|figure|figcaption|div|a|em|strong|small|s|cite|q|dfn|abbr|ruby|rt|rp|data|time|code|var|samp|kbd|sub|sup|i|b|u|mark|bdi|bdo|span|br|wbr|form|label|input|button|select|datalist|optgroup|option|textarea|output|fieldset|legend|meter|progress|img|audio|video|source|track|picture|map|area|iframe|embed|object|param|table|caption|colgroup|col|tbody|thead|tfoot|tr|td|th|script|noscript|template|canvas|link|style|meta|base|title|details|summary|dialog|menu|menuitem|slot|portal)\b(?:\s+(?:accesskey|class|contenteditable|data-[^\s=]+|dir|draggable|enterkeyhint|hidden|id|inert|inputmode|lang|popover|spellcheck|style|tabindex|title|translate|xml:lang|xml:base|role|aria-[^\s=]+|on\w+)(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?\s*>#ui', $buffer ) ) {
184	15	$this->isHTML = true;
185		}
186
187	22	return true;
188		}
189
190	3	return false;
191
192		}
193
194		/**
195		* Main transformation method.
196		*
197		* It instantiates an HtmlParser, registers this class as the callback handler,
198		* and processes the input segment to convert HTML tags to placeholders.
199		*
200		* @param string $segment The input string segment to process.
201		*
202		* @return string The transformed segment.
203		*/
204	95	public function transform( string $segment ): string {
205	95	$parser = new HtmlParser();
206	95	$parser->registerCallbacksHandler( $this );
207
208	95	return $parser->transform( $segment );
209		}
210
211		}

matecat / subfiltering

Pull Request — master (#51)

XmlToPh A

Complexity

Size/Duplication

Test Coverage

Importance

7 Methods

Duplication Side-by-Side

Filter issues like