XmlToPh - Code Metrics - Inspection of "Updated to php-7.4" - matecat/subfiltering - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#51)

by Domenico

created 2025-09-05 18:27 UTC

XmlToPh A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	140
Duplicated Lines	0 %

Test Coverage

Coverage

95.83%

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
eloc	19
c	1
b	0
f	0
dl	0
loc	140
ccs	23
cts	24
cp	0.9583
rs	10
wmc	10

7 Methods

Rating	Name	Size	Complexity
A	_isTagValid()	31	4
A	_finalizeTag()	2	1
A	_finalizePlainText()	2	1
A	_fixWrongBuffer()	4	1
A	_finalizeHTMLTag()	12	1
A	transform()	5	1
A	_finalizeScriptTag()	2	1

<?php
/**
 * Created by PhpStorm.
 * @author domenico [email protected] / [email protected]
 * Date: 05/11/18
 * Time: 15.30
 *
 */

namespace Matecat\SubFiltering\Filters;

use Matecat\SubFiltering\Commons\AbstractHandler;
use Matecat\SubFiltering\Enum\ConstantEnum;
use Matecat\SubFiltering\Enum\CTypeEnum;
use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
use Matecat\SubFiltering\Filters\Html\HtmlParser;

/**
 * Class XmlToPh
 *
 * This class converts HTML tags within a string into placeholder tags (<ph>).
 * It uses an HtmlParser with a set of callbacks to process different parts of the HTML content.
 *
 * @author  domenico [email protected] / [email protected]
 * @package SubFiltering
 *
 */
class XmlToPh extends AbstractHandler {

    use CallbacksHandler;

    /**
     * Handles plain text content found between tags.
     *
     * Plain text needs no conversion in this filter, so it is passed through untouched.
     *
     * @param string $buffer The plain text buffer.
     *
     * @return string The same buffer, unchanged.
     */
    protected function _finalizePlainText( string $buffer ): string {
        return $buffer;
    }

    /**
     * Handles and finalizes an HTML tag.
     *
     * The tag's content is entity-decoded once before being turned into a placeholder, so that
     * attribute values which arrive encoded are stored with a single level of encoding.
     * For example, an attribute like `href="...?a=1&amp;amp;b=2"` becomes `href="...?a=1&amp;b=2"`.
     *
     * @param string $buffer The HTML tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeHTMLTag( string $buffer ): string {
        // Literal '<' and '>' are swapped for sentinels while decoding and restored afterwards.
        // A tag reaches this point with its angle brackets already decoded but with the rest of
        // its content (attribute values) still encoded.
        // Ex:
        // incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
        // this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
        $sentinels = [ '#_lt_#', '#_gt_#' ];
        $brackets  = [ '<', '>' ];

        $locked  = str_replace( $brackets, $sentinels, $buffer );
        $decoded = html_entity_decode( $locked, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );

        return $this->_finalizeTag( str_replace( $sentinels, $brackets, $decoded ) );
    }

    /**
     * Converts a generic tag string into a <ph> placeholder.
     *
     * The tag is entity-escaped (XML 1 entity set, quotes left as they are), base64 encoded and
     * stored in the 'equiv-text' attribute. The placeholder id is taken from the pipeline's
     * getNextId() and the ctype is CTypeEnum::XML.
     *
     * @param string $buffer The tag string to convert.
     *
     * @return string The resulting <ph> tag.
     */
    protected function _finalizeTag( string $buffer ): string {
        // The charset is stated explicitly so that escaping does not depend on the
        // `default_charset` ini setting and matches the decoding done in _finalizeHTMLTag().
        $escaped = htmlentities( $buffer, ENT_NOQUOTES | ENT_XML1, 'UTF-8' );

        return '<ph id="' . $this->getPipeline()->getNextId()
                . '" ctype="' . CTypeEnum::XML
                . '" equiv-text="base64:' . base64_encode( $escaped ) . '"/>';
    }

    /**
     * "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
     *
     * @param string $buffer The string buffer.
     *
     * @return string The buffer with every '<' replaced by '&lt;' and every '>' replaced by '&gt;'.
     */
    protected function _fixWrongBuffer( string $buffer ): string {
        // The two search strings cannot produce each other, so one pass is equivalent to two.
        return str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], $buffer );
    }

    /**
     * Finalizes a <script> tag by converting it into a placeholder.
     *
     * Unlike _finalizeHTMLTag(), no entity decoding is applied before the conversion.
     *
     * @param string $buffer The script tag string.
     *
     * @return string The generated <ph> placeholder tag.
     */
    protected function _finalizeScriptTag( string $buffer ): string {
        return $this->_finalizeTag( $buffer );
    }

    /**
     * Validates if a given string is a legitimate XML or HTML-like tag.
     *
     * This avoids the pitfalls of simpler tools like `strip_tags`, which can fail with strings
     * such as "3 < 4". The validation has two steps:
     * 1. A regular expression checks for a valid tag structure (name, attributes, brackets).
     * 2. A check ensures the string doesn't contain internal placeholders, which would indicate
     *    it's a partially processed string and not a single, complete tag.
     *
     * @param string $buffer The string to validate.
     *
     * @return bool True if the buffer is a valid tag, false otherwise.
     */
    protected function _isTagValid( string $buffer ): bool {

        /*
         * Accepted tags:
         * - start with an optional /
         * - have a name that does NOT start with a number
         * - have a name made of at least one of [a-zA-Z0-9\-._:]
         * - may carry attributes, with double-quoted, single-quoted or unquoted values, or no value at all
         * - end with an optional / before the closing bracket
         *
         * Unicode letters are not accepted in attribute names.
         * @see https://regex101.com/r/fZGsUT/1
         */
        $tagPattern = '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui';

        // preg_match() yields 1 only on a match; both "no match" (0) and "error" (false) are rejections.
        if ( preg_match( $tagPattern, $buffer ) !== 1 ) {
            return false;
        }

        // Safeguard against misinterpreting partially processed strings.
        // During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
        // If such placeholders exist within what looks like a tag, it means the tag's
        // content is not yet restored, so we must not treat it as a valid, final tag.
        // For example, an original string like '&lt;a href="<x/>"&gt;' could become
        // '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
        return strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) === false
                && strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) === false;

    }

    /**
     * Main transformation method.
     *
     * It instantiates an HtmlParser, registers this object as the callback handler,
     * and lets the parser process the input segment.
     *
     * @param string $segment The input string segment to process.
     *
     * @return string The transformed segment, as returned by the parser.
     */
    public function transform( string $segment ): string {
        $parser = new HtmlParser();
        $parser->registerCallbacksHandler( $this );

        return $parser->transform( $segment );
    }

}

1		<?php
2		/**
3		* Created by PhpStorm.
4		* @author domenico [email protected] / [email protected]
5		* Date: 05/11/18
6		* Time: 15.30
7		*
8		*/
9
10		namespace Matecat\SubFiltering\Filters;
11
12		use Matecat\SubFiltering\Commons\AbstractHandler;
13		use Matecat\SubFiltering\Enum\ConstantEnum;
14		use Matecat\SubFiltering\Enum\CTypeEnum;
15		use Matecat\SubFiltering\Filters\Html\CallbacksHandler;
16		use Matecat\SubFiltering\Filters\Html\HtmlParser;
17
18		/**
19		* Class XmlToPh
20		*
21		* This class converts HTML tags within a string into placeholder tags (<ph>).
22		* It uses an HtmlParser with a set of callbacks to process different parts of the HTML content.
23		*
24		* @author domenico [email protected] / [email protected]
25		* @package SubFiltering
26		*
27		*/
28		class XmlToPh extends AbstractHandler {
29
30		use CallbacksHandler;
31
32		/**
33		* Handles plain text content. Returns the buffer unchanged.
34		*
35		* @param string $buffer The plain text buffer.
36		*
37		* @return string The original buffer.
38		*/
39	93	protected function _finalizePlainText( string $buffer ): string {
40	93	return $buffer;
41		}
42
43		/**
44		* Handles and finalizes an HTML tag.
45		*
46		* This method decodes HTML entities within the tag's attributes while preserving the '<' and '>' characters of the tag itself.
47		* This is necessary to correctly handle encoded attribute values. For example, an attribute like `href="...?a=1&amp;amp;b=2"`
48		* becomes `href="...?a=1&amp;b=2"`.
49		*
50		* @param string $buffer The HTML tag string.
51		*
52		* @return string The generated <ph> placeholder tag.
53		*/
54	18	protected function _finalizeHTMLTag( string $buffer ): string {
55		// Decode attributes by locking < and > first
56		// Because a HTML tag has it's attributes encoded and here we get lt and gt decoded but not other parts of the string
57		// Ex:
58		// incoming string: <a href="/users/settings?test=123&amp;amp;foobar=1" target="_blank">
59		// this should be: <a href="/users/settings?test=123&amp;foobar=1" target="_blank"> with only one ampersand encoding
60		//
61	18	$buffer = str_replace( [ '<', '>' ], [ '#_lt_#', '#_gt_#' ], $buffer );
62	18	$buffer = html_entity_decode( $buffer, ENT_NOQUOTES | 16 /* ENT_XML1 */, 'UTF-8' );
63	18	$buffer = str_replace( [ '#_lt_#', '#_gt_#' ], [ '<', '>' ], $buffer );
64
65	18	return $this->_finalizeTag( $buffer );
66
67		}
68
69		/**
70		* Converts a generic tag string into a <ph> placeholder.
71		* The original tag is stored in the 'equiv-text' attribute, base64 encoded.
72		*
73		* @param string $buffer The tag string to convert.
74		*
75		* @return string The resulting <ph> tag.
76		*/
77	20	protected function _finalizeTag( string $buffer ): string {
78	20	return '<ph id="' . $this->getPipeline()->getNextId() . '" ctype="' . CTypeEnum::XML . '" equiv-text="base64:' . base64_encode( htmlentities( $buffer, ENT_NOQUOTES | 16 /* ENT_XML1 */ ) ) . '"/>';
79		}
80
81		/**
82		* "Fixes" a buffer that was incorrectly identified as a tag by escaping its angle brackets.
83		*
84		* @param string $buffer The string buffer.
85		*
86		* @return string The fixed string with escaped angle brackets.
87		*/
88	11	protected function _fixWrongBuffer( string $buffer ): string {
89	11	$buffer = str_replace( "<", "&lt;", $buffer );
90
91	11	return str_replace( ">", "&gt;", $buffer );
92		}
93
94		/**
95		* Finalizes a <script> tag by converting it into a placeholder.
96		*
97		* @param string $buffer The script tag string.
98		*
99		* @return string The generated <ph> placeholder tag.
100		*/
101	2	protected function _finalizeScriptTag( string $buffer ): string {
102	2	return $this->_finalizeTag( $buffer );
103		}
104
105		/**
106		* Validates if a given string is a legitimate XML or HTML-like tag.
107		*
108		* This method provides a robust way to identify tags, avoiding the common pitfalls of
109		* simpler tools like `strip_tags` which can fail with strings such as "3 < 4". It uses a
110		* two-step validation process:
111		* 1. A regular expression checks for a valid tag structure (name, attributes, brackets).
112		* 2. A check ensures the string doesn't contain internal placeholders, which would indicate
113		* it's a partially processed string and not a single, complete tag.
114		*
115		* @param string $buffer The string to validate.
116		*
117		* @return bool True if the buffer is a valid tag, false otherwise.
118		*/
119	22	protected function _isTagValid( string $buffer ): bool {
120
121		/*
122		* accept tags start with:
123		* - starting with / ( optional )
124		* - NOT starting with a number
125		* - containing [a-zA-Z0-9\-\._] at least 1
126		* - ending with a letter a-zA-Z0-9 or a quote "' or /
127		*
128		* Not accept Unicode letters in attributes
129		* @see https://regex101.com/r/fZGsUT/1
130		*/
131		// This regex validates the general structure of an XML/HTML tag.
132		// It checks for a valid tag name (not starting with a number), optional attributes
133		// (with quoted or unquoted values), and correct opening/closing brackets.
134	22	if ( preg_match( '#</?(?![0-9]+)[a-z0-9\-._:]+?(?:\s+[:a-z0-9\-._]+(?:=(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?)*\s*/?>#ui', $buffer ) ) {
135
136		// This is a safeguard against misinterpreting partially processed strings.
137		// During filtering, inner tags might be replaced by placeholders (e.g., ##LESSTHAN##).
138		// If such placeholders exist within what looks like a tag, it means the tag's
139		// content is not yet restored, so we must not treat it as a valid, final tag.
140		// For example, an original string like '&lt;a href="<x/>"&gt;' could become
141		// '<a href="##LESSTHAN##x/##GREATERTHAN##">', which should not be converted to a <ph> tag.
142	19	if ( strpos( $buffer, ConstantEnum::LTPLACEHOLDER ) !== false || strpos( $buffer, ConstantEnum::GTPLACEHOLDER ) !== false ) {
143		return false;
144		}
145
146	19	return true;
147		}
148
149	4	return false;
150
151		}
152
153		/**
154		* Main transformation method.
155		*
156		* It instantiates an HtmlParser, registers this class as the callback handler,
157		* and processes the input segment to convert HTML tags to placeholders.
158		*
159		* @param string $segment The input string segment to process.
160		*
161		* @return string The transformed segment.
162		*/
163	93	public function transform( string $segment ): string {
164	93	$parser = new HtmlParser();
165	93	$parser->registerCallbacksHandler( $this );
166
167	93	return $parser->transform( $segment );
168		}
169
170		}

matecat / subfiltering

Pull Request — master (#51)

XmlToPh A

Complexity

Size/Duplication

Test Coverage

Importance

7 Methods

Duplication Side-by-Side

Filter issues like