HtmlParser::parse() - Code Metrics - Inspection of "Code style updates" - matecat/xliff-parser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 993831...129cb7 )

by Domenico

created 2023-01-05 13:53 UTC

HtmlParser::parse() A

↳ Parent: HtmlParser

Complexity

Conditions	2
Paths	2

Size

Total Lines	11
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	2
eloc	6
c	1
b	0
f	0
nc	2
nop	1
dl	0
loc	11
rs	10

<?php

namespace Matecat\XliffParser\Utils;

/**
 * Regex-based extractor that turns an HTML(-like) string into a tree of node descriptors.
 *
 * Before the recursive pattern is applied, angular brackets that do not belong to a
 * well-formed, closed tag are swapped with placeholders so they cannot confuse the
 * pattern; the placeholders are turned back into < and > in the returned data.
 */
class HtmlParser {
    // Kept for backward compatibility; no longer used internally to split a node.
    const ORIGINAL_TEXT_PLACEHOLDER = '#####__ORIGINAL_TEXT__#####';
    const LT_PLACEHOLDER            = '#####__LT_PLACEHOLDER__#####';
    const GT_PLACEHOLDER            = '#####__GT_PLACEHOLDER__#####';

    /**
     * Parse a string and return the list of top-level nodes found in it.
     *
     * If Strings::isAnEscapedHTML() reports the input as escaped, it is decoded first and
     * the flag is forwarded so that the returned fragments are escaped again.
     *
     * This solution is taken from here and then modified:
     * https://www.php.net/manual/fr/regexp.reference.recursive.php#95568
     *
     * @param string $html
     *
     * @return array list of node objects (see extractHtmlNode() for the fields)
     */
    public static function parse( $html ) {
        $toBeEscaped = Strings::isAnEscapedHTML( $html );

        if ( $toBeEscaped ) {
            $html = Strings::htmlspecialchars_decode( $html );
        }

        // Order matters: unclosed tags are neutralised first, stray "<" afterwards.
        $html = self::protectNotClosedHtmlTags( $html );
        $html = self::protectNotHtmlLessThanSymbols( $html );

        return self::extractHtmlNode( $html, $toBeEscaped );
    }

    /**
     * Protect all < symbols that are not part of html tags.
     *
     * Example:
     *
     * <div id="1">< Ciao <<div id="2"></div></div>
     *
     * is converted to:
     *
     * <div id="1">#####__LT_PLACEHOLDER__##### Ciao #####__LT_PLACEHOLDER__#####<div id="2"></div></div>
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotHtmlLessThanSymbols( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        // $delta is how many bytes the string has grown because of earlier replacements;
        // the captured offsets refer to the untouched input, so it is added to each of them.
        $delta          = 0;
        $realNextOffset = 0;
        $next           = null;

        foreach ( $matches[ 0 ] as $key => $match ) {

            $current = $match[ 0 ];

            // NOTE(review): when the current bracket is the last one, $next and
            // $realNextOffset are not refreshed and keep the values from the previous iteration.
            if ( isset( $matches[ 0 ][ $key + 1 ][ 0 ] ) ) {
                $next           = $matches[ 0 ][ $key + 1 ][ 0 ];
                $realNextOffset = $matches[ 0 ][ $key + 1 ][ 1 ] + $delta;
            }

            $length     = strlen( $current );
            $realOffset = $match[ 1 ] + $delta;

            if ( $current === '<' && isset( $next ) ) {

                // Text from this "<" up to and including the next bracket.
                $insideAngularTags = substr( $html, $realOffset, ( $realNextOffset - $realOffset + 1 ) );

                // Protect the "<" when it is followed by another "<", or when the
                // "<...>" span is not recognised as an html string.
                if ( $next !== '>' || !Strings::isHtmlString( $insideAngularTags ) ) {
                    $html  = substr_replace( $html, self::LT_PLACEHOLDER, $realOffset, $length );
                    $delta = $delta + strlen( self::LT_PLACEHOLDER ) - $length;
                }
            }
        }

        return !is_array( $html ) ? $html : implode( $html );
    }

    /**
     * Protect not closed html tags.
     *
     * Example:
     *
     * Ciao <div> this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * is converted to:
     *
     * Ciao #####__LT_PLACEHOLDER__#####div#####__GT_PLACEHOLDER__##### this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotClosedHtmlTags( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        // Parallel lists, all indexed by the same tag index.
        $tags            = [];
        $offsets         = [];
        $originalLengths = [];

        // 1. Map all tags
        foreach ( $matches[ 0 ] as $key => $match ) {
            $current       = $match[ 0 ];
            $currentOffset = $match[ 1 ];

            // check every string inside angular brackets (< and >)
            if ( $current === '<' && isset( $matches[ 0 ][ $key + 1 ][ 0 ] ) && $matches[ 0 ][ $key + 1 ][ 0 ] === '>' ) {
                $nextOffset = $matches[ 0 ][ $key + 1 ][ 1 ];
                $tag        = substr( $html, ( $currentOffset + 1 ), ( $nextOffset - $currentOffset - 1 ) );
                $trimmedTag = trim( $tag );

                // if the tag is self closed do nothing
                if ( Strings::lastChar( $tag ) !== '/' ) {
                    $tags[]            = $trimmedTag;
                    $offsets[]         = $currentOffset;
                    $originalLengths[] = strlen( $tag ) + 2; // add 2 to length because there are < and >
                }
            }
        }

        // 2. Removing closed tags
        $indexes = [];

        foreach ( $tags as $index => $tag ) {

            // A tag containing "/" is looked up as itself, so it always finds itself;
            // any other tag is looked up through "/" + its name (the part before the first space).
            if ( Strings::contains( '/', $tag ) ) {
                $complementaryTag = $tag;
            } else {
                $complementaryTag = '/' . explode( ' ', $tag )[ 0 ];
            }

            // Strict search: tag names are strings and must not be compared numerically.
            $complementaryTagIndex = array_search( $complementaryTag, $tags, true );

            if ( false !== $complementaryTagIndex ) {
                $indexes[] = $index;
                $indexes[] = $complementaryTagIndex;
            }
        }

        foreach ( array_unique( $indexes ) as $index ) {
            unset( $tags[ $index ] );
        }

        // 3. Loop not closed tags (what is left in $tags), replacing their brackets
        $delta = 0;

        foreach ( $tags as $index => $tag ) {

            $length     = $originalLengths[ $index ];
            $realOffset = $offsets[ $index ] + $delta; // shift by the growth of earlier replacements

            $replacement = self::LT_PLACEHOLDER . $tag . self::GT_PLACEHOLDER;

            $html  = substr_replace( $html, $replacement, $realOffset, $length );
            $delta = $delta + strlen( $replacement ) - $length;
        }

        return !is_array( $html ) ? $html : implode( $html );
    }

    /**
     * Match every top-level node of $html with a recursive pattern and describe it.
     *
     * Each returned object carries: node, start, end, terminator, offset, tagname,
     * attributes, base64_decoded, self_closed, omittag, inner_html, has_children,
     * original_text and stripped_text.
     *
     * The start and end tags are cut out of the match using the byte offset of the
     * inner-content capture group, so inner text that also occurs in the tag name or in
     * an attribute value (e.g. <b>b</b>) does not corrupt them. For nodes without inner
     * text the whole match is reported as start and end is empty.
     *
     * @param string $html
     * @param bool   $toBeEscaped
     *
     * @return array
     */
    private static function extractHtmlNode( $html, $toBeEscaped = false ) {
        $pattern = "/<([a-zA-Z0-9._-]+)([^>]|[^<]*?)(([\s]*\/>)|" .
                "(>((([^<]*?|<\!\-\-.*?\-\->)|(?R))*)<\/\\1[\s]*>))/sm";
        preg_match_all( $pattern, $html, $matches, PREG_OFFSET_CAPTURE );

        $elements = [];

        foreach ( $matches[ 0 ] as $key => $match ) {

            $attributes    = isset( $matches[ 2 ][ $key ][ 0 ] ) ? self::getAttributes( $matches[ 2 ][ $key ][ 0 ] ) : [];
            $base64Decoded = ( isset( $attributes[ 'equiv-text' ] ) ) ? base64_decode( str_replace( "base64:", "", $attributes[ 'equiv-text' ] ) ) : null;
            $tagName       = $matches[ 1 ][ $key ][ 0 ];

            // Group 6 is the content between the opening and the closing tag.
            // isset() instead of empty(): "0" is legitimate inner text and must be kept.
            $text         = isset( $matches[ 6 ][ $key ][ 0 ] ) ? $matches[ 6 ][ $key ][ 0 ] : "";
            $originalText = $text;
            $strippedText = strip_tags( $text );

            // get start and end tags
            if ( $originalText !== '' ) {
                // Offsets are relative to $html; make the inner one relative to the match.
                $innerOffset = $matches[ 6 ][ $key ][ 1 ] - $match[ 1 ];
                $start       = substr( $match[ 0 ], 0, $innerOffset );
                $end         = substr( $match[ 0 ], $innerOffset + strlen( $originalText ) );
            } else {
                $start = $match[ 0 ];
                $end   = "";
            }

            // inner_html
            $inner_html = self::getInnerHtml( $matches, $key, $toBeEscaped );

            // node
            $node = self::rebuildNode( $originalText, $toBeEscaped, $start, $end );

            // terminator
            $terminator = ( $toBeEscaped ) ? '&gt;' : '>';

            // self closed
            $selfClosed = Strings::contains( '/>', trim( $start ) );

            $elements[] = (object)[
                    'node'           => self::restoreLessThanAndGreaterThanSymbols( $node ),
                    'start'          => self::restoreLessThanAndGreaterThanSymbols( $start ),
                    'end'            => self::restoreLessThanAndGreaterThanSymbols( $end ),
                    'terminator'     => $terminator,
                    'offset'         => $match[ 1 ],
                    'tagname'        => $tagName,
                    'attributes'     => $attributes,
                    'base64_decoded' => $base64Decoded,
                    'self_closed'    => $selfClosed,
                    'omittag'        => ( $matches[ 4 ][ $key ][ 1 ] > -1 ), // boolean: the "/>" group took part in the match
                    'inner_html'     => $inner_html,
                    'has_children'   => is_array( $inner_html ),
                    'original_text'  => ( $toBeEscaped ) ? self::restoreLessThanAndGreaterThanSymbols( Strings::escapeOnlyHTMLTags( $originalText ) ) : self::restoreLessThanAndGreaterThanSymbols( $originalText ),
                    'stripped_text'  => self::restoreLessThanAndGreaterThanSymbols( $strippedText ),
            ];
        }

        return $elements;
    }

    /**
     * Turn the LT/GT placeholders back into the < and > characters.
     *
     * @param string|string[] $text
     *
     * @return string|string[]
     */
    private static function restoreLessThanAndGreaterThanSymbols( $text ) {
        return str_replace( [ self::LT_PLACEHOLDER, self::GT_PLACEHOLDER ], [ '<', '>' ], $text );
    }

    /**
     * Concatenate start tag, inner text and end tag, passing each part through
     * Strings::escapeOnlyHTMLTags() when $toBeEscaped is true. Empty start/end are skipped.
     *
     * @param string      $originalText
     * @param bool        $toBeEscaped
     * @param string|null $start
     * @param string|null $end
     *
     * @return string
     */
    private static function rebuildNode( $originalText, $toBeEscaped, $start = null, $end = null ) {
        $node = '';

        if ( !empty( $start ) ) {
            $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $start ) : $start;
        }

        $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $originalText ) : $originalText;

        if ( !empty( $end ) ) {
            $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $end ) : $end;
        }

        return $node;
    }

    /**
     * Extract name="value" pairs (single or double quoted) from the attribute part of a tag.
     *
     * @param string $content
     *
     * @return array map of trimmed attribute name => raw attribute value
     */
    public static function getAttributes( $content ) {
        $pattern = '/(.*?)=("|\'|\\\")(.*?)("|\'|\\\"|\\\')/';

        preg_match_all( $pattern, $content, $matches, PREG_OFFSET_CAPTURE );

        $attributes = [];

        if ( isset( $matches[ 1 ] ) && count( $matches[ 1 ] ) > 0 ) {
            foreach ( $matches[ 1 ] as $key => $match ) {
                // group 1 is the name (may carry leading whitespace), group 3 the value
                $attributes[ trim( $match[ 0 ] ) ] = $matches[ 3 ][ $key ][ 0 ];
            }
        }

        return $attributes;
    }

    /**
     * Parse the inner content (capture group 6) of the match at position $key.
     *
     * @param array $matches result of preg_match_all() as produced in extractHtmlNode()
     * @param int   $key     index of the match being processed
     *
     * @param bool  $toBeEscaped
     *
     * @return array|string|null child nodes when the content holds any, otherwise the raw
     *                           content string; null when group 6 is not set
     */
    private static function getInnerHtml( $matches, $key, $toBeEscaped = false ) {
        if ( isset( $matches[ 6 ][ $key ][ 0 ] ) ) {
            $node = self::extractHtmlNode( $matches[ 6 ][ $key ][ 0 ], $toBeEscaped );

            return ( !empty( $node ) ) ? $node : $matches[ 6 ][ $key ][ 0 ];
        }

        return null;
    }
}


1			<?php
2
3			namespace Matecat\XliffParser\Utils;
4
5			class HtmlParser {
6			const ORIGINAL_TEXT_PLACEHOLDER = '#####__ORIGINAL_TEXT__#####';
7			const LT_PLACEHOLDER = '#####__LT_PLACEHOLDER__#####';
8			const GT_PLACEHOLDER = '#####__GT_PLACEHOLDER__#####';
9
10			/**
11			* This solution is taken from here and then modified:
12			* https://www.php.net/manual/fr/regexp.reference.recursive.php#95568
13			*
14			* @param string $html
15			*
16			* @return array
17			*/
18			public static function parse( $html ) {
19			$toBeEscaped = Strings::isAnEscapedHTML( $html );
20
21			if ( $toBeEscaped ) {
22			$html = Strings::htmlspecialchars_decode( $html );
23			}
24
25			$html = self::protectNotClosedHtmlTags( $html );
26			$html = self::protectNotHtmlLessThanSymbols( $html );
27
28			return self::extractHtmlNode( $html, $toBeEscaped );
29			}
30
31			/**
32			* Protect all < symbols that are not part of html tags.
33			*
34			* Example:
35			*
36			* <div id="1">< Ciao <<div id="2"></div></div>
37			*
38			* is converted to:
39			*
40			* <div id="1">#####__LT_PLACEHOLDER__##### Ciao #####__LT_PLACEHOLDER__#####<div id="2"></div></div>
41			*
42			* @param string $html
43			*
44			* @return string
45			*/
46			private static function protectNotHtmlLessThanSymbols( $html ) {
47			preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );
48
49			$delta = 0;
50			$realNextOffset = 0;
51			$next = null;
52
53			foreach ( $matches[ 0 ] as $key => $match ) {
54
55			$current = $matches[ 0 ][ $key ][ 0 ];
56
57			if ( isset( $matches[ 0 ][ $key + 1 ][ 0 ] ) ) {
58			$next = $matches[ 0 ][ $key + 1 ][ 0 ];
59			$nextOffset = $matches[ 0 ][ $key + 1 ][ 1 ];
60			$realNextOffset = ( $delta === 0 ) ? $nextOffset : ( $nextOffset + $delta );
61			}
62
63			$length = strlen( $match[ 0 ] );
64			$offset = $matches[ 0 ][ $key ][ 1 ];
65			$realOffset = ( $delta === 0 ) ? $offset : ( $offset + $delta );
66
67			if ( $current === '<' && isset( $next ) ) {
68
69			// 1. if next is > or
70			// 2. next is < and is not html tag (like < >)
71			$insideAngularTags = substr( $html, $realOffset, ( $realNextOffset - $realOffset + 1 ) );
72
73			if ( $next !== '>' || !Strings::isHtmlString( $insideAngularTags ) ) {
74			$html = substr_replace( $html, self::LT_PLACEHOLDER, $realOffset, $length );
75			$delta = $delta + strlen( self::LT_PLACEHOLDER ) - $length;
76			}
77			}
78			}
79
80			return !is_array( $html ) ? $html : implode( $html );
81			}
82
83			/**
84			* Protect not closed html tags.
85			*
86			* Example:
87			*
88			* Ciao <div> this div is not closed. <div>Instead, this is a closed div.</div>
89			*
90			* is converted to:
91			*
92			* Ciao #####__LT_PLACEHOLDER__#####div#####__GT_PLACEHOLDER__##### this div is not closed. <div>Instead, this is a closed div.</div>
93			*
94			* @param string $html
95			*
96			* @return string
97			*/
98			private static function protectNotClosedHtmlTags( $html ) {
99			preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );
100
101			$tags = [];
102			$offsets = [];
103			$originalLengths = [];
104
105			// 1. Map all tags
106			foreach ( $matches[ 0 ] as $key => $match ) {
107			$current = $matches[ 0 ][ $key ][ 0 ];
108			$currentOffset = $matches[ 0 ][ $key ][ 1 ];
109
110			// check every string inside angular brackets (< and >)
111			if ( $current === '<' && isset( $matches[ 0 ][ $key + 1 ][ 0 ] ) && $matches[ 0 ][ $key + 1 ][ 0 ] === '>' ) {
112			$nextOffset = $matches[ 0 ][ $key + 1 ][ 1 ];
113			$tag = substr( $html, ( $currentOffset + 1 ), ( $nextOffset - $currentOffset - 1 ) );
114			$trimmedTag = trim( $tag );
115
116			// if the tag is self closed do nothing
117			if ( Strings::lastChar( $tag ) !== '/' ) {
118			$tags[] = $trimmedTag;
119			$offsets[] = $currentOffset;
120			$originalLengths[] = strlen( $tag ) + 2; // add 2 to length because there are < and >
121			}
122			}
123			}
124
125			// 2. Removing closed tags
126			$indexes = [];
127
128			if ( count( $tags ) > 0 ) {
129			foreach ( $tags as $index => $tag ) {
130
131			if ( Strings::contains( '/', $tag ) ) {
132			$complementaryTag = $tag;
133			} else {
134			$complementaryTag = '/' . explode( ' ', $tag )[ 0 ];
135			}
136
137			$complementaryTagIndex = array_search( $complementaryTag, $tags );
138
139			if ( false !== $complementaryTagIndex ) {
140			$indexes[] = $index;
141			$indexes[] = $complementaryTagIndex;
142			}
143			}
144			}
145
146			$indexes = array_unique( $indexes );
147			foreach ( $indexes as $index ) {
148			unset( $tags[ $index ] );
149			}
150
151			// 3. Loop not closed tags
152			$delta = 0;
153
154			if ( count( $tags ) ) {
155			foreach ( $tags as $index => $tag ) {
156
157			$length = $originalLengths[ $index ];
158			$offset = $offsets[ $index ];
159			$realOffset = ( $delta === 0 ) ? $offset : ( $offset + $delta );
160
161			$replacement = self::LT_PLACEHOLDER . $tag . self::GT_PLACEHOLDER;
162
163			$html = substr_replace( $html, $replacement, $realOffset, $length );
164			$delta = $delta + strlen( $replacement ) - $length;
165			}
166			}
167
168			return !is_array( $html ) ? $html : implode( $html );
169			}
170
171			/**
172			* @param string $html
173			* @param bool $toBeEscaped
174			*
175			* @return array
176			*/
177			private static function extractHtmlNode( $html, $toBeEscaped = false ) {
178			$pattern = "/<([a-zA-Z0-9._-]+)([^>]|[^<]*?)(([\s]*\/>)|" .
179			"(>((([^<]*?|<\!\-\-.*?\-\->)|(?R))*)<\/\\1[\s]*>))/sm";
180			preg_match_all( $pattern, $html, $matches, PREG_OFFSET_CAPTURE );
181
182			$elements = [];
183
184			foreach ( $matches[ 0 ] as $key => $match ) {
185
186			$attributes = isset( $matches[ 2 ][ $key ][ 0 ] ) ? self::getAttributes( $matches[ 2 ][ $key ][ 0 ] ) : [];
187			$base64Decoded = ( isset( $attributes[ 'equiv-text' ] ) ) ? base64_decode( str_replace( "base64:", "", $attributes[ 'equiv-text' ] ) ) : null;
188			$tagName = $matches[ 1 ][ $key ][ 0 ];
189			$text = !empty( $matches[ 6 ][ $key ][ 0 ] ) ? $matches[ 6 ][ $key ][ 0 ] : "";
190			$originalText = $text;
191			$strippedText = strip_tags( $text );
192
193			// get start and end tags
194			$explodedNode = explode( self::ORIGINAL_TEXT_PLACEHOLDER, str_replace( $originalText, self::ORIGINAL_TEXT_PLACEHOLDER, $match[ 0 ] ) );
195
196			$start = ( isset( $explodedNode[ 0 ] ) ) ? $explodedNode[ 0 ] : "";
197			$end = ( isset( $explodedNode[ 1 ] ) ) ? $explodedNode[ 1 ] : "";
198
199			// inner_html
200			$inner_html = self::getInnerHtml( $matches, $key, $toBeEscaped );
201
202			// node
203			$node = self::rebuildNode( $originalText, $toBeEscaped, $start, $end );
204
205			// terminator
206			$terminator = ( $toBeEscaped ) ? '&gt;' : '>';
207
208			// self closed
209			$selfClosed = Strings::contains( '/>', trim( $start ) );
210
211			$elements[] = (object)[
212			'node' => self::restoreLessThanAndGreaterThanSymbols( $node ),
213			'start' => self::restoreLessThanAndGreaterThanSymbols( $start ),
214			'end' => self::restoreLessThanAndGreaterThanSymbols( $end ),
215			'terminator' => $terminator,
216			'offset' => $match[ 1 ],
217			'tagname' => $tagName,
218			'attributes' => $attributes,
219			'base64_decoded' => $base64Decoded,
220			'self_closed' => $selfClosed,
221			'omittag' => ( $matches[ 4 ][ $key ][ 1 ] > -1 ), // boolean
222			'inner_html' => $inner_html,
223			'has_children' => is_array( $inner_html ),
224			'original_text' => ( $toBeEscaped ) ? self::restoreLessThanAndGreaterThanSymbols( Strings::escapeOnlyHTMLTags( $originalText ) ) : self::restoreLessThanAndGreaterThanSymbols( $originalText ),
225			'stripped_text' => self::restoreLessThanAndGreaterThanSymbols( $strippedText ),
226			];
227			}
228
229			return $elements;
230			}
231
232			/**
233			* @param $text
234			*
235			* @return string|string[]
236			*/
237			private static function restoreLessThanAndGreaterThanSymbols( $text ) {
238			return str_replace( [ self::LT_PLACEHOLDER, self::GT_PLACEHOLDER ], [ '<', '>' ], $text );
239			}
240
241			/**
242			* @param string $originalText
243			* @param bool $toBeEscaped
244			* @param string $start
245			* @param string $end
246			*
247			* @return string
248			*/
249			private static function rebuildNode( $originalText, $toBeEscaped, $start = null, $end = null ) {
250			$node = '';
251
252			if ( !empty( $start ) ) {
253			$node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $start ) : $start;
254			}
255
256			$node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $originalText ) : $originalText;
257
258			if ( !empty( $end ) ) {
259			$node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $end ) : $end;
260			}
261
262			return $node;
263			}
264
265			/**
266			* @param $content
267			*
268			* @return mixed
269			*/
270			public static function getAttributes( $content ) {
271			$pattern = '/(.*?)=("|\'|\\\")(.*?)("|\'|\\\"|\\\')/';
272
273			preg_match_all( $pattern, $content, $matches, PREG_OFFSET_CAPTURE );
274
275			$attributes = [];
276
277			if ( isset( $matches[ 1 ] ) && count( $matches[ 1 ] ) > 0 ) {
278			foreach ( $matches[ 1 ] as $key => $match ) {
279			$attributes[ trim( $match[ 0 ] ) ] = $matches[ 3 ][ $key ][ 0 ];
280			}
281			}
282
283			return $attributes;
284			}
285
286			/**
287			* @param array $matches
288			* @param string $key
289			*
290			* @param bool $toBeEscaped
291			*
292			* @return array|mixed|string
293			*/
294			private static function getInnerHtml( $matches, $key, $toBeEscaped = false ) {
295			if ( isset( $matches[ 6 ][ $key ][ 0 ] ) ) {
296			$node = self::extractHtmlNode( $matches[ 6 ][ $key ][ 0 ], $toBeEscaped );
297
298			return ( !empty( $node ) ) ? $node : $matches[ 6 ][ $key ][ 0 ];
299			}
300
301			return null;
302			}
303			}
304

matecat / xliff-parser

Push — master ( 993831...129cb7 )

HtmlParser::parse() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like