Passed
Push — master ( 993831...129cb7 )
by Domenico
03:20
created

HtmlParser::extractHtmlNode()   B

Complexity

Conditions 9
Paths 129

Size

Total Lines 53
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
eloc 34
c 0
b 0
f 0
nc 129
nop 2
dl 0
loc 53
rs 7.8138

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
3
namespace Matecat\XliffParser\Utils;
4
5
class HtmlParser {
    /**
     * Legacy marker, no longer used internally; kept because class constants are public API.
     */
    const ORIGINAL_TEXT_PLACEHOLDER = '#####__ORIGINAL_TEXT__#####';

    /**
     * Temporary stand-ins for "<" and ">" characters that must not be seen as markup
     * by the node-extraction regex. They are swapped back by
     * restoreLessThanAndGreaterThanSymbols().
     */
    const LT_PLACEHOLDER            = '#####__LT_PLACEHOLDER__#####';
    const GT_PLACEHOLDER            = '#####__GT_PLACEHOLDER__#####';

    /**
     * Parse an HTML fragment into a tree of node descriptors.
     *
     * The recursive-regex approach is taken from here and then modified:
     * https://www.php.net/manual/fr/regexp.reference.recursive.php#95568
     *
     * Steps:
     *  1. if Strings::isAnEscapedHTML() reports the input as escaped, decode it and
     *     remember that the rebuilt nodes must be escaped again;
     *  2. replace the angular brackets of unclosed tags and of stray "<" symbols with placeholders;
     *  3. extract the nodes recursively.
     *
     * @param string $html
     *
     * @return array list of stdClass node descriptors (see extractHtmlNode())
     */
    public static function parse( $html ) {
        $toBeEscaped = Strings::isAnEscapedHTML( $html );

        if ( $toBeEscaped ) {
            $html = Strings::htmlspecialchars_decode( $html );
        }

        // Order matters: unclosed tags are neutralised first, so the second pass
        // only has to deal with the brackets that are still present.
        $html = self::protectNotClosedHtmlTags( $html );
        $html = self::protectNotHtmlLessThanSymbols( $html );

        return self::extractHtmlNode( $html, $toBeEscaped );
    }

    /**
     * Protect all < symbols that are not part of html tags.
     *
     * Example:
     *
     * <div id="1">< Ciao <<div id="2"></div></div>
     *
     * is converted to:
     *
     * <div id="1">#####__LT_PLACEHOLDER__##### Ciao #####__LT_PLACEHOLDER__#####<div id="2"></div></div>
     *
     * A "<" is kept only when the very next angular bracket is a ">" and
     * Strings::isHtmlString() accepts the "<...>" span between them. A "<" followed
     * by another "<", or by no bracket at all, is always protected.
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotHtmlLessThanSymbols( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        // Nothing matched, or the match failed (e.g. invalid UTF-8 with the "u" flag).
        if ( empty( $matches[ 0 ] ) ) {
            return $html;
        }

        $brackets          = $matches[ 0 ];
        $placeholderLength = strlen( self::LT_PLACEHOLDER );

        // Offsets were captured on the untouched string: every replacement shifts
        // the following ones by the growth accumulated in $delta.
        $delta = 0;

        foreach ( $brackets as $key => $bracket ) {
            if ( $bracket[ 0 ] !== '<' ) {
                continue;
            }

            $realOffset = $bracket[ 1 ] + $delta;
            $next       = isset( $brackets[ $key + 1 ] ) ? $brackets[ $key + 1 ] : null;
            $isTag      = false;

            if ( $next !== null && $next[ 0 ] === '>' ) {
                // Span from "<" to the matching ">" inclusive.
                $realNextOffset    = $next[ 1 ] + $delta;
                $insideAngularTags = substr( $html, $realOffset, ( $realNextOffset - $realOffset + 1 ) );
                $isTag             = Strings::isHtmlString( $insideAngularTags );
            }

            if ( !$isTag ) {
                // The matched bracket is exactly one byte long.
                $html  = substr_replace( $html, self::LT_PLACEHOLDER, $realOffset, 1 );
                $delta += $placeholderLength - 1;
            }
        }

        return $html;
    }

    /**
     * Protect not closed html tags.
     *
     * Example:
     *
     * Ciao <div> this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * is converted to:
     *
     * Ciao #####__LT_PLACEHOLDER__#####div#####__GT_PLACEHOLDER__##### this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotClosedHtmlTags( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        // Nothing matched, or the match failed (e.g. invalid UTF-8 with the "u" flag).
        if ( empty( $matches[ 0 ] ) ) {
            return $html;
        }

        $brackets        = $matches[ 0 ];
        $tags            = [];
        $offsets         = [];
        $originalLengths = [];

        // 1. Map all tags: every "<" immediately followed (bracket-wise) by a ">"
        foreach ( $brackets as $key => $bracket ) {
            $isOpening  = ( $bracket[ 0 ] === '<' );
            $hasClosing = isset( $brackets[ $key + 1 ] ) && $brackets[ $key + 1 ][ 0 ] === '>';

            if ( !$isOpening || !$hasClosing ) {
                continue;
            }

            $currentOffset = $bracket[ 1 ];
            $nextOffset    = $brackets[ $key + 1 ][ 1 ];
            $tag           = substr( $html, ( $currentOffset + 1 ), ( $nextOffset - $currentOffset - 1 ) );

            // if the tag is self closed do nothing
            if ( Strings::lastChar( $tag ) !== '/' ) {
                $tags[]            = trim( $tag );
                $offsets[]         = $currentOffset;
                $originalLengths[] = strlen( $tag ) + 2; // add 2 to length because there are < and >
            }
        }

        // 2. Removing closed tags
        $closedIndexes = [];

        foreach ( $tags as $index => $tag ) {
            // A tag containing "/" is searched as is (so it always finds at least itself);
            // any other tag is paired with "/" + its name, i.e. the text before the first space.
            if ( Strings::contains( '/', $tag ) ) {
                $complementaryTag = $tag;
            } else {
                $complementaryTag = '/' . explode( ' ', $tag )[ 0 ];
            }

            $complementaryTagIndex = array_search( $complementaryTag, $tags, true );

            if ( false !== $complementaryTagIndex ) {
                $closedIndexes[ $index ]                 = true;
                $closedIndexes[ $complementaryTagIndex ] = true;
            }
        }

        // Keys are preserved, so they still index $offsets and $originalLengths.
        $notClosedTags = array_diff_key( $tags, $closedIndexes );

        // 3. Loop not closed tags (in document order, so $delta only ever affects later offsets)
        $delta = 0;

        foreach ( $notClosedTags as $index => $tag ) {
            $length      = $originalLengths[ $index ];
            $realOffset  = $offsets[ $index ] + $delta;
            $replacement = self::LT_PLACEHOLDER . $tag . self::GT_PLACEHOLDER;

            $html  = substr_replace( $html, $replacement, $realOffset, $length );
            $delta += strlen( $replacement ) - $length;
        }

        return $html;
    }

    /**
     * Extract the top-level nodes of $html, recursing into their content.
     *
     * Each returned element is a stdClass with the keys:
     * node, start, end, terminator, offset, tagname, attributes, base64_decoded,
     * self_closed, omittag, inner_html, has_children, original_text, stripped_text.
     *
     * Regex capture groups used below: 1 = tag name, 2 = attributes string,
     * 4 = the "/>" of a self-closing tag, 6 = the content between start and end tag.
     *
     * @param string $html
     * @param bool   $toBeEscaped
     *
     * @return array
     */
    private static function extractHtmlNode( $html, $toBeEscaped = false ) {
        $pattern = "/<([a-zA-Z0-9._-]+)([^>]|[^<]*?)(([\s]*\/>)|" .
                "(>((([^<]*?|<\!\-\-.*?\-\->)|(?R))*)<\/\\1[\s]*>))/sm";
        preg_match_all( $pattern, $html, $matches, PREG_OFFSET_CAPTURE );

        // No node found, or the regex engine gave up: there is nothing to extract.
        if ( empty( $matches[ 0 ] ) ) {
            return [];
        }

        // terminator
        $terminator = ( $toBeEscaped ) ? '&gt;' : '>';

        $elements = [];

        foreach ( $matches[ 0 ] as $key => $match ) {

            $attributes    = isset( $matches[ 2 ][ $key ][ 0 ] ) ? self::getAttributes( $matches[ 2 ][ $key ][ 0 ] ) : [];
            $base64Decoded = ( isset( $attributes[ 'equiv-text' ] ) ) ? base64_decode( str_replace( "base64:", "", $attributes[ 'equiv-text' ] ) ) : null;
            $tagName       = $matches[ 1 ][ $key ][ 0 ];

            // isset() instead of !empty(): an inner text of "0" is a legitimate content.
            $originalText = isset( $matches[ 6 ][ $key ][ 0 ] ) ? $matches[ 6 ][ $key ][ 0 ] : "";
            $textOffset   = isset( $matches[ 6 ][ $key ][ 1 ] ) ? $matches[ 6 ][ $key ][ 1 ] : -1;
            $strippedText = strip_tags( $originalText );

            // get start and end tags
            $explodedNode = self::splitNodeAroundText( $match[ 0 ], $match[ 1 ], $originalText, $textOffset );
            $start        = $explodedNode[ 0 ];
            $end          = $explodedNode[ 1 ];

            // inner_html
            $inner_html = self::getInnerHtml( $matches, $key, $toBeEscaped );

            // node
            $node = self::rebuildNode( $originalText, $toBeEscaped, $start, $end );

            // self closed
            $selfClosed = Strings::contains( '/>', trim( $start ) );

            // original text, escaped again when the input was escaped
            $restoredText = ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $originalText ) : $originalText;

            $elements[] = (object)[
                    'node'           => self::restoreLessThanAndGreaterThanSymbols( $node ),
                    'start'          => self::restoreLessThanAndGreaterThanSymbols( $start ),
                    'end'            => self::restoreLessThanAndGreaterThanSymbols( $end ),
                    'terminator'     => $terminator,
                    'offset'         => $match[ 1 ],
                    'tagname'        => $tagName,
                    'attributes'     => $attributes,
                    'base64_decoded' => $base64Decoded,
                    'self_closed'    => $selfClosed,
                    'omittag'        => ( $matches[ 4 ][ $key ][ 1 ] > -1 ), // boolean: group 4 took part in the match
                    'inner_html'     => $inner_html,
                    'has_children'   => is_array( $inner_html ),
                    'original_text'  => self::restoreLessThanAndGreaterThanSymbols( $restoredText ),
                    'stripped_text'  => self::restoreLessThanAndGreaterThanSymbols( $strippedText ),
            ];
        }

        return $elements;
    }

    /**
     * Split a matched node into what precedes and what follows its inner text.
     *
     * The split uses the byte offsets captured by the regex rather than a textual
     * search, so an inner text that also occurs in the tag name or in an attribute
     * (e.g. <b>b</b>) cannot confuse it.
     *
     * When the inner text is empty (or its group did not take part in the match) the
     * whole node is returned as start and the end is an empty string.
     *
     * @param string $node       the whole matched node
     * @param int    $nodeOffset byte offset of the node in the parsed string
     * @param string $text       the inner text of the node
     * @param int    $textOffset byte offset of the inner text in the parsed string, -1 if absent
     *
     * @return array [ start, end ]
     */
    private static function splitNodeAroundText( $node, $nodeOffset, $text, $textOffset ) {
        if ( $text === '' || $textOffset < $nodeOffset ) {
            return [ $node, '' ];
        }

        $relativeOffset = $textOffset - $nodeOffset;

        // (string) casts: before PHP 8 substr() returns false instead of '' at the end of the string.
        return [
                (string)substr( $node, 0, $relativeOffset ),
                (string)substr( $node, $relativeOffset + strlen( $text ) ),
        ];
    }

    /**
     * Replace the LT/GT placeholders with the real "<" and ">" symbols.
     *
     * @param string|string[] $text
     *
     * @return string|string[]
     */
    private static function restoreLessThanAndGreaterThanSymbols( $text ) {
        return str_replace( [ self::LT_PLACEHOLDER, self::GT_PLACEHOLDER ], [ '<', '>' ], $text );
    }

    /**
     * Concatenate start tag, inner text and end tag back into a node string.
     * When $toBeEscaped is true every part goes through Strings::escapeOnlyHTMLTags() first.
     *
     * @param string      $originalText
     * @param bool        $toBeEscaped
     * @param string|null $start
     * @param string|null $end
     *
     * @return string
     */
    private static function rebuildNode( $originalText, $toBeEscaped, $start = null, $end = null ) {
        $parts = [];

        if ( !empty( $start ) ) {
            $parts[] = $start;
        }

        $parts[] = $originalText;

        if ( !empty( $end ) ) {
            $parts[] = $end;
        }

        $node = '';

        foreach ( $parts as $part ) {
            $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $part ) : $part;
        }

        return $node;
    }

    /**
     * Extract the name => value pairs from the attributes portion of a tag.
     *
     * Names are trimmed; when a name is repeated the last value wins.
     *
     * @param string $content
     *
     * @return array
     */
    public static function getAttributes( $content ) {
        $pattern = '/(.*?)=("|\'|\\\")(.*?)("|\'|\\\"|\\\')/';

        preg_match_all( $pattern, $content, $matches );

        $attributes = [];

        // Group 1 holds the names, group 3 the values.
        if ( empty( $matches[ 1 ] ) ) {
            return $attributes;
        }

        foreach ( $matches[ 1 ] as $key => $name ) {
            $attributes[ trim( $name ) ] = $matches[ 3 ][ $key ];
        }

        return $attributes;
    }

    /**
     * Resolve the inner html of the match at index $key.
     *
     * @param array $matches result of preg_match_all() with PREG_OFFSET_CAPTURE
     * @param int   $key     index of the match
     * @param bool  $toBeEscaped
     *
     * @return array|string|null the child nodes when the content contains any, the raw content
     *                           string otherwise, null when the content group is not set
     */
    private static function getInnerHtml( $matches, $key, $toBeEscaped = false ) {
        if ( !isset( $matches[ 6 ][ $key ][ 0 ] ) ) {
            return null;
        }

        $content  = $matches[ 6 ][ $key ][ 0 ];
        $children = self::extractHtmlNode( $content, $toBeEscaped );

        return ( !empty( $children ) ) ? $children : $content;
    }
}
304