|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace Matecat\XliffParser\Utils; |
|
4
|
|
|
|
|
5
|
|
|
class HtmlParser {

    /**
     * Marker historically used to split a node around its inner text.
     * It is no longer used internally (the split is offset based), but it is
     * kept because class constants are part of the public surface of the class.
     */
    const ORIGINAL_TEXT_PLACEHOLDER = '#####__ORIGINAL_TEXT__#####';

    /** Temporary replacement for a "<" that must not be interpreted as the start of a tag. */
    const LT_PLACEHOLDER = '#####__LT_PLACEHOLDER__#####';

    /** Temporary replacement for a ">" that must not be interpreted as the end of a tag. */
    const GT_PLACEHOLDER = '#####__GT_PLACEHOLDER__#####';

    /**
     * Parse a (possibly entity-escaped) HTML/XML fragment into a list of node objects.
     *
     * This solution is taken from here and then modified:
     * https://www.php.net/manual/fr/regexp.reference.recursive.php#95568
     *
     * The input is first decoded when Strings::isAnEscapedHTML() reports it as escaped
     * (NOTE(review): the exact detection rules live in Strings and are not visible here),
     * then not-closed tags and stray "<" symbols are masked with placeholders, and
     * finally the nodes are extracted recursively.
     *
     * @param string $html
     *
     * @return array list of stdClass nodes, see extractHtmlNode() for the available properties
     */
    public static function parse( $html ) {
        $toBeEscaped = Strings::isAnEscapedHTML( $html );

        if ( $toBeEscaped ) {
            $html = Strings::htmlspecialchars_decode( $html );
        }

        // order matters: not-closed tags are masked first, so that their "<" and ">"
        // are no longer seen by the less-than protection
        $html = self::protectNotClosedHtmlTags( $html );
        $html = self::protectNotHtmlLessThanSymbols( $html );

        return self::extractHtmlNode( $html, $toBeEscaped );
    }

    /**
     * Protect all < symbols that are not part of html tags.
     *
     * Example:
     *
     * <div id="1">< Ciao <<div id="2"></div></div>
     *
     * is converted to:
     *
     * <div id="1">#####__LT_PLACEHOLDER__##### Ciao #####__LT_PLACEHOLDER__#####<div id="2"></div></div>
     *
     * A "<" is kept only when the very next angular bracket is a ">" AND
     * Strings::isHtmlString() accepts the enclosed "<...>" string. Every other "<"
     * (followed by another "<", or by no bracket at all) is replaced.
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotHtmlLessThanSymbols( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        // $matches[ 0 ] may be missing if the regex engine fails (e.g. invalid UTF-8 with /u)
        $brackets          = isset( $matches[ 0 ] ) ? $matches[ 0 ] : [];
        $placeholderLength = strlen( self::LT_PLACEHOLDER );

        // number of bytes added to $html so far; the captured offsets refer to the original string
        $delta = 0;

        foreach ( $brackets as $key => $bracket ) {
            if ( $bracket[ 0 ] !== '<' ) {
                continue;
            }

            $realOffset = $bracket[ 1 ] + $delta;
            $isHtmlTag  = false;

            // the neighbour is looked up on every iteration, so that no state
            // leaks from the previous bracket
            if ( isset( $brackets[ $key + 1 ] ) && $brackets[ $key + 1 ][ 0 ] === '>' ) {
                $realNextOffset    = $brackets[ $key + 1 ][ 1 ] + $delta;
                $insideAngularTags = substr( $html, $realOffset, ( $realNextOffset - $realOffset + 1 ) );
                $isHtmlTag         = Strings::isHtmlString( $insideAngularTags );
            }

            if ( !$isHtmlTag ) {
                // "<" is a single byte, hence the replaced length of 1
                $html  = substr_replace( $html, self::LT_PLACEHOLDER, $realOffset, 1 );
                $delta += $placeholderLength - 1;
            }
        }

        return $html;
    }

    /**
     * Protect not closed html tags.
     *
     * Example:
     *
     * Ciao <div> this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * is converted to:
     *
     * Ciao #####__LT_PLACEHOLDER__#####div#####__GT_PLACEHOLDER__##### this div is not closed. <div>Instead, this is a closed div.</div>
     *
     * Opening and closing tags are paired one-to-one: every closing tag closes the
     * nearest preceding opening tag with the same name that is still open. Opening
     * tags left without a partner are masked.
     *
     * Self-closed tags, closing tags and, as in the previous implementation, any tag
     * containing a "/" (e.g. inside an attribute value) are never masked.
     *
     * @param string $html
     *
     * @return string
     */
    private static function protectNotClosedHtmlTags( $html ) {
        preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

        $brackets = isset( $matches[ 0 ] ) ? $matches[ 0 ] : [];

        $tags            = [];
        $offsets         = [];
        $originalLengths = [];

        // 1. Map all tags: every "<" immediately followed (bracket-wise) by a ">"
        foreach ( $brackets as $key => $bracket ) {
            if ( $bracket[ 0 ] !== '<' || !isset( $brackets[ $key + 1 ] ) || $brackets[ $key + 1 ][ 0 ] !== '>' ) {
                continue;
            }

            $currentOffset = $bracket[ 1 ];
            $nextOffset    = $brackets[ $key + 1 ][ 1 ];
            $tag           = substr( $html, ( $currentOffset + 1 ), ( $nextOffset - $currentOffset - 1 ) );

            // if the tag is self closed do nothing
            if ( Strings::lastChar( $tag ) !== '/' ) {
                $tags[]            = trim( $tag );
                $offsets[]         = $currentOffset;
                $originalLengths[] = strlen( $tag ) + 2; // add 2 to length because there are < and >
            }
        }

        // 2. Pair opening and closing tags.
        //    $openTags maps a tag name to the stack of indexes of its still-open tags.
        $openTags = [];

        foreach ( $tags as $index => $tag ) {
            if ( Strings::contains( '/', $tag ) ) {
                // a real closing tag consumes exactly one pending opener with the same name
                if ( strpos( $tag, '/' ) === 0 ) {
                    $name = (string)substr( $tag, 1 );

                    if ( !empty( $openTags[ $name ] ) ) {
                        array_pop( $openTags[ $name ] );
                    }
                }

                continue;
            }

            // the tag name ends at the first whitespace (space, tab or new line)
            $parts = preg_split( '/\s+/', $tag );
            $name  = ( is_array( $parts ) && isset( $parts[ 0 ] ) ) ? $parts[ 0 ] : $tag;

            $openTags[ $name ][] = $index;
        }

        $notClosed = [];

        foreach ( $openTags as $stack ) {
            foreach ( $stack as $index ) {
                $notClosed[] = $index;
            }
        }

        // replacements must be applied left to right for $delta to be valid
        sort( $notClosed );

        // 3. Loop not closed tags
        $delta = 0;

        foreach ( $notClosed as $index ) {
            $length      = $originalLengths[ $index ];
            $realOffset  = $offsets[ $index ] + $delta;
            $replacement = self::LT_PLACEHOLDER . $tags[ $index ] . self::GT_PLACEHOLDER;

            $html  = substr_replace( $html, $replacement, $realOffset, $length );
            $delta += strlen( $replacement ) - $length;
        }

        return $html;
    }

    /**
     * Extract every top-level node of $html (recursing into the children).
     *
     * Each returned element is a stdClass exposing:
     *  - node, start, end: the whole node, its opening tag and its closing tag
     *  - terminator:       "&gt;" when the input was entity-escaped, ">" otherwise
     *  - offset:           byte offset of the node inside $html
     *  - tagname, attributes, base64_decoded (decoded "equiv-text" attribute, or null)
     *  - self_closed, omittag
     *  - inner_html:       array of child nodes, or the raw inner string when there are no children
     *  - has_children, original_text, stripped_text
     *
     * @param string $html
     * @param bool   $toBeEscaped
     *
     * @return array
     */
    private static function extractHtmlNode( $html, $toBeEscaped = false ) {
        // capture groups: 1 = tag name, 2 = attributes, 4 = self-closing terminator, 6 = inner content
        $pattern = "/<([a-zA-Z0-9._-]+)([^>]|[^<]*?)(([\s]*\/>)|" .
                "(>((([^<]*?|<\!\-\-.*?\-\->)|(?R))*)<\/\\1[\s]*>))/sm";
        preg_match_all( $pattern, $html, $matches, PREG_OFFSET_CAPTURE );

        $elements = [];

        if ( empty( $matches[ 0 ] ) ) {
            return $elements;
        }

        // terminator: must be the entity when the nodes are re-escaped
        $terminator = ( $toBeEscaped ) ? '&gt;' : '>';

        foreach ( $matches[ 0 ] as $key => $match ) {

            $attributes    = isset( $matches[ 2 ][ $key ][ 0 ] ) ? self::getAttributes( $matches[ 2 ][ $key ][ 0 ] ) : [];
            $base64Decoded = ( isset( $attributes[ 'equiv-text' ] ) ) ? base64_decode( str_replace( "base64:", "", $attributes[ 'equiv-text' ] ) ) : null;
            $tagName       = $matches[ 1 ][ $key ][ 0 ];

            // explicit comparison instead of empty(): "0" is a legitimate inner text
            $text         = isset( $matches[ 6 ][ $key ][ 0 ] ) ? (string)$matches[ 6 ][ $key ][ 0 ] : "";
            $textOffset   = isset( $matches[ 6 ][ $key ][ 1 ] ) ? $matches[ 6 ][ $key ][ 1 ] : -1;
            $originalText = $text;
            $strippedText = strip_tags( $text );

            // get start and end tags.
            // The node is cut at the captured offset of the inner text; searching for the
            // text instead would also hit its occurrences inside the tags (e.g. <b>b</b>).
            if ( $text !== '' && $textOffset >= $match[ 1 ] ) {
                $relativeOffset = $textOffset - $match[ 1 ];
                $start          = (string)substr( $match[ 0 ], 0, $relativeOffset );
                $end            = (string)substr( $match[ 0 ], $relativeOffset + strlen( $text ) );
            } else {
                // no inner text: the whole match is reported as the start tag
                $start = $match[ 0 ];
                $end   = "";
            }

            // inner_html
            $inner_html = self::getInnerHtml( $matches, $key, $toBeEscaped );

            // node
            $node = self::rebuildNode( $originalText, $toBeEscaped, $start, $end );

            // self closed
            $selfClosed = Strings::contains( '/>', trim( $start ) );

            $elements[] = (object)[
                    'node'           => self::restoreLessThanAndGreaterThanSymbols( $node ),
                    'start'          => self::restoreLessThanAndGreaterThanSymbols( $start ),
                    'end'            => self::restoreLessThanAndGreaterThanSymbols( $end ),
                    'terminator'     => $terminator,
                    'offset'         => $match[ 1 ],
                    'tagname'        => $tagName,
                    'attributes'     => $attributes,
                    'base64_decoded' => $base64Decoded,
                    'self_closed'    => $selfClosed,
                    'omittag'        => ( $matches[ 4 ][ $key ][ 1 ] > -1 ), // boolean
                    'inner_html'     => $inner_html,
                    'has_children'   => is_array( $inner_html ),
                    'original_text'  => ( $toBeEscaped ) ? self::restoreLessThanAndGreaterThanSymbols( Strings::escapeOnlyHTMLTags( $originalText ) ) : self::restoreLessThanAndGreaterThanSymbols( $originalText ),
                    'stripped_text'  => self::restoreLessThanAndGreaterThanSymbols( $strippedText ),
            ];
        }

        return $elements;
    }

    /**
     * Put back the real "<" and ">" symbols in place of the placeholders.
     *
     * @param string|string[] $text
     *
     * @return string|string[]
     */
    private static function restoreLessThanAndGreaterThanSymbols( $text ) {
        return str_replace( [ self::LT_PLACEHOLDER, self::GT_PLACEHOLDER ], [ '<', '>' ], $text );
    }

    /**
     * Concatenate start tag, inner text and end tag back into a node string,
     * passing each part through Strings::escapeOnlyHTMLTags() when $toBeEscaped is set.
     *
     * @param string      $originalText
     * @param bool        $toBeEscaped
     * @param string|null $start
     * @param string|null $end
     *
     * @return string
     */
    private static function rebuildNode( $originalText, $toBeEscaped, $start = null, $end = null ) {
        $node = '';

        if ( !empty( $start ) ) {
            $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $start ) : $start;
        }

        $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $originalText ) : $originalText;

        if ( !empty( $end ) ) {
            $node .= ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $end ) : $end;
        }

        return $node;
    }

    /**
     * Extract the name="value" pairs from the attribute portion of a tag.
     *
     * Values may be delimited by double quotes, single quotes or backslash-escaped
     * double quotes. When the same name occurs more than once the last value wins.
     *
     * @param string $content
     *
     * @return array map of attribute name => attribute value
     */
    public static function getAttributes( $content ) {
        $pattern = '/(.*?)=("|\'|\\\")(.*?)("|\'|\\\"|\\\')/';

        preg_match_all( $pattern, $content, $matches, PREG_OFFSET_CAPTURE );

        $attributes = [];

        if ( empty( $matches[ 1 ] ) ) {
            return $attributes;
        }

        foreach ( $matches[ 1 ] as $key => $match ) {
            // group 1 also captures the whitespace that separates the attributes
            $attributes[ trim( $match[ 0 ] ) ] = $matches[ 3 ][ $key ][ 0 ];
        }

        return $attributes;
    }

    /**
     * Resolve the inner content of the node found at position $key.
     *
     * @param array $matches result of preg_match_all() as produced by extractHtmlNode()
     * @param int   $key     index of the node inside $matches
     *
     * @param bool  $toBeEscaped
     *
     * @return array|string|null the child nodes when there are any, the raw inner string
     *                           otherwise, null when the inner-content group is not available
     */
    private static function getInnerHtml( $matches, $key, $toBeEscaped = false ) {
        if ( isset( $matches[ 6 ][ $key ][ 0 ] ) ) {
            $node = self::extractHtmlNode( $matches[ 6 ][ $key ][ 0 ], $toBeEscaped );

            return ( !empty( $node ) ) ? $node : $matches[ 6 ][ $key ][ 0 ];
        }

        return null;
    }
}
|
304
|
|
|
|