<?php
/**
 * Performs transformations of HTML by wrapping around libxml2 and working
 * around its countless bugs.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

/**
 * Parses a chunk of HTML into a DOM tree, removes elements selected by a
 * small subset of CSS selectors, optionally flattens (unwraps) tags, and
 * serialises the result back to an HTML string.
 */
class HtmlFormatter {
	/**
	 * @var DOMDocument|null Built lazily by getDoc(); null until first requested
	 */
	private $doc;

	/** @var string HTML passed to the constructor */
	private $html;

	/** @var string[] Selectors registered through remove() */
	private $itemsToRemove = [];

	/** @var string[] Tag names or undelimited regexes registered through flatten() */
	private $elementsToFlatten = [];

	/** @var bool Whether filterContent() also removes img/audio/video elements */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on first use.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			// DOMDocument::loadHTML isn't very good with encodings, so
			// convert input to ASCII by encoding everything above 128 as entities.
			// NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2.
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing: https://phabricator.wikimedia.org/T55086
			// The space is written as a numeric entity so the parser keeps it.
			// TODO: Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Remember the caller's libxml error mode so it can be restored afterwards
			// instead of being unconditionally switched off.
			$previousErrorMode = libxml_use_internal_errors( true );

			// External entity loading is off by default from libxml 2.9.0, and
			// libxml_disable_entity_loader() is deprecated in PHP 8.0, so only
			// toggle it on older libxml builds.
			$toggleEntityLoader = LIBXML_VERSION < 20900;
			$loader = false;
			if ( $toggleEntityLoader ) {
				$loader = libxml_disable_entity_loader( true );
			}

			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			// loadHTML() rejects an empty string (a ValueError on PHP 8); leave the
			// document empty in that case.
			if ( $html !== '' ) {
				$this->doc->loadHTML( $html );
			}

			if ( $toggleEntityLoader ) {
				libxml_disable_entity_loader( $loader );
			}
			// Drop the parse errors collected above so they do not pile up in libxml's buffer.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selectors of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.<class>
	 *   .<class>
	 *   #<id>
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element names to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 *
	 * Selectors are processed by type, in this order: tags, IDs, classes, tag.class.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent() {
		$removals = $this->parseItemsToRemove();

		// Bail out early (without building the DOM) if no selector of any type is queued
		if ( array_sum( array_map( 'count', $removals ) ) === 0 ) {
			return [];
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of whack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = [];
		foreach ( $removals['TAG'] as $tagToRemove ) {
			foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
				$domElemsToRemove[] = $tagToRemoveNode;
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = [];
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = [];
		if ( count( $removals['CLASS'] ) > 0 ) {
			// The selector text is never interpolated into the XPath expression, so
			// quotes or other special characters in a class name cannot break the query.
			$xpath = new DOMXPath( $doc );
			$classedElements = [];
			foreach ( $xpath->query( '//*[@class]' ) as $element ) {
				$classedElements[] = $element;
			}
			foreach ( $removals['CLASS'] as $classToRemove ) {
				/** @var $element DOMElement */
				foreach ( $classedElements as $element ) {
					if ( $element->parentNode && $this->hasClasses( $element, [ $classToRemove ] ) ) {
						$domElemsToRemove[] = $element;
					}
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $selector ) {
			// "tag.a.b" => tag name "tag", required classes [ "a", "b" ]
			$requiredClasses = explode( '.', $selector );
			$tagName = array_shift( $requiredClasses );

			$domElemsToRemove = [];
			foreach ( $doc->getElementsByTagName( $tagName ) as $element ) {
				if ( $this->hasClasses( $element, $requiredClasses ) ) {
					$domElemsToRemove[] = $element;
				}
			}
			$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
		}

		return $removed;
	}

	/**
	 * Checks whether an element's class attribute contains every given class name
	 * as a whole, whitespace-separated token (so "nav" does not match "nav-bar").
	 *
	 * @param DOMElement $element Element to inspect
	 * @param array $classNames Class names that must all be present
	 * @return bool
	 */
	private function hasClasses( DOMElement $element, array $classNames ) {
		$tokens = preg_split( '/\s+/', $element->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY );
		foreach ( $classNames as $className ) {
			if ( !in_array( $className, $tokens, true ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array Array of removed elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy the node list into a plain array first; removing nodes while
			// iterating the list itself gives unreliable results.
			$list = [];
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes that are already detached (e.g. queued twice)
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		static $replacements;
		if ( !$replacements ) {
			// The four markup-significant entities are escaped once more so that the
			// entity decoding below turns them back into the same entities rather than
			// into raw characters that would change the markup.
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( [
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			] );
		}
		$html = $replacements->replace( $html );

		// Just in case the conversion in getDoc() above used named
		// entities that aren't known to html_entity_decode().
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );

		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (getDoc() was not called), the original HTML string is used as is.
	 *
	 * @param DOMElement|string|null $element Element, or ID of the element, to get HTML from;
	 *   null to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				// A document without <body> has nothing to swap the element into
				if ( $body ) {
					// Snapshot the children first: childNodes changes as nodes are removed
					$nodesArray = [];
					foreach ( $body->childNodes as $node ) {
						$nodesArray[] = $node;
					}
					foreach ( $nodesArray as $nodeArray ) {
						$body->removeChild( $nodeArray );
					}
					// Make the requested element the only content of <body>
					$body->appendChild( $element );
				}
			}
			$html = $this->doc->saveHTML();

			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML(); HTML comments are stripped as well
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( count( $this->elementsToFlatten ) > 0 ) {
			// Strip opening and closing tags of the listed elements, keeping their content
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * For TAG and TAG_CLASS selectors the raw name is the unmodified selector.
	 * @param string $selector CSS selector to parse
	 * @param string &$type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string &$rawName The raw name of the selector
	 * @return bool Whether the selector was successfully recognised
	 * @throws MWException If the selector matches none of the supported forms
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		if ( strpos( $selector, '.' ) === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '.' ) !== false ) {
			// A dot somewhere after the first character: <tag>.<class>
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Map of selector type (ID, TAG, CLASS, TAG_CLASS) to a list of raw names
	 */
	protected function parseItemsToRemove() {
		$removals = [
			'ID' => [],
			'TAG' => [],
			'CLASS' => [],
			'TAG_CLASS' => [],
		];

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}
|
367
|
|
|
|
// This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
// Consider making the comparison explicit by using
// empty(...) or !empty(...) instead.