Completed
Branch master (d58858)
by
unknown
28:23
created

HtmlFormatter   B

Complexity

Total Complexity 49

Size/Duplication

Total Lines 344
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 49
c 1
b 0
f 0
lcom 1
cbo 2
dl 0
loc 344
rs 8.5454

14 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A wrapHTML() 0 3 1
A onHtmlReady() 0 3 1
A getDoc() 0 22 2
A setRemoveMedia() 0 3 1
A remove() 0 3 1
A flatten() 0 3 1
A flattenAllTags() 0 3 1
C filterContent() 0 70 13
B removeElements() 0 16 5
A fixLibXML() 0 20 2
D getText() 0 40 9
B parseSelector() 0 19 7
B parseItemsToRemove() 0 24 4

How to fix   Complexity   

Complex Class

Complex classes like HtmlFormatter often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HtmlFormatter, and based on these observations, apply Extract Interface, too.

1
<?php
2
/**
3
 * Performs transformations of HTML by wrapping around libxml2 and working
4
 * around its countless bugs.
5
 *
6
 * This program is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License along
17
 * with this program; if not, write to the Free Software Foundation, Inc.,
18
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
 * http://www.gnu.org/copyleft/gpl.html
20
 *
21
 * @file
22
 */
23
class HtmlFormatter {
	/**
	 * DOM built lazily from $html; stays unset until getDoc() is first called.
	 *
	 * @var DOMDocument|null
	 */
	private $doc;

	/**
	 * @var string HTML handed to the constructor
	 */
	private $html;

	/**
	 * @var string[] Selectors queued by remove()
	 */
	private $itemsToRemove = [];

	/**
	 * @var string[] Tag names (or undelimited regexes) queued by flatten()
	 */
	private $elementsToFlatten = [];

	/**
	 * @var bool Whether parseItemsToRemove() adds img, audio and video to the tag removals
	 */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Builds the DOM on first use and returns the same instance on later calls.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			// DOMDocument::loadHTML isn't very good with encodings, so
			// convert input to ASCII by encoding everything above 128 as entities.
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing: https://phabricator.wikimedia.org/T55086
			// TODO: Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Remember the caller's libxml error mode so it can be put back afterwards
			// instead of being forced off.
			$previousErrorMode = libxml_use_internal_errors( true );

			// libxml_disable_entity_loader() is deprecated on PHP 8 and libxml 2.9.0+
			// does not load external entities by default, so only toggle it on older libxml.
			$toggleEntityLoader = LIBXML_VERSION < 20900;
			$loader = $toggleEntityLoader ? libxml_disable_entity_loader() : false;

			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			// loadHTML() rejects an empty string (warning on PHP 7, ValueError on PHP 8);
			// an empty input simply yields an empty document.
			if ( $html !== '' ) {
				$this->doc->loadHTML( $html );
			}

			if ( $toggleEntityLoader ) {
				libxml_disable_entity_loader( $loader );
			}
			// Drop the errors buffered while parsing, then restore the previous mode.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove.  The text of the removed elements can be
	 * extracted with the getText method.
	 * @return array Array of removed DOMElements
	 * @throws MWException If a queued selector is not recognised by parseSelector()
	 */
	public function filterContent() {
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do (every selector bucket is empty)
		if ( !array_filter( $removals ) ) {
			return [];
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = [];
		foreach ( $removals['TAG'] as $tagToRemove ) {
			foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
				$domElemsToRemove[] = $tagToRemoveNode;
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = [];
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = [];
		$xpath = new DOMXPath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
			if ( $elements === false ) {
				// The class name produced an invalid XPath expression; nothing to match.
				continue;
			}

			// Quote the class name so regex metacharacters and the "/" delimiter
			// inside it are matched literally.
			$classPattern = '/\b' . preg_quote( $classToRemove, '/' ) . '\b/';

			/** @var $element DOMElement */
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				if ( preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			// Only elements whose class attribute is exactly the given value are matched.
			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			if ( $elements === false ) {
				// Invalid XPath expression built from the selector; skip it.
				continue;
			}
			$removed = array_merge( $removed, $this->removeElements( $elements ) );
		}

		return $removed;
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array Array of removed elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into a plain array first so removal does not disturb iteration
			$list = [];
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Elements that are already detached are skipped but still reported
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( [
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			] );
		}
		$html = $replacements->replace( $html );

		// Just in case the conversion in getDoc() above used named
		// entities that aren't known to html_entity_decode().
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );

		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML.  Note that if you want to call this
	 * both without an element and with an element you should call it without an element first.  If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (getDoc() was not called), the original HTML string is
	 * used as-is and $element is ignored.
	 *
	 * @param DOMElement|string|null $element Element, or ID of the element, to get HTML from;
	 *   null to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				// Replace the whole body content with just the requested element
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				$nodesArray = [];
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();

			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( $this->elementsToFlatten !== [] ) {
			// Strip opening and closing tags of the flattened elements, keeping their content
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * @param string $selector CSS selector to parse
	 * @param string &$type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string &$rawName The raw name of the selector
	 * @return bool Whether the selector was successfully recognised (always true;
	 *   unrecognised selectors throw instead)
	 * @throws MWException
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPos = strpos( $selector, '.' );

		if ( $dotPos === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPos !== false ) {
			// A dot somewhere after the first character: <tag>.class
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Lists of raw names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS'
	 * @throws MWException If a queued selector is not recognised by parseSelector()
	 */
	protected function parseItemsToRemove() {
		$removals = [
			'ID' => [],
			'TAG' => [],
			'CLASS' => [],
			'TAG_CLASS' => [],
		];

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}
367