Completed
Push — master ( 0b2951...459fb4 )
by Daniel
12:39
created

HTMLEditorSanitiser::attributeMatchesRule()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 10
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 4
eloc 4
c 1
b 1
f 0
nc 3
nop 2
dl 0
loc 10
rs 9.2
1
<?php
2
3
/**
 * Sanitises an HTMLValue so its contents are limited to the elements and attributes that are
 * whitelisted using the same configuration as TinyMCE
 *
 * See http://www.tinymce.com/wiki.php/configuration:valid_elements for details on the spec of
 * TinyMCE's whitelist configuration
 *
 * @package forms
 * @subpackage fields-formattedinput
 */
class HTMLEditorSanitiser {

	/** @var stdClass[] - $element => $rule hash for whitelist element rules where the element name isn't a pattern */
	protected $elements = array();

	/** @var stdClass[] - Sequential list of whitelist element rules where the element name is a pattern */
	protected $elementPatterns = array();

	/** @var stdClass[] - $attribute => $rule hash of attributes that apply to all further whitelisted elements added */
	protected $globalAttributes = array();

	/**
	 * Construct a sanitiser from a given HTMLEditorConfig
	 *
	 * Note that we build data structures from the current state of HTMLEditorConfig - later changes to
	 * the passed instance won't cause this instance to update its whitelist
	 *
	 * @param HTMLEditorConfig $config
	 */
	public function __construct(HTMLEditorConfig $config) {
		// 'valid_elements' is processed before 'extended_valid_elements' so that a global ('@')
		// rule in the former applies to elements declared in the latter
		foreach(array('valid_elements', 'extended_valid_elements') as $option) {
			$valid = $config->getOption($option);
			if($valid) $this->addValidElements($valid);
		}
	}

	/**
	 * Given a TinyMCE pattern (close to unix glob style), create a regex that does the match
	 *
	 * Each of the quantifier characters ?, + and * is prefixed with "." and the result is
	 * anchored at both ends. Every other character is passed through to the regex unchanged.
	 *
	 * @param string $str - The TinyMCE pattern
	 * @return string - The equivalent regex
	 */
	protected function patternToRegex($str) {
		return '/^' . preg_replace('/([?+*])/', '.$1', $str) . '$/';
	}

	/**
	 * Given a valid_elements string, parse out the actual element and attribute rules and add to the
	 * internal whitelist
	 *
	 * Logic based heavily on javascript version from tiny_mce_src.js
	 *
	 * @param string $validElements - The valid_elements or extended_valid_elements string to add to the whitelist
	 */
	protected function addValidElements($validElements) {
		$elementRuleRegExp = '/^([#+\-])?([^\[\/]+)(?:\/([^\[]+))?(?:\[([^\]]+)\])?$/';
		$attrRuleRegExp = '/^([!\-])?(\w+::\w+|[^=:<]+)?(?:([=:<])(.*))?$/';
		$hasPatternsRegExp = '/[*?+]/';

		foreach(explode(',', $validElements) as $validElement) {
			if(!preg_match($elementRuleRegExp, $validElement, $matches)) continue;

			// preg_match omits trailing unmatched groups, but reports unmatched groups that
			// precede a matched one as '' - hence the truthiness (not null) checks further down
			$elementPrefix = isset($matches[1]) ? $matches[1] : null;
			$elementName = isset($matches[2]) ? $matches[2] : null;
			$outputName = isset($matches[3]) ? $matches[3] : null;
			$attrData = isset($matches[4]) ? $matches[4] : null;

			// Create the new element
			$element = new stdClass();
			$element->attributes = array();
			$element->attributePatterns = array();

			$element->attributesRequired = array();
			$element->attributesDefault = array();
			$element->attributesForced = array();

			$element->paddEmpty = ($elementPrefix === '#');
			$element->removeEmpty = ($elementPrefix === '-');

			// Copy attributes from global rule into current rule
			if(!empty($this->globalAttributes)) {
				$element->attributes = array_merge($element->attributes, $this->globalAttributes);
			}

			// Attributes defined
			if($attrData) {
				foreach(explode('|', $attrData) as $attrSpec) {
					if(!preg_match($attrRuleRegExp, $attrSpec, $attrMatches)) continue;

					$attrType = isset($attrMatches[1]) ? $attrMatches[1] : null;
					$attrName = isset($attrMatches[2]) ? str_replace('::', ':', $attrMatches[2]) : null;
					$valueType = isset($attrMatches[3]) ? $attrMatches[3] : null;
					$value = isset($attrMatches[4]) ? $attrMatches[4] : null;

					// Denied from global - drop the inherited rule and move on
					if($attrType === '-') {
						unset($element->attributes[$attrName]);
						continue;
					}

					$attr = new stdClass();

					// Required
					if($attrType === '!') {
						$element->attributesRequired[] = $attrName;
						$attr->required = true;
					}

					// Default value
					if($valueType === '=') {
						$element->attributesDefault[$attrName] = $value;
						$attr->defaultValue = $value;
					}
					// Forced value
					else if($valueType === ':') {
						$element->attributesForced[$attrName] = $value;
						$attr->forcedValue = $value;
					}
					// Restricted set of valid values, separated by "?"
					else if($valueType === '<') {
						$attr->validValues = explode('?', $value);
					}

					// Check for attribute patterns
					if(preg_match($hasPatternsRegExp, $attrName)) {
						$attr->pattern = $this->patternToRegex($attrName);
						$element->attributePatterns[] = $attr;
					}
					else {
						$element->attributes[$attrName] = $attr;
					}
				}
			}

			// Global rule, store away these for later usage (only the first non-empty one is kept)
			if(empty($this->globalAttributes) && $elementName == '@') {
				$this->globalAttributes = $element->attributes;
			}

			// Handle substitute elements such as b/strong
			if($outputName) {
				$element->outputName = $elementName;
				$this->elements[$outputName] = $element;
			}

			// Add pattern or exact element
			if(preg_match($hasPatternsRegExp, $elementName)) {
				$element->pattern = $this->patternToRegex($elementName);
				$this->elementPatterns[] = $element;
			}
			else {
				$this->elements[$elementName] = $element;
			}
		}
	}

	/**
	 * Given an element tag, return the rule structure for that element
	 *
	 * Exact-name rules take precedence; otherwise the first pattern rule that matches is used.
	 *
	 * @param string $tag - The element tag
	 * @return stdClass|null - The element rule, or null if no rule covers the tag
	 */
	protected function getRuleForElement($tag) {
		if(isset($this->elements[$tag])) return $this->elements[$tag];

		foreach($this->elementPatterns as $element) {
			if(preg_match($element->pattern, $tag)) return $element;
		}

		return null;
	}

	/**
	 * Given an element rule and an attribute name, return the rule structure for that attribute
	 *
	 * Exact-name rules take precedence; otherwise the first pattern rule that matches is used.
	 *
	 * @param stdClass $elementRule - The rule of the element the attribute belongs to
	 * @param string $name - The attribute name
	 * @return stdClass|null - The attribute rule, or null if no rule covers the attribute
	 */
	protected function getRuleForAttribute($elementRule, $name) {
		if(isset($elementRule->attributes[$name])) return $elementRule->attributes[$name];

		foreach($elementRule->attributePatterns as $attribute) {
			if(preg_match($attribute->pattern, $name)) return $attribute;
		}

		return null;
	}

	/**
	 * Given a DOMElement and an element rule, check if that element passes the rule
	 *
	 * @param DOMElement $element - the element to check
	 * @param stdClass $rule - the rule to check against
	 * @return bool - true if the element passes (and so can be kept), false if it fails (and so needs stripping)
	 */
	protected function elementMatchesRule($element, $rule = null) {
		// If the rule doesn't exist at all, the element isn't allowed
		if(!$rule) return false;

		// If the rule has attributes required, check them to see if this element has at least one
		if(!empty($rule->attributesRequired)) {
			$hasMatch = false;

			foreach($rule->attributesRequired as $attr) {
				// Compare against '' rather than relying on truthiness, so that a value of "0"
				// counts as present. Missing and empty attributes both read back as ''.
				if($element->getAttribute($attr) !== '') {
					$hasMatch = true;
					break;
				}
			}

			if(!$hasMatch) return false;
		}

		// If the rule says to remove empty elements, and this element is empty, remove it
		if($rule->removeEmpty && !$element->firstChild) return false;

		// No further tests required, element passes
		return true;
	}

	/**
	 * Given a DOMAttr and an attribute rule, check if that attribute passes the rule
	 *
	 * @param DOMAttr $attr - the attribute to check
	 * @param stdClass $rule - the rule to check against
	 * @return bool - true if the attribute passes (and so can be kept), false if it fails (and so needs stripping)
	 */
	protected function attributeMatchesRule($attr, $rule = null) {
		// If the rule doesn't exist at all, the attribute isn't allowed
		if(!$rule) return false;

		// If the rule has a set of valid values, check them to see if this attribute is one.
		// Strict comparison: loose comparison treats numeric strings such as "01" and "1" as equal,
		// which would let values through that are not literally in the whitelist.
		if(isset($rule->validValues) && !in_array($attr->value, $rule->validValues, true)) return false;

		// No further tests required, attribute passes
		return true;
	}

	/**
	 * Given an SS_HTMLValue instance, will remove any elements and attributes that are
	 * not explicitly included in the whitelist passed to __construct on instance creation
	 *
	 * If the whitelist is completely empty, the value is left untouched.
	 *
	 * @param SS_HTMLValue $html - The HTMLValue to remove any non-whitelisted elements & attributes from
	 */
	public function sanitise (SS_HTMLValue $html) {
		if(empty($this->elements) && empty($this->elementPatterns)) return;

		$doc = $html->getDocument();

		foreach($html->query('//body//*') as $el) {
			$elementRule = $this->getRuleForElement($el->tagName);

			// If this element isn't allowed, strip it
			if(!$this->elementMatchesRule($el, $elementRule)) {
				// If it's a script or style, we don't keep contents
				if($el->tagName === 'script' || $el->tagName === 'style') {
					$el->parentNode->removeChild($el);
				}
				// Otherwise we replace this node with all its children
				else {
					// First, create a new fragment with all of $el's children moved into it
					$frag = $doc->createDocumentFragment();
					while($el->firstChild) $frag->appendChild($el->firstChild);

					// Then replace $el with the frag's contents (which used to be its children)
					$el->parentNode->replaceChild($frag, $el);
				}

				continue;
			}

			// Otherwise tidy the element

			// First, if we're supposed to pad & this element is empty, fix that
			if($elementRule->paddEmpty && !$el->firstChild) {
				$el->nodeValue = '&nbsp;';
			}

			// Then filter out any non-whitelisted attributes. Walk the list backwards so that
			// removing an attribute doesn't shift the index of the ones still to be visited.
			$attributes = $el->attributes;
			$i = $attributes->length;
			while($i--) {
				$attr = $attributes->item($i);
				$attributeRule = $this->getRuleForAttribute($elementRule, $attr->name);

				// If this attribute isn't allowed, strip it
				if(!$this->attributeMatchesRule($attr, $attributeRule)) {
					$el->removeAttributeNode($attr);
				}
			}

			// Then enforce any default attributes. Compare against '' rather than relying on
			// truthiness, so that an existing value of "0" is not overwritten by the default.
			foreach($elementRule->attributesDefault as $name => $default) {
				if($el->getAttribute($name) === '') $el->setAttribute($name, $default);
			}

			// And any forced attributes
			foreach($elementRule->attributesForced as $name => $forced) {
				$el->setAttribute($name, $forced);
			}
		}
	}

}
306