Completed
Push — add/amp-pwa-experiment ( efea12 )
by
unknown
11:53
created

AMP_Blacklist_Sanitizer::validate_a_node()   C

Complexity

Conditions 8
Paths 9

Size

Total Lines 40
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 21
nc 9
nop 1
dl 0
loc 40
rs 5.3846
c 0
b 0
f 0
1
<?php
2
3
require_once( AMP__ROOT__ . '/includes/sanitizers/class-amp-base-sanitizer.php' );
4
5
/**
 * Strips blacklisted tags and attributes from content.
 *
 * See following for blacklist:
 *     https://github.com/ampproject/amphtml/blob/master/spec/amp-html-format.md#html-tags
 *
 * The sanitizer works in two passes over the body node: first every
 * blacklisted tag is removed outright, then the remaining tree is walked to
 * drop blacklisted attributes, inline event handlers, <font> wrappers and
 * links whose href does not validate.
 */
class AMP_Blacklist_Sanitizer extends AMP_Base_Sanitizer {
	/**
	 * Matches the "wp-att-<id>" token found in the rel attribute of links.
	 */
	const PATTERN_REL_WP_ATTACHMENT = '#wp-att-([\d]+)#';

	/**
	 * Argument keys through which callers can extend the built-in blacklists.
	 *
	 * NOTE(review): presumably merged into $this->args by the base class;
	 * that code is not part of this file.
	 *
	 * @var array
	 */
	protected $DEFAULT_ARGS = array(
		'add_blacklisted_protocols' => array(),
		'add_blacklisted_tags' => array(),
		'add_blacklisted_attributes' => array(),
	);

	/**
	 * Removes blacklisted tags, then blacklisted attributes, from the body node.
	 *
	 * The body node comes from get_body_node(), which is defined outside this
	 * class (presumably on the base sanitizer).
	 */
	public function sanitize() {
		$blacklisted_tags = $this->get_blacklisted_tags();
		$blacklisted_attributes = $this->get_blacklisted_attributes();
		$blacklisted_protocols = $this->get_blacklisted_protocols();

		$body = $this->get_body_node();
		$this->strip_tags( $body, $blacklisted_tags );
		$this->strip_attributes_recursive( $body, $blacklisted_attributes, $blacklisted_protocols );
	}

	/**
	 * Sanitizes an element and all of its descendants.
	 *
	 * Non-element nodes are ignored. <font> elements and <a> elements that fail
	 * validate_a_node() are replaced by their (sanitized) children. For every
	 * other element the blacklisted attributes and on* handlers are removed and
	 * <a> attributes get extra clean-up via sanitize_a_attribute().
	 *
	 * @param DOMNode $node           Node to sanitize.
	 * @param array   $bad_attributes Lowercase attribute names to remove.
	 * @param array   $bad_protocols  Blacklisted protocols. Only passed along; nothing
	 *                                in this class currently reads the list.
	 */
	private function strip_attributes_recursive( $node, $bad_attributes, $bad_protocols ) {
		if ( XML_ELEMENT_NODE !== $node->nodeType ) {
			return;
		}

		$node_name = $node->nodeName;

		// Some nodes may contain valid content but are themselves invalid.
		// Remove the node but preserve the children.
		if ( 'font' === $node_name
			|| ( 'a' === $node_name && false === $this->validate_a_node( $node ) ) ) {
			$this->replace_node_with_children( $node, $bad_attributes, $bad_protocols );
			return;
		}

		if ( $node->hasAttributes() ) {
			// Walk backwards so that removing an attribute does not shift the
			// indexes that are still to be visited.
			$length = $node->attributes->length;
			for ( $i = $length - 1; $i >= 0; $i-- ) {
				$attribute = $node->attributes->item( $i );
				$attribute_name = strtolower( $attribute->name );
				if ( in_array( $attribute_name, $bad_attributes, true ) ) {
					$node->removeAttribute( $attribute_name );
					continue;
				}

				// on* attributes (like onclick) are a special case; a bare "on" attribute is kept.
				if ( 0 === strpos( $attribute_name, 'on' ) && 'on' !== $attribute_name ) {
					$node->removeAttribute( $attribute_name );
					continue;
				} elseif ( 'a' === $node_name ) {
					$this->sanitize_a_attribute( $node, $attribute );
				}
			}
		}

		// Walk backwards as well: a child may be replaced by several nodes,
		// which only affects indexes at or after the current one.
		$length = $node->childNodes->length;
		for ( $i = $length - 1; $i >= 0; $i-- ) {
			$child_node = $node->childNodes->item( $i );

			$this->strip_attributes_recursive( $child_node, $bad_attributes, $bad_protocols );
		}
	}

	/**
	 * Removes every element with one of the given tag names below $node.
	 *
	 * When a removal leaves the parent empty (as judged by
	 * AMP_DOM_Utils::is_node_empty(), defined elsewhere) and the parent is not
	 * <body>, the parent is removed too.
	 *
	 * @param DOMNode $node      Root node to search in.
	 * @param array   $tag_names Tag names to remove.
	 */
	private function strip_tags( $node, $tag_names ) {
		foreach ( $tag_names as $tag_name ) {
			$elements = $node->getElementsByTagName( $tag_name );
			$length = $elements->length;
			if ( 0 === $length ) {
				continue;
			}

			for ( $i = $length - 1; $i >= 0; $i-- ) {
				$element = $elements->item( $i );

				// The list is live: removing an emptied parent below can take
				// other matches with it (e.g. nested blacklisted tags), so an
				// index counted earlier may no longer exist.
				if ( null === $element ) {
					continue;
				}

				$parent_node = $element->parentNode;
				$parent_node->removeChild( $element );

				if ( 'body' !== $parent_node->nodeName
					&& $parent_node->parentNode
					&& AMP_DOM_Utils::is_node_empty( $parent_node ) ) {
					$parent_node->parentNode->removeChild( $parent_node );
				}
			}
		}
	}

	/**
	 * Cleans up the rel, rev and target attributes of a link.
	 *
	 * - rel:    the wp-att-<id> token is stripped; the attribute is removed when
	 *           nothing is left.
	 * - rev:    always removed.
	 * - target: "_blank" and "_new" (any letter case) become "_blank", every
	 *           other value is removed.
	 *
	 * @param DOMElement $node      The <a> element owning the attribute.
	 * @param DOMAttr    $attribute The attribute to inspect.
	 */
	private function sanitize_a_attribute( $node, $attribute ) {
		$attribute_name = strtolower( $attribute->name );

		if ( 'rel' === $attribute_name ) {
			$old_value = $attribute->value;
			$new_value = trim( preg_replace( self::PATTERN_REL_WP_ATTACHMENT, '', $old_value ) );
			if ( empty( $new_value ) ) {
				$node->removeAttribute( $attribute_name );
			} elseif ( $old_value !== $new_value ) {
				$node->setAttribute( $attribute_name, $new_value );
			}
		} elseif ( 'rev' === $attribute_name ) {
			// rev removed from HTML5 spec, which was used by Jetpack Markdown.
			$node->removeAttribute( $attribute_name );
		} elseif ( 'target' === $attribute_name ) {
			// _blank is the only allowed value and it must be lowercase.
			// replace _new with _blank and others should simply be removed.
			$old_value = strtolower( $attribute->value );
			if ( '_blank' === $old_value || '_new' === $old_value ) {
				// _new is not allowed; swap with _blank
				$node->setAttribute( $attribute_name, '_blank' );
			} else {
				// only _blank is allowed
				$node->removeAttribute( $attribute_name );
			}
		}
	}

	/**
	 * Decides whether an <a> element may stay in the document.
	 *
	 * A link is valid when it
	 * - has no href but a non-empty name attribute (a named anchor), or
	 * - points to a fragment ("#..."), or
	 * - uses one of the allowed protocols and, except for tel: and sms:,
	 *   passes FILTER_VALIDATE_URL. Hrefs starting with "/" are prefixed with
	 *   the home URL for this check only; the attribute itself is not changed.
	 *
	 * @param DOMElement $node The <a> element.
	 * @return bool True when the link is valid, false when it should be unwrapped.
	 */
	private function validate_a_node( $node ) {
		// Get the href attribute
		$href = $node->getAttribute( 'href' );

		// If no href is set and this isn't an anchor, it's invalid
		if ( empty( $href ) ) {
			$name_attr = $node->getAttribute( 'name' );
			return ! empty( $name_attr );
		}

		// If this is an anchor link, just return true
		if ( 0 === strpos( $href, '#' ) ) {
			return true;
		}

		// If the href starts with a '/', prepend the home_url to it for validation purposes.
		if ( 0 === strpos( $href, '/' ) ) {
			$href = untrailingslashit( get_home_url() ) . $href;
		}

		$valid_protocols = array( 'http', 'https', 'mailto', 'sms', 'tel', 'viber', 'whatsapp' );
		$special_protocols = array( 'tel', 'sms' ); // these do not validate with `filter_var` + `FILTER_VALIDATE_URL`

		// The scheme is everything before the first colon. explode() is used
		// rather than strtok() because strtok() skips leading delimiters (so
		// ":tel:1" would be read as "tel") and keeps global tokenizer state.
		// URL schemes are case-insensitive, hence the lowercase comparison.
		$href_parts = explode( ':', $href, 2 );
		$protocol = strtolower( $href_parts[0] );

		if ( ! in_array( $protocol, $valid_protocols, true ) ) {
			return false;
		}

		if ( in_array( $protocol, $special_protocols, true ) ) {
			return true;
		}

		return false !== filter_var( $href, FILTER_VALIDATE_URL );
	}

	/**
	 * Removes a node from the document while keeping its sanitized children.
	 *
	 * Each child is cloned and inserted in front of $node, then sanitized, and
	 * finally $node itself is removed. Nothing happens when $node has no parent.
	 *
	 * @param DOMNode $node           Node to unwrap.
	 * @param array   $bad_attributes Passed on to strip_attributes_recursive().
	 * @param array   $bad_protocols  Passed on to strip_attributes_recursive().
	 */
	private function replace_node_with_children( $node, $bad_attributes, $bad_protocols ) {
		// If the node has children and also has a parent node,
		// clone and re-add all the children just before current node.
		if ( $node->hasChildNodes() && $node->parentNode ) {
			foreach ( $node->childNodes as $child_node ) {
				$new_child = $child_node->cloneNode( true );

				// Attach the clone before sanitizing it. A detached clone has no
				// parent, so if the clone itself has to be unwrapped (a nested
				// <font> or an invalid <a>) the unwrap would be a no-op and the
				// clone would end up in the document unsanitized.
				$node->parentNode->insertBefore( $new_child, $node );
				$this->strip_attributes_recursive( $new_child, $bad_attributes, $bad_protocols );
			}
		}

		// Remove the node from the parent, if defined.
		if ( $node->parentNode ) {
			$node->parentNode->removeChild( $node );
		}
	}

	/**
	 * Appends the caller-supplied values for $key (if any) to the defaults.
	 *
	 * @param string $key    Key to look up in $this->args.
	 * @param array  $values Built-in default values.
	 * @return array The defaults, followed by $this->args[ $key ] when that is a non-empty array.
	 */
	private function merge_defaults_with_args( $key, $values ) {
		// Merge default values with user specified args
		if ( ! empty( $this->args[ $key ] )
			&& is_array( $this->args[ $key ] ) ) {
			$values = array_merge( $values, $this->args[ $key ] );
		}

		return $values;
	}

	/**
	 * Returns the blacklisted protocols, including 'add_blacklisted_protocols' args.
	 *
	 * @return array
	 */
	private function get_blacklisted_protocols() {
		return $this->merge_defaults_with_args( 'add_blacklisted_protocols', array(
			'javascript',
		) );
	}

	/**
	 * Returns the blacklisted tag names, including 'add_blacklisted_tags' args.
	 *
	 * img, video, audio and iframe are deliberately not listed: these are
	 * converted into amp-* versions.
	 *
	 * @return array
	 */
	private function get_blacklisted_tags() {
		return $this->merge_defaults_with_args( 'add_blacklisted_tags', array(
			'script',
			'noscript',
			'style',
			'frame',
			'frameset',
			'object',
			'param',
			'applet',
			'form',
			'label',
			'input',
			'textarea',
			'select',
			'option',
			'link',
			'picture',

			// Sanitizers run after embed handlers, so if anything wasn't matched, it needs to be removed.
			'embed',
			'embedvideo',

			// Other weird ones
			'comments-count',
		) );
	}

	/**
	 * Returns the blacklisted attribute names, including 'add_blacklisted_attributes' args.
	 *
	 * @return array
	 */
	private function get_blacklisted_attributes() {
		return $this->merge_defaults_with_args( 'add_blacklisted_attributes', array(
			'style',
			'size',
			'clear',
			'align',
			'valign',
		) );
	}
}
241