Completed
Push — master ( 803d3c...9ccb75 )
by Josh
12:51
created

UrlFilter::rebuildUrl()   C

Complexity

Conditions 8
Paths 32

Size

Total Lines 74
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 8

Importance

Changes 0
Metric Value
dl 0
loc 74
ccs 27
cts 27
cp 1
rs 6.2894
c 0
b 0
f 0
cc 8
eloc 27
nc 32
nop 1
crap 8

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

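Commonly applied refactorings include Extract Method. If many parameters or temporary variables are present, also consider Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.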

1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Parser\AttributeFilters;
9
10
use s9e\TextFormatter\Parser\Logger;
11
12
class UrlFilter
{
	/**
	* Filter a URL
	*
	* The value is trimmed and parsed into its components, the components are validated against
	* the URL config, and a normalized URL is rebuilt from them.
	*
	* @param  mixed  $attrValue Original URL
	* @param  array  $urlConfig URL config. Must contain an "allowedSchemes" regexp and may contain
	*                           "disallowedHosts" and "restrictedHosts" regexps
	* @param  Logger $logger    Parser's logger, or NULL if errors should not be logged
	* @return mixed             Cleaned up URL if valid, FALSE otherwise
	*/
	public static function filter($attrValue, array $urlConfig, Logger $logger = null)
	{
		/**
		* Trim the URL to conform with HTML5 then parse it
		* @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
		*/
		$p = self::parseUrl(trim($attrValue));

		$error = self::validateUrl($urlConfig, $p);
		if (!empty($error))
		{
			if (isset($logger))
			{
				// Log the parsed components along with the untouched original value
				$p['attrValue'] = $attrValue;
				$logger->err($error, $p);
			}

			return false;
		}

		return self::rebuildUrl($p);
	}

	/**
	* Parse a URL and return its components
	*
	* Similar to PHP's own parse_url() except that all parts are always returned. Parts that are
	* missing from the URL are returned as empty strings. The scheme is lowercased and the host
	* has its label separators normalized, its trailing dots removed, and is converted to
	* punycode if it contains non-ASCII characters and idn_to_ascii() is available.
	*
	* @param  string $url Original URL
	* @return array       Strings indexed by "scheme", "user", "pass", "host", "port", "path",
	*                     "query" and "fragment"
	*/
	protected static function parseUrl($url)
	{
		$regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';

		// NOTE: this regexp always matches because of the last three captures
		preg_match($regexp, $url, $m);

		$parts  = [];
		$tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
		foreach ($tokens as $i => $name)
		{
			// Capture N+1 holds token N. Trailing captures that did not participate in the match
			// are absent from $m, hence the isset() test
			$parts[$name] = (isset($m[$i + 1])) ? $m[$i + 1] : '';
		}

		/**
		* @link http://tools.ietf.org/html/rfc3986#section-3.1
		*
		* 'An implementation should accept uppercase letters as equivalent to lowercase in
		* scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
		* should only produce lowercase scheme names for consistency.'
		*/
		$parts['scheme'] = strtolower($parts['scheme']);

		/**
		* Normalize the domain label separators and remove trailing dots
		* @link http://url.spec.whatwg.org/#domain-label-separators
		*/
		$parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');

		// Test whether host has non-ASCII characters and punycode it if possible
		if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
		{
			$variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
			$host    = idn_to_ascii($parts['host'], 0, $variant);

			// idn_to_ascii() returns FALSE on failure. Keep the original host in that case so that
			// every part remains a string. The host regexp in validateUrl() only matches ASCII
			// characters, so the unconverted host does not pass as a regular hostname
			if ($host !== false)
			{
				$parts['host'] = $host;
			}
		}

		return $parts;
	}

	/**
	* Rebuild a parsed URL
	*
	* @param  array  $p Parsed URL, as returned by parseUrl()
	* @return string
	*/
	protected static function rebuildUrl(array $p)
	{
		$url = '';
		if ($p['scheme'] !== '')
		{
			$url .= $p['scheme'] . ':';
		}
		$url .= self::rebuildAuthority($p);

		// Build the path, including the query and fragment parts, then append its sanitized form
		$path = self::normalizePercentEncoding($p['path'] . $p['query'] . $p['fragment']);
		$url .= self::sanitizeUrl($path);

		// Replace a colon located before the first slash if there's no scheme and it could
		// potentially be interpreted as the scheme separator
		if ($p['scheme'] === '')
		{
			$url = preg_replace('#^([^/]*):#', '$1%3A', $url);
		}

		return $url;
	}

	/**
	* Rebuild the authority part of a parsed URL, including its leading slashes
	*
	* @param  array  $p Parsed URL
	* @return string    "//" followed by the credentials, host and port if there is a host, "//" if
	*                   there is no host and the scheme is "file", or an empty string otherwise
	*/
	private static function rebuildAuthority(array $p)
	{
		if ($p['host'] === '')
		{
			// Allow the file: scheme to not have a host and ensure it starts with slashes
			return ($p['scheme'] === 'file') ? '//' : '';
		}

		$authority = '//';

		// Add the credentials if applicable
		if ($p['user'] !== '')
		{
			// Reencode the credentials in case there are invalid chars in them, or suspicious
			// characters such as : or @ that could confuse a browser into connecting to the
			// wrong host (or at least, to a host that is different than the one we thought)
			$authority .= rawurlencode(urldecode($p['user']));
			if ($p['pass'] !== '')
			{
				$authority .= ':' . rawurlencode(urldecode($p['pass']));
			}
			$authority .= '@';
		}

		$authority .= $p['host'];

		// Append the port number (note that as per the regexp it can only contain digits)
		if ($p['port'] !== '')
		{
			$authority .= ':' . $p['port'];
		}

		return $authority;
	}

	/**
	* Uppercase the hexadecimal digits of percent-encoded triplets
	*
	* "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
	* for all percent- encodings."
	*
	* Only a percent sign followed by two hex digits is touched. Any other text, including text
	* that follows a stray percent sign, is left as is.
	*
	* @link http://tools.ietf.org/html/rfc3986#section-2.1
	*
	* @param  string $str Original string
	* @return string      String with uppercased percent-encodings
	*/
	private static function normalizePercentEncoding($str)
	{
		return preg_replace_callback(
			'/%[0-9A-Fa-f]{2}/',
			function ($m)
			{
				return strtoupper($m[0]);
			},
			$str
		);
	}

	/**
	* Sanitize a URL for safe use regardless of context
	*
	* This method URL-encodes some sensitive characters in case someone would want to use the URL in
	* some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
	* of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
	* followed by two hex digits.
	*
	* " and ' to prevent breaking out of quotes (JavaScript or otherwise)
	* ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
	* < and > to prevent breaking out of <script>
	* \r and \n because they're illegal in JavaScript
	* [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
	* Non-ASCII characters as per RFC 3986
	* Control codes and spaces, as per RFC 3986
	*
	* @link http://sla.ckers.org/forum/read.php?2,51478
	* @link http://timelessrepo.com/json-isnt-a-javascript-subset
	* @link http://www.ietf.org/rfc/rfc3986.txt
	* @link http://stackoverflow.com/a/1547922
	* @link http://tools.ietf.org/html/rfc3986#appendix-A
	*
	* @param  string $url Original URL
	* @return string      Sanitized URL
	*/
	public static function sanitizeUrl($url)
	{
		return preg_replace_callback(
			'/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z]/S',
			function ($m)
			{
				return rawurlencode($m[0]);
			},
			$url
		);
	}

	/**
	* Validate a parsed URL
	*
	* @param  array      $urlConfig URL config. Its "allowedSchemes" regexp is required, its
	*                               "disallowedHosts" and "restrictedHosts" regexps are optional
	* @param  array      $p         Parsed URL
	* @return string|null           Error message if invalid, or NULL
	*/
	protected static function validateUrl(array $urlConfig, array $p)
	{
		if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
		{
			return 'URL scheme is not allowed';
		}

		if ($p['host'] === '')
		{
			// Reject malformed URLs such as http:///example.org but allow schemeless paths
			if ($p['scheme'] !== 'file' && $p['scheme'] !== '')
			{
				return 'Missing host';
			}

			return null;
		}

		/**
		* Test whether the host is valid
		* @link http://tools.ietf.org/html/rfc1035#section-2.3.1
		* @link http://tools.ietf.org/html/rfc1123#section-2
		*/
		$regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
		if (!preg_match($regexp, $p['host']))
		{
			// If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
			if (!NetworkFilter::filterIpv4($p['host'])
			 && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
			{
				return 'URL host is invalid';
			}
		}

		// A host is rejected if it matches the disallowed list, or if a restricted list exists
		// and the host does not match it
		if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
		 || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
		{
			return 'URL host is not allowed';
		}

		return null;
	}
}