UrlFilter::validateUrl() - Code Metrics - Inspection of "BuiltInFilters: reorganized filters into separate..." - s9e/TextFormatter - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 803d3c...9ccb75 )

by Josh

created 2017-11-24 02:34 UTC

UrlFilter::validateUrl() C

↳ Parent: UrlFilter

Complexity

Conditions	13
Paths	8

Size

Total Lines	40
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	15
CRAP Score	13

Importance

Changes

Metric	Value
dl	0
loc	40
ccs	15
cts	15
cp	1
rs	5.1234
c	0
b	0
f	0
cc	13
eloc	15
nc	8
nop	2
crap	13

How to fix Complexity

<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2017 The s9e Authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Parser\AttributeFilters;

use s9e\TextFormatter\Parser\Logger;

class UrlFilter
{
	/**
	* Filter a URL
	*
	* Trims and parses the value, validates the parsed parts against the URL config, then
	* rebuilds a normalized and sanitized URL from those parts. If validation fails and a logger
	* was given, the error is logged with the parsed parts plus the original value as context.
	*
	* @param  mixed  $attrValue Original URL
	* @param  array  $urlConfig URL config. Must contain an "allowedSchemes" regexp and may
	*                           contain "disallowedHosts" and "restrictedHosts" regexps
	* @param  Logger $logger    Parser's logger
	* @return mixed             Cleaned up URL if valid, FALSE otherwise
	*/
	public static function filter($attrValue, array $urlConfig, Logger $logger = null)
	{
		/**
		* Trim the URL to conform with HTML5 then parse it
		* @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
		*/
		$p = self::parseUrl(trim($attrValue));

		$error = self::validateUrl($urlConfig, $p);
		if (!empty($error))
		{
			if (isset($logger))
			{
				// Expose the untouched value alongside the parsed parts in the log context
				$p['attrValue'] = $attrValue;
				$logger->err($error, $p);
			}

			return false;
		}

		return self::rebuildUrl($p);
	}

	/**
	* Parse a URL and return its components
	*
	* Similar to PHP's own parse_url() except that all parts are always returned. Missing parts
	* are returned as empty strings. The scheme is lowercased and the host has its label
	* separators normalized, its trailing dots removed and, where possible, is punycoded.
	*
	* @param  string $url Original URL
	* @return array       Strings keyed by scheme, user, pass, host, port, path, query, fragment
	*/
	protected static function parseUrl($url)
	{
		$regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';

		// NOTE: this regexp always matches because of the last three captures
		preg_match($regexp, $url, $m);

		$parts  = [];
		$tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
		foreach ($tokens as $i => $name)
		{
			// Trailing captures that did not participate in the match are absent from $m
			$parts[$name] = (isset($m[$i + 1])) ? $m[$i + 1] : '';
		}

		/**
		* @link http://tools.ietf.org/html/rfc3986#section-3.1
		*
		* 'An implementation should accept uppercase letters as equivalent to lowercase in
		* scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
		* should only produce lowercase scheme names for consistency.'
		*/
		$parts['scheme'] = strtolower($parts['scheme']);

		/**
		* Normalize the domain label separators and remove trailing dots
		* @link http://url.spec.whatwg.org/#domain-label-separators
		*/
		$parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');

		// Test whether host has non-ASCII characters and punycode it if possible
		if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
		{
			$variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
			$ascii   = idn_to_ascii($parts['host'], 0, $variant);

			// idn_to_ascii() returns FALSE on failure. Keep the original string in that case so
			// that every part remains a string; the hostname pattern used in validateUrl() does
			// not match non-ASCII characters
			if ($ascii !== false)
			{
				$parts['host'] = $ascii;
			}
		}

		return $parts;
	}

	/**
	* Rebuild a parsed URL
	*
	* Credentials are re-encoded, percent-encodings are uppercased and the path, query and
	* fragment are passed through sanitizeUrl() before being appended.
	*
	* @param  array  $p Parsed URL, as returned by parseUrl()
	* @return string
	*/
	protected static function rebuildUrl(array $p)
	{
		$url = '';
		if ($p['scheme'] !== '')
		{
			$url .= $p['scheme'] . ':';
		}
		if ($p['host'] === '')
		{
			// Allow the file: scheme to not have a host and ensure it starts with slashes
			if ($p['scheme'] === 'file')
			{
				$url .= '//';
			}
		}
		else
		{
			$url .= '//';

			// Add the credentials if applicable
			if ($p['user'] !== '')
			{
				// Reencode the credentials in case there are invalid chars in them, or suspicious
				// characters such as : or @ that could confuse a browser into connecting to the
				// wrong host (or at least, to a host that is different than the one we thought)
				$url .= rawurlencode(urldecode($p['user']));

				if ($p['pass'] !== '')
				{
					$url .= ':' . rawurlencode(urldecode($p['pass']));
				}

				$url .= '@';
			}

			$url .= $p['host'];

			// Append the port number (note that as per the regexp it can only contain digits)
			if ($p['port'] !== '')
			{
				$url .= ':' . $p['port'];
			}
		}

		// Build the path, including the query and fragment parts
		$path = $p['path'] . $p['query'] . $p['fragment'];

		/**
		* "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
		* for all percent- encodings."
		*
		* Only a percent sign followed by exactly two hex digits is a percent-encoding. Anything
		* else, such as the "%d" in "50%discount", is literal text whose case must be preserved
		*
		* @link http://tools.ietf.org/html/rfc3986#section-2.1
		*/
		$path = preg_replace_callback(
			'/%[0-9A-Fa-f]{2}/',
			function ($m)
			{
				return strtoupper($m[0]);
			},
			$path
		);

		// Append the sanitized path to the URL
		$url .= self::sanitizeUrl($path);

		// Replace a colon located before the first slash if there's no scheme, as it could
		// potentially be interpreted as the scheme separator
		if (!$p['scheme'])
		{
			$url = preg_replace('#^([^/]*):#', '$1%3A', $url);
		}

		return $url;
	}

	/**
	* Sanitize a URL for safe use regardless of context
	*
	* This method URL-encodes some sensitive characters in case someone would want to use the URL in
	* some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
	* of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
	* followed by two hex digits.
	*
	* " and ' to prevent breaking out of quotes (JavaScript or otherwise)
	* ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
	* < and > to prevent breaking out of <script>
	* \r and \n because they're illegal in JavaScript
	* [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
	* Non-ASCII characters as per RFC 3986
	* Control codes and spaces, as per RFC 3986
	*
	* @link http://sla.ckers.org/forum/read.php?2,51478
	* @link http://timelessrepo.com/json-isnt-a-javascript-subset
	* @link http://www.ietf.org/rfc/rfc3986.txt
	* @link http://stackoverflow.com/a/1547922
	* @link http://tools.ietf.org/html/rfc3986#appendix-A
	*
	* @param  string $url Original URL
	* @return string      Sanitized URL
	*/
	public static function sanitizeUrl($url)
	{
		// The pattern has no "u" flag, so multibyte characters are encoded one byte at a time
		return preg_replace_callback(
			'/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z]/S',
			function ($m)
			{
				return rawurlencode($m[0]);
			},
			$url
		);
	}

	/**
	* Validate a parsed URL
	*
	* Checks, in order: that the scheme (if any) matches the "allowedSchemes" regexp, that a host
	* is present unless the URL is schemeless or uses the file: scheme, that the host is a valid
	* hostname or IP address, and that the host passes the optional "disallowedHosts" and
	* "restrictedHosts" regexps.
	*
	* @param  array      $urlConfig URL config
	* @param  array      $p         Parsed URL
	* @return string|null           Error message if invalid, or NULL
	*/
	protected static function validateUrl(array $urlConfig, array $p)
	{
		if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
		{
			return 'URL scheme is not allowed';
		}

		if ($p['host'] === '')
		{
			// Reject malformed URLs such as http:///example.org but allow schemeless paths
			if ($p['scheme'] !== 'file' && $p['scheme'] !== '')
			{
				return 'Missing host';
			}
		}
		else
		{
			/**
			* Test whether the host is valid
			* @link http://tools.ietf.org/html/rfc1035#section-2.3.1
			* @link http://tools.ietf.org/html/rfc1123#section-2
			*/
			$regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
			if (!preg_match($regexp, $p['host']))
			{
				// If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
				if (!NetworkFilter::filterIpv4($p['host'])
				 && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
				{
					return 'URL host is invalid';
				}
			}

			// A host is refused if it is blacklisted, or if a whitelist exists and it's not on it
			if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
			 || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
			{
				return 'URL host is not allowed';
			}
		}

		return null;
	}
}

1		<?php
2
3		/**
4		* @package s9e\TextFormatter
5		* @copyright Copyright (c) 2010-2017 The s9e Authors
6		* @license http://www.opensource.org/licenses/mit-license.php The MIT License
7		*/
8		namespace s9e\TextFormatter\Parser\AttributeFilters;
9
10		use s9e\TextFormatter\Parser\Logger;
11
12		class UrlFilter
13		{
14		/**
15		* Filter a URL
16		*
17		* @param mixed $attrValue Original URL
18		* @param array $urlConfig URL config
19		* @param Logger $logger Parser's logger
20		* @return mixed Cleaned up URL if valid, FALSE otherwise
21		*/
22	59	public static function filter($attrValue, array $urlConfig, Logger $logger = null)
23		{
24		/**
25		* Trim the URL to conform with HTML5 then parse it
26		* @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
27		*/
28	59	$p = self::parseUrl(trim($attrValue));
29
30	59	$error = self::validateUrl($urlConfig, $p);
31	59	if (!empty($error))
32		{
33	21	if (isset($logger))
34		{
35	20	$p['attrValue'] = $attrValue;
36	20	$logger->err($error, $p);
37		}
38
39	21	return false;
40		}
41
42	39	return self::rebuildUrl($p);
43		}
44
45		/**
46		* Parse a URL and return its components
47		*
48		* Similar to PHP's own parse_url() except that all parts are always returned
49		*
50		* @param string $url Original URL
51		* @return array
52		*/
53	59	protected static function parseUrl($url)
54		{
55	59	$regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';
56
57		// NOTE: this regexp always matches because of the last three captures
58	59	preg_match($regexp, $url, $m);
59
60	59	$parts = [];
61	59	$tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
62	59	foreach ($tokens as $i => $name)
63		{
64	59	$parts[$name] = (isset($m[$i + 1])) ? $m[$i + 1] : '';
65		}
66
67		/**
68		* @link http://tools.ietf.org/html/rfc3986#section-3.1
69		*
70		* 'An implementation should accept uppercase letters as equivalent to lowercase in
71		* scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
72		* should only produce lowercase scheme names for consistency.'
73		*/
74	59	$parts['scheme'] = strtolower($parts['scheme']);
75
76		/**
77		* Normalize the domain label separators and remove trailing dots
78		* @link http://url.spec.whatwg.org/#domain-label-separators
79		*/
80	59	$parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');
81
82		// Test whether host has non-ASCII characters and punycode it if possible
83	59	if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
84		{
85	3	$variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
86	3	$parts['host'] = idn_to_ascii($parts['host'], 0, $variant);
87		}
88
89	59	return $parts;
90		}
91
92		/**
93		* Rebuild a parsed URL
94		*
95		* @param array $p Parsed URL
96		* @return string
97		*/
98	39	protected static function rebuildUrl(array $p)
99		{
100	39	$url = '';
101	39	if ($p['scheme'] !== '')
102		{
103	29	$url .= $p['scheme'] . ':';
104		}
105	39	if ($p['host'] === '')
106		{
107		// Allow the file: scheme to not have a host and ensure it starts with slashes
108	9	if ($p['scheme'] === 'file')
109		{
110	9	$url .= '//';
111		}
112		}
113		else
114		{
115	30	$url .= '//';
116
117		// Add the credentials if applicable
118	30	if ($p['user'] !== '')
119		{
120		// Reencode the credentials in case there are invalid chars in them, or suspicious
121		// characters such as : or @ that could confuse a browser into connecting to the
122		// wrong host (or at least, to a host that is different than the one we thought)
123	2	$url .= rawurlencode(urldecode($p['user']));
124
125	2	if ($p['pass'] !== '')
126		{
127	2	$url .= ':' . rawurlencode(urldecode($p['pass']));
128		}
129
130	2	$url .= '@';
131		}
132
133	30	$url .= $p['host'];
134
135		// Append the port number (note that as per the regexp it can only contain digits)
136	30	if ($p['port'] !== '')
137		{
138	3	$url .= ':' . $p['port'];
139		}
140		}
141
142		// Build the path, including the query and fragment parts
143	39	$path = $p['path'] . $p['query'] . $p['fragment'];
144
145		/**
146		* "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
147		* for all percent- encodings."
148		*
149		* @link http://tools.ietf.org/html/rfc3986#section-2.1
150		*/
151	39	$path = preg_replace_callback(
152	39	'/%.?[a-f]/',
153	39	function ($m)
154		{
155	1	return strtoupper($m[0]);
156	39	},
157	39	$path
158		);
159
160		// Append the sanitized path to the URL
161	39	$url .= self::sanitizeUrl($path);
162
163		// Replace the first colon if there's no scheme and it could potentially be interpreted as
164		// the scheme separator
165	39	if (!$p['scheme'])
166		{
167	10	$url = preg_replace('#^([^/]*):#', '$1%3A', $url);
168		}
169
170	39	return $url;
171		}
172
173		/**
174		* Sanitize a URL for safe use regardless of context
175		*
176		* This method URL-encodes some sensitive characters in case someone would want to use the URL in
177		* some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
178		* of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
179		* followed by two hex digits.
180		*
181		* " and ' to prevent breaking out of quotes (JavaScript or otherwise)
182		* ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
183		* < and > to prevent breaking out of <script>
184		* \r and \n because they're illegal in JavaScript
185		* [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
186		* Non-ASCII characters as per RFC 3986
187		* Control codes and spaces, as per RFC 3986
188		*
189		* @link http://sla.ckers.org/forum/read.php?2,51478
190		* @link http://timelessrepo.com/json-isnt-a-javascript-subset
191		* @link http://www.ietf.org/rfc/rfc3986.txt
192		* @link http://stackoverflow.com/a/1547922
193		* @link http://tools.ietf.org/html/rfc3986#appendix-A
194		*
195		* @param string $url Original URL
196		* @return string Sanitized URL
197		*/
198	56	public static function sanitizeUrl($url)
199		{
200	56	return preg_replace_callback(
201	56	'/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z]/S',
202	56	function ($m)
203		{
204	26	return rawurlencode($m[0]);
205	56	},
206	56	$url
207		);
208		}
209
210		/**
211		* Validate a parsed URL
212		*
213		* @param array $urlConfig URL config
214		* @param array $p Parsed URL
215		* @return string|null Error message if invalid, or NULL
216		*/
217	59	protected static function validateUrl(array $urlConfig, array $p)
218		{
219	59	if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
220		{
221	3	return 'URL scheme is not allowed';
222		}
223
224	57	if ($p['host'] === '')
225		{
226		// Reject malformed URLs such as http:///example.org but allow schemeless paths
227	13	if ($p['scheme'] !== 'file' && $p['scheme'] !== '')
228		{
229	13	return 'Missing host';
230		}
231		}
232		else
233		{
234		/**
235		* Test whether the host is valid
236		* @link http://tools.ietf.org/html/rfc1035#section-2.3.1
237		* @link http://tools.ietf.org/html/rfc1123#section-2
238		*/
239	44	$regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
240	44	if (!preg_match($regexp, $p['host']))
241		{
242		// If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
243	2	if (!NetworkFilter::filterIpv4($p['host'])
244	2	&& !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
245		{
246	1	return 'URL host is invalid';
247		}
248		}
249
250	43	if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
251	43	|| (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
252		{
253	14	return 'URL host is not allowed';
254		}
255		}
256		}
257		}

s9e / TextFormatter

Push — master ( 803d3c...9ccb75 )

UrlFilter::validateUrl() C

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Complexity

Long Method

Duplication Side-by-Side

Filter issues like