1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* @package s9e\TextFormatter |
5
|
|
|
* @copyright Copyright (c) 2010-2017 The s9e Authors |
6
|
|
|
* @license http://www.opensource.org/licenses/mit-license.php The MIT License |
7
|
|
|
*/ |
8
|
|
|
namespace s9e\TextFormatter\Parser\AttributeFilters; |
9
|
|
|
|
10
|
|
|
use s9e\TextFormatter\Parser\Logger; |
11
|
|
|
|
12
|
|
|
class UrlFilter
{
    /**
    * Filter a URL
    *
    * The value is trimmed, split into its components, validated against the URL config and, if
    * valid, rebuilt into a normalized and sanitized string. If validation fails and a logger was
    * given, the error is logged along with the parsed components and the original value.
    *
    * @param  mixed  $attrValue Original URL
    * @param  array  $urlConfig URL config. 'allowedSchemes' (regexp) is always read;
    *                           'disallowedHosts' and 'restrictedHosts' (regexps) are optional
    * @param  Logger $logger    Parser's logger
    * @return mixed             Cleaned up URL if valid, FALSE otherwise
    */
    public static function filter($attrValue, array $urlConfig, Logger $logger = null)
    {
        /**
        * Trim the URL to conform with HTML5 then parse it
        * @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
        */
        $p = self::parseUrl(trim($attrValue));

        $error = self::validateUrl($urlConfig, $p);
        if (!empty($error))
        {
            if (isset($logger))
            {
                // Give the logger the untouched value along with the parsed components
                $p['attrValue'] = $attrValue;
                $logger->err($error, $p);
            }

            return false;
        }

        return self::rebuildUrl($p);
    }

    /**
    * Parse a URL and return its components
    *
    * Similar to PHP's own parse_url() except that all parts are always returned. Missing parts
    * are returned as empty strings. The scheme is lowercased and the host has its label
    * separators normalized, its trailing dots removed and is punycoded if it contains non-ASCII
    * characters and idn_to_ascii() is available.
    *
    * @param  string $url Original URL
    * @return array       Array with the keys scheme, user, pass, host, port, path, query and
    *                     fragment. The query and fragment retain their leading ? or #
    */
    protected static function parseUrl($url)
    {
        $regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';

        // NOTE: this regexp always matches because of the last three captures
        preg_match($regexp, $url, $m);

        // Map each capture to its name. Trailing captures that did not participate in the match
        // are absent from $m, hence the isset() test
        $parts  = [];
        $tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
        foreach ($tokens as $i => $name)
        {
            $parts[$name] = (isset($m[$i + 1])) ? $m[$i + 1] : '';
        }

        /**
        * @link http://tools.ietf.org/html/rfc3986#section-3.1
        *
        * 'An implementation should accept uppercase letters as equivalent to lowercase in
        * scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
        * should only produce lowercase scheme names for consistency.'
        */
        $parts['scheme'] = strtolower($parts['scheme']);

        /**
        * Normalize the domain label separators and remove trailing dots
        * @link http://url.spec.whatwg.org/#domain-label-separators
        */
        $parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');

        // Test whether host has non-ASCII characters and punycode it if possible
        if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
        {
            // Use the UTS #46 variant if this build of ext/intl defines it
            $variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;
            $parts['host'] = idn_to_ascii($parts['host'], 0, $variant);
        }

        return $parts;
    }

    /**
    * Rebuild a parsed URL
    *
    * @param  array  $p Parsed URL, as returned by parseUrl()
    * @return string
    */
    protected static function rebuildUrl(array $p)
    {
        $url = '';
        if ($p['scheme'] !== '')
        {
            $url .= $p['scheme'] . ':';
        }
        if ($p['host'] === '')
        {
            // Allow the file: scheme to not have a host and ensure it starts with slashes
            if ($p['scheme'] === 'file')
            {
                $url .= '//';
            }
        }
        else
        {
            $url .= '//';

            // Add the credentials if applicable
            if ($p['user'] !== '')
            {
                // Reencode the credentials in case there are invalid chars in them, or suspicious
                // characters such as : or @ that could confuse a browser into connecting to the
                // wrong host (or at least, to a host that is different than the one we thought)
                $url .= rawurlencode(urldecode($p['user']));

                if ($p['pass'] !== '')
                {
                    $url .= ':' . rawurlencode(urldecode($p['pass']));
                }

                $url .= '@';
            }

            $url .= $p['host'];

            // Append the port number (note that as per the regexp it can only contain digits)
            if ($p['port'] !== '')
            {
                $url .= ':' . $p['port'];
            }
        }

        // Build the path, including the query and fragment parts
        $path = $p['path'] . $p['query'] . $p['fragment'];

        /**
        * "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
        * for all percent- encodings."
        *
        * Only a percent sign immediately followed by two hex digits is a percent-encoding. Any
        * other text that follows a percent sign is literal and must keep its case, e.g. the
        * "of" in "50%off"
        *
        * @link http://tools.ietf.org/html/rfc3986#section-2.1
        */
        $path = preg_replace_callback(
            '/%[0-9A-Fa-f]{2}/',
            function ($m)
            {
                return strtoupper($m[0]);
            },
            $path
        );

        // Append the sanitized path to the URL
        $url .= self::sanitizeUrl($path);

        // Replace the first colon if there's no scheme and it could potentially be interpreted as
        // the scheme separator
        if (!$p['scheme'])
        {
            $url = preg_replace('#^([^/]*):#', '$1%3A', $url);
        }

        return $url;
    }

    /**
    * Sanitize a URL for safe use regardless of context
    *
    * This method URL-encodes some sensitive characters in case someone would want to use the URL in
    * some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
    * of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
    * followed by two hex digits.
    *
    * " and ' to prevent breaking out of quotes (JavaScript or otherwise)
    * ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
    * < and > to prevent breaking out of <script>
    * \r and \n because they're illegal in JavaScript
    * [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
    * Non-ASCII characters as per RFC 3986
    * Control codes and spaces, as per RFC 3986
    *
    * @link http://sla.ckers.org/forum/read.php?2,51478
    * @link http://timelessrepo.com/json-isnt-a-javascript-subset
    * @link http://www.ietf.org/rfc/rfc3986.txt
    * @link http://stackoverflow.com/a/1547922
    * @link http://tools.ietf.org/html/rfc3986#appendix-A
    *
    * @param  string $url Original URL
    * @return string      Sanitized URL
    */
    public static function sanitizeUrl($url)
    {
        return preg_replace_callback(
            '/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z]/S',
            function ($m)
            {
                return rawurlencode($m[0]);
            },
            $url
        );
    }

    /**
    * Validate a parsed URL
    *
    * @param  array       $urlConfig URL config
    * @param  array       $p         Parsed URL
    * @return string|null            Error message if invalid, or NULL
    */
    protected static function validateUrl(array $urlConfig, array $p)
    {
        // A schemeless URL is not tested against the list of allowed schemes
        if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
        {
            return 'URL scheme is not allowed';
        }

        if ($p['host'] === '')
        {
            // Reject malformed URLs such as http:///example.org but allow schemeless paths
            if ($p['scheme'] !== 'file' && $p['scheme'] !== '')
            {
                return 'Missing host';
            }
        }
        else
        {
            /**
            * Test whether the host is valid
            * @link http://tools.ietf.org/html/rfc1035#section-2.3.1
            * @link http://tools.ietf.org/html/rfc1123#section-2
            */
            $regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
            if (!preg_match($regexp, $p['host']))
            {
                // If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
                if (!NetworkFilter::filterIpv4($p['host'])
                 && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
                {
                    return 'URL host is invalid';
                }
            }

            // The host must not match disallowedHosts and, if restrictedHosts is set, must
            // match it
            if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
             || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
            {
                return 'URL host is not allowed';
            }
        }

        // No error
        return null;
    }
}