1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Validate; |
4
|
|
|
|
5
|
|
|
class Url implements \Validate\Contracts\Validate |
6
|
|
|
{ |
7
|
|
|
/**
 * Normalises a URL for storage by removing its leading scheme.
 *
 * Only an "http://" or "https://" prefix at the very start of the string is
 * removed (case-insensitively). Occurrences further inside the URL, such as
 * a URL embedded in a query string, are left untouched.
 *
 * @param string $url URL to normalise
 * @return string the URL without its leading http(s) scheme
 */
public static function toDatabase(string $url)
{
    // Anchored at the start so that "foo.com/?next=http://bar.com" survives intact.
    $stripped = preg_replace('#^https?://#i', '', $url);

    // preg_replace() yields null only on a PCRE failure; fall back to the input.
    return $stripped ?? $url;
}
13
|
|
|
|
14
|
|
|
/**
 * Builds the user-facing form of a URL by prefixing it with "https://".
 *
 * @param mixed $url value appended after the scheme (presumably a scheme-less
 *                   URL such as toDatabase() produces -- confirm with callers)
 * @return string "https://" followed by $url
 */
public static function toUser($url)
{
    $scheme = 'https://';

    return $scheme . $url;
}
18
|
|
|
|
19
|
|
|
/**
 * Validates a URL: it is accepted as long as it contains no space character.
 *
 * @param string $url URL to check
 * @return bool false when $url contains a space, true otherwise
 */
public static function validate($url)
{
    $hasNoSpace = strpos($url, ' ') === false;

    return $hasNoSpace;
}
26
|
|
|
|
27
|
|
|
/**
 * Breaks a URL into its components by delegating to splitUrl().
 *
 * @param string $url URL to split
 * @return array|false whatever splitUrl() returns for $url
 */
public static function break(string $url)
{
    $components = self::splitUrl($url);

    return $components;
}
31
|
|
|
|
32
|
|
|
/**
 * Tells whether two URLs are identical once both are normalised with toDatabase().
 *
 * @param string $to   first URL
 * @param string $from second URL
 * @return bool true when both normalised forms are strictly equal
 */
public static function isSame(string $to, string $from)
{
    $normalisedTo = self::toDatabase($to);
    $normalisedFrom = self::toDatabase($from);

    return $normalisedTo === $normalisedFrom;
}
36
|
|
|
|
37
|
|
|
|
38
|
|
|
/**
 * Given a URL, calculates the page's directory.
 *
 * The directory is everything up to and including the last "/" in $url.
 *
 * @param string $url target URL
 * @return string Directory including its trailing slash, or '' when $url
 *                contains no "/" at all
 */
public function parseDir($url)
{
    $slash = strrpos($url, '/');

    // strrpos() returns false when there is no slash; "false + 1" would
    // evaluate to 1 and wrongly return the first character of $url.
    if ($slash === false) {
        return '';
    }

    return substr($url, 0, $slash + 1);
}
49
|
|
|
|
50
|
|
|
/** |
51
|
|
|
* Link Checking Functions |
52
|
|
|
*/ |
53
|
|
|
|
54
|
|
|
/**
 * Uniformly cleans a link to avoid duplicates.
 *
 * 1. Changes relative links to absolute (/bar to http://www.foo.com/bar)
 * 2. Removes anchor tags (foo.html#bar to foo.html)
 * 3. Adds trailing slash if directory (foo.com/bar to foo.com/bar/)
 * 4. Adds www if there is not a subdomain (foo.com to www.foo.com but not bar.foo.com)
 *
 * Steps 3 and 4 only recognise "http://" URLs.
 *
 * NOTE(review): urlToAbsolute() returns false when it cannot resolve the
 * link; the code below then produces "/". Confirm whether callers rely on that.
 *
 * @param string $relativeUrl link to clean
 * @param string $baseUrl     directory of parent (linking) page
 * @return string cleaned link
 */
public function cleanLink($relativeUrl, $baseUrl)
{
    // Make the link absolute, not relative.
    $relativeUrl = self::urlToAbsolute($baseUrl, $relativeUrl);

    // Remove anchors: drop everything from the first "#" onwards.
    $hashPos = strpos($relativeUrl, '#');
    if ($hashPos !== false) {
        $relativeUrl = substr($relativeUrl, 0, $hashPos);
    }

    // Append a trailing slash unless the URL already ends in one, names a
    // file with an extension, or carries a query string / fragment.
    if (!preg_match('#(^http://(.*)/$)|http://(.*)/(.*)\.([A-Za-z0-9]+)|http://(.*)/([^\?\#]*)(\?|\#)([^/]*)#i', $relativeUrl)) {
        $relativeUrl .= '/';
    }

    // Add "www." when the host is a bare "name.tld" with a three-letter TLD.
    // The dot is escaped and the name may not contain "/" or ".", so hosts
    // such as "localhost" or dotted path segments are no longer rewritten.
    // The pattern is anchored so only the URL's own host is considered.
    $relativeUrl = preg_replace('#^http://([^./]+)\.([a-zA-Z]{3})/#i', 'http://www.$1.$2/', $relativeUrl);

    return $relativeUrl;
}
81
|
|
|
|
82
|
|
|
|
83
|
|
|
/**
 * Tests a link against a regular expression to see whether it names an image.
 *
 * A link counts as an image when it ends in .gif, .jpg, .jpeg, .png or .bmp
 * (case-insensitive).
 *
 * @param string $link target link
 * @return bool true on image, false on anything else
 */
public static function isImage($link)
{
    $imageExtension = '%\.(gif|jpe?g|png|bmp)$%i';

    return (bool) preg_match($imageExtension, $link);
}
97
|
|
|
|
98
|
|
|
/**
 * Checks to see that a given link is within the domain/host whitelist.
 *
 * The host is taken from the start of $link (an optional http:// or
 * https:// scheme followed by everything up to the next "/") and compared
 * against each whitelist entry.
 *
 * @param string          $link        target link
 * @param string|string[] $domainArray whitelisted host, or list of hosts
 * @return bool true if out of domain, false if on domain whitelist
 */
public static function outOfDomain($link, $domainArray)
{
    // Accept a single host as well as a list. The former
    // "$domainArray[] = $domainArray" is a fatal error on a string.
    if (!is_array($domainArray)) {
        $domainArray = [$domainArray];
    }

    // Get host name from URL. "https?" is required: with only "http://"
    // an https link yielded the bogus host "https:".
    if (!preg_match('/^(https?:\/\/)?([^\/]+)/i', $link, $matches)) {
        // No host could be extracted, so the link cannot be whitelisted.
        return true;
    }
    $host = $matches[2];

    return !in_array($host, $domainArray, true);
}
125
|
|
|
|
126
|
|
|
/**
 * Checks to see that a given link matches a pattern in the exclude list.
 *
 * Each pattern is a complete PCRE expression (with delimiters) matched
 * against the URL-decoded link. On the first match a short HTML notice is
 * echoed and true is returned.
 *
 * @param string          $link          target link
 * @param string|string[] $excludedArray exclude pattern, or list of patterns
 * @return bool true if matches exclude, false if no match
 */
public function excludeByPattern($link, $excludedArray = [])
{
    // Accept a single pattern as well as a list. The former
    // "$excludedArray[] = $excludedArray" is a fatal error on a string.
    if (!is_array($excludedArray)) {
        $excludedArray = [$excludedArray];
    }

    // Decode once; the same value is used for matching and reporting.
    $decodedLink = urldecode($link);

    foreach ($excludedArray as $pattern) {
        if (preg_match($pattern, $decodedLink)) {
            // The link originates from crawled pages, so it is escaped
            // before being written into the HTML notice.
            $flags = ENT_QUOTES | ENT_SUBSTITUTE;
            echo '<p>matched exclude pattern <b>' . htmlspecialchars($pattern, $flags, 'UTF-8')
                . '</b> in ' . htmlspecialchars($decodedLink, $flags, 'UTF-8') . '</p>';

            return true;
        }
    }

    return false;
}
146
|
|
|
|
147
|
|
|
/**
 * Checks to see if a given link is in fact a mailto: link.
 *
 * The link qualifies only when "mailto:" is its scheme, i.e. appears at the
 * start (case-insensitive, leading whitespace ignored). A "mailto:" further
 * inside the link, such as in a query string, no longer counts.
 *
 * @param string $link Link to check
 * @return bool true on mailto:, false on everything else
 */
public static function isMailto($link)
{
    return stripos(ltrim((string) $link), 'mailto:') === 0;
}
161
|
|
|
|
162
|
|
|
/* Deprecated (I think)
163
|
|
|
|
164
|
|
|
public function count_slashes($url) { |
165
|
|
|
if (strlen($url)<7) return 0; |
166
|
|
|
return substr_count($url,'/',7); |
167
|
|
|
} |
168
|
|
|
|
169
|
|
|
public function get_slashes($url) { |
170
|
|
|
if (preg_match_all('#/#',$url,$matches,PREG_OFFSET_CAPTURE,7)) return $matches[0]; |
171
|
|
|
else return array(); |
172
|
|
|
} |
173
|
|
|
*/ |
174
|
|
|
|
175
|
|
|
/**
 * Converts a relative URL (/bar) to an absolute URL (http://www.foo.com/bar).
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * Resolution order:
 *  - a relative URL that already has a scheme is returned as-is (with its
 *    dot segments removed when the path is absolute);
 *  - otherwise the base scheme is used, and a relative URL that has its own
 *    authority keeps it;
 *  - otherwise the base authority is copied, and the path is either taken
 *    from the base (empty relative path) or merged with the base directory.
 *
 * @param string $baseUrl     Directory of linking page
 * @param string $relativeUrl URL to convert to absolute
 * @return string|false Absolute URL, or false when the relative URL cannot
 *                      be split (or splits into no components) or the base
 *                      URL lacks a scheme or host
 */
public static function urlToAbsolute($baseUrl, $relativeUrl)
{
    // An unparsable relative URL (false) and one with no components at
    // all (empty array) are both rejected.
    $r = self::splitUrl($relativeUrl);
    if (!$r) {
        return false;
    }

    // If relative URL has a scheme, clean path and return.
    if (!empty($r['scheme'])) {
        if (!empty($r['path']) && $r['path'][0] == '/') {
            $r['path'] = self::urlRemoveDotSegments($r['path']);
        }

        return self::joinUrl($r);
    }

    // Make sure the base URL is absolute.
    $b = self::splitUrl($baseUrl);
    if ($b === false || empty($b['scheme']) || empty($b['host'])) {
        return false;
    }

    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if (isset($r['host'])) {
        if (!empty($r['path'])) {
            $r['path'] = self::urlRemoveDotSegments($r['path']);
        }

        return self::joinUrl($r);
    }
    unset($r['port'], $r['user'], $r['pass']);

    // Copy base authority.
    $r['host'] = $b['host'];
    foreach (['port', 'user', 'pass'] as $key) {
        if (isset($b[$key])) {
            $r[$key] = $b[$key];
        }
    }

    // If relative URL has no path, use base path (and base query, unless
    // the relative URL brings its own).
    if (empty($r['path'])) {
        if (!empty($b['path'])) {
            $r['path'] = $b['path'];
        }

        if (!isset($r['query']) && isset($b['query'])) {
            $r['query'] = $b['query'];
        }

        return self::joinUrl($r);
    }

    // If relative URL path doesn't start with /, merge with base path.
    if ($r['path'][0] != '/') {
        // The base URL may have no path at all (e.g. "http://foo.com");
        // default to '' rather than reading an undefined index.
        $base = mb_strrchr($b['path'] ?? '', '/', true, 'UTF-8');
        if ($base === false) {
            $base = '';
        }
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = self::urlRemoveDotSegments($r['path']);

    return self::joinUrl($r);
}
257
|
|
|
|
258
|
|
|
/**
 * Removes "." and ".." segments (and empty segments) from a URL path.
 *
 * Helper required by urlToAbsolute().
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * A leading "/" is preserved. A trailing "/" is preserved unless the result
 * is just "/". A ".." with nothing left to pop is ignored.
 *
 * @param string $path path to clean
 * @return string cleaned path ('' for an empty input)
 */
public static function urlRemoveDotSegments($path)
{
    $path = (string) $path;
    if ($path === '') {
        // Nothing to clean; also avoids reading $path[0] on an empty string.
        return '';
    }

    // "/" is a single ASCII byte that never occurs inside a multi-byte
    // UTF-8 sequence, so a plain explode() is safe and, unlike
    // preg_split() with /u, cannot fail on malformed input.
    $outSegs = [];
    foreach (explode('/', $path) as $seg) {
        if ($seg === '' || $seg === '.') {
            continue;
        }
        if ($seg === '..') {
            array_pop($outSegs);
        } else {
            $outSegs[] = $seg;
        }
    }

    $outPath = implode('/', $outSegs);
    if ($path[0] === '/') {
        $outPath = '/' . $outPath;
    }

    // Keep a trailing slash. The previous loose comparison of
    // mb_strlen()-1 against mb_strrpos() treated "0 == false" as a match
    // and appended "/" to one-character paths that contain no slash.
    if ($outPath !== '/' && substr($path, -1) === '/') {
        $outPath .= '/';
    }

    return $outPath;
}
293
|
|
|
|
294
|
|
|
/**
 * Splits a URL into its components using a regular expression.
 *
 * Helper required by urlToAbsolute().
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * The result may contain the keys 'scheme', 'user', 'pass', 'host', 'port',
 * 'path', 'query' and 'fragment'; a key is present only when the matching
 * component was found. The scheme is lower-cased.
 *
 * @param string $url    URL to split
 * @param bool   $decode when true, percent-decode user, pass, path, query,
 *                       fragment and a named host
 * @return array|false component map, or false when $url does not match
 */
public static function splitUrl(string $url, $decode=true)
{
    $parts = [];
    $m = [];
    $xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar = $xunressub . ':@%';

    // Hyphen placed last so it is a literal; the former "+-." was a range
    // that also admitted ",".
    $xscheme = '([a-zA-Z][a-zA-Z\d+.-]*)';

    $xuserinfo = '(([' . $xunressub . '%]*)' .
                 '(:([' . $xunressub . ':%]*))?)';

    $xipv4 = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    $xipv6 = '(\[([a-fA-F\d.:]+)\])';

    // Host names consist of letters, digits, dots, hyphens and
    // percent-escapes. The class was previously reduced to "[a-zA-Z%]+"
    // because "[a-zA-Z\d-.%]+" raised a PCRE "invalid range" error; that
    // left no way to match "www.foo.com". Keeping the hyphen last avoids
    // the range error without losing digits and dots.
    $xhost_name = '([a-zA-Z\d.%-]+)';

    $xhost = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport = '(\d*)';
    $xauthority = '((' . $xuserinfo . '@)?' . $xhost .
                  '?(:' . $xport . ')?)';

    $xslash_seg = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs = '(/(' . $xpath_rel . ')?)';
    $xapath = '(' . $xpath_authabs . '|' . $xpath_abs .
              '|' . $xpath_rel . ')';

    $xqueryfrag = '([' . $xpchar . '/?' . ']*)';

    $xurl = '^(' . $xscheme . ':)?' . $xapath . '?' .
            '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if (!preg_match('!' . $xurl . '!', $url, $m)) {
        return false;
    }

    // A capture group counts as matched when it exists and is non-empty.
    // empty() is avoided because it would also discard a literal "0".
    $has = static function ($index) use ($m) {
        return isset($m[$index]) && $m[$index] !== '';
    };

    if ($has(2)) {
        $parts['scheme'] = strtolower($m[2]);
    }

    if ($has(7)) {
        $parts['user'] = isset($m[9]) ? $m[9] : '';
    }
    if ($has(10)) {
        $parts['pass'] = $m[11];
    }

    // Only a named host (group 13) is percent-decoded further down.
    $hostIsName = false;
    if ($has(13)) {
        $parts['host'] = $m[13];
        $hostIsName = true;
    } elseif ($has(14)) {
        $parts['host'] = $m[14];
    } elseif ($has(16)) {
        $parts['host'] = $m[16];
    } elseif ($has(5)) {
        // "//" was present but carried no host.
        $parts['host'] = '';
    }
    if ($has(17)) {
        $parts['port'] = $m[18];
    }

    // The path comes from whichever of the three path alternatives matched.
    if ($has(19)) {
        $parts['path'] = $m[19];
    } elseif ($has(21)) {
        $parts['path'] = $m[21];
    } elseif ($has(25)) {
        $parts['path'] = $m[25];
    }

    if ($has(27)) {
        $parts['query'] = $m[28];
    }
    if ($has(29)) {
        $parts['fragment'] = $m[30];
    }

    if (!$decode) {
        return $parts;
    }

    foreach (['user', 'pass', 'path', 'query', 'fragment'] as $key) {
        if (!empty($parts[$key])) {
            $parts[$key] = rawurldecode($parts[$key]);
        }
    }
    if ($hostIsName) {
        $parts['host'] = rawurldecode($parts['host']);
    }

    return $parts;
}
408
|
|
|
|
409
|
|
|
/**
 * Joins URL components (as produced by splitUrl()) back into a URL string.
 *
 * Helper required by urlToAbsolute().
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * @param array $parts  component map; recognised keys are 'scheme', 'user',
 *                      'pass', 'host', 'port', 'path', 'query', 'fragment'
 * @param bool  $encode when true, percent-encode the components first
 * @return string assembled URL
 */
public static function joinUrl($parts, $encode=true)
{
    if ($encode) {
        if (isset($parts['user'])) {
            $parts['user'] = rawurlencode($parts['user']);
        }
        if (isset($parts['pass'])) {
            $parts['pass'] = rawurlencode($parts['pass']);
        }

        // IP literals (IPv4, bare or bracketed IPv6) must not be encoded.
        // The whole host has to match: the previous pattern had a stray
        // "]" and mis-scoped alternation, so any host merely ending in
        // hex digits or dots skipped encoding.
        if (isset($parts['host'])
            && !preg_match('!^(\[[\da-f.:]+\]|[\da-f.:]+)$!i', $parts['host'])
        ) {
            $parts['host'] = rawurlencode($parts['host']);
        }

        // Encode the path but keep "/" as the segment separator.
        if (!empty($parts['path'])) {
            $parts['path'] = str_replace('%2F', '/', rawurlencode($parts['path']));
        }

        // Encode the query but keep "=" and "&", which delimit its
        // key/value pairs; encoding them collapsed the query into one key.
        if (isset($parts['query'])) {
            $parts['query'] = str_replace(
                ['%3D', '%26'],
                ['=', '&'],
                rawurlencode($parts['query'])
            );
        }

        if (isset($parts['fragment'])) {
            $parts['fragment'] = rawurlencode($parts['fragment']);
        }
    }

    $url = '';
    if (!empty($parts['scheme'])) {
        $url .= $parts['scheme'] . ':';
    }

    if (isset($parts['host'])) {
        $url .= '//';

        if (isset($parts['user'])) {
            $url .= $parts['user'];
            if (isset($parts['pass'])) {
                $url .= ':' . $parts['pass'];
            }
            $url .= '@';
        }

        if (preg_match('!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'])) {
            // IPv6: re-add the brackets.
            $url .= '[' . $parts['host'] . ']';
        } else {
            // IPv4 or name
            $url .= $parts['host'];
        }

        if (isset($parts['port'])) {
            $url .= ':' . $parts['port'];
        }

        // With an authority present, a path must start with "/".
        if (!empty($parts['path']) && $parts['path'][0] != '/') {
            $url .= '/';
        }
    }

    if (!empty($parts['path'])) {
        $url .= $parts['path'];
    }
    if (isset($parts['query'])) {
        $url .= '?' . $parts['query'];
    }
    if (isset($parts['fragment'])) {
        $url .= '#' . $parts['fragment'];
    }

    return $url;
}
483
|
|
|
} |
484
|
|
|
|