1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser\Parser; |
3
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\ClientException; |
5
|
|
|
|
6
|
|
|
/**
 * Trait UrlParser
 *
 * URL helpers: percent-encoding, validation (scheme + host name), and
 * resolution of root-relative paths against a base URL.
 *
 * @package vipnytt\RobotsTxtParser\Parser
 */
trait UrlParser
{
    /**
     * Convert relative to full URL
     *
     * The URL is percent-encoded first. If the encoded URL is already a valid
     * absolute URL it is returned as is; if it starts with a slash it is
     * appended to the base (scheme://host:port) derived from $base.
     *
     * @param string $url  Absolute URL or root-relative path
     * @param string $base Absolute URL used to resolve a root-relative $url
     * @return string
     * @throws ClientException If $url is neither absolute nor root-relative, or $base is invalid
     */
    protected function urlConvertToFull($url, $base)
    {
        $url = $this->urlEncode($url);
        if ($this->urlValidate($url)) {
            return $url;
        }
        // The encoded URL is plain ASCII, so a byte comparison is sufficient here.
        if (strncmp($url, '/', 1) === 0) {
            return $this->urlBase($base) . $url;
        }
        throw new ClientException('Invalid URL');
    }

    /**
     * URL encoder according to RFC 3986
     * Returns a string containing the encoded URL with disallowed characters converted to their percentage encodings.
     * Reserved characters and already percent-encoded sequences are left untouched.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @param string $url
     * @return string
     */
    protected function urlEncode($url)
    {
        // Escape sequences produced by rawurlencode() that must be turned back
        // into their literal (reserved) character.
        static $reserved = [
            '%3A' => ':',
            '%2F' => '/',
            '%3F' => '?',
            '%23' => '#',
            '%5B' => '[',
            '%5D' => ']',
            '%40' => '@',
            '%21' => '!',
            '%24' => '$',
            '%26' => '&',
            '%27' => "'",
            '%28' => '(',
            '%29' => ')',
            '%2A' => '*',
            '%2B' => '+',
            '%2C' => ',',
            '%3B' => ';',
            '%3D' => '=',
            '%25' => '%',
        ];
        // rawurlencode() only emits upper-case escapes, and strtr() never
        // re-scans replaced text, so an input "%2F" (encoded as "%252F")
        // comes back as "%2F" and is not decoded any further.
        return strtr(rawurlencode($url), $reserved);
    }

    /**
     * Validate URL
     *
     * A URL is valid when it passes FILTER_VALIDATE_URL, has both a scheme and
     * a host, and both of those pass their dedicated validators.
     *
     * @param string $url
     * @return bool
     */
    protected function urlValidate($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        $parsed = parse_url($url);
        // FILTER_VALIDATE_URL accepts host-less URLs (e.g. "mailto:" or "file:"),
        // so the components must be checked before they are read.
        if ($parsed === false || !isset($parsed['scheme'], $parsed['host'])) {
            return false;
        }
        return static::urlValidateHost($parsed['host'])
            && static::urlValidateScheme($parsed['scheme']);
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param string $host
     * @return bool True for a syntactically valid host name that is not an IP address
     */
    protected static function urlValidateHost($host)
    {
        if (!is_string($host)) {
            return false;
        }
        // Overall length check
        $length = strlen($host);
        if ($length < 1 || $length > 253) {
            return false;
        }
        // Valid chars check. The D modifier stops "$" from matching in front of
        // a trailing newline.
        if (preg_match('/^[a-z\d](-*[a-z\d])*(\.[a-z\d](-*[a-z\d])*)*$/iD', $host) !== 1) {
            return false;
        }
        // Length of each label
        foreach (explode('.', $host) as $label) {
            if (strlen($label) > 63) {
                return false;
            }
        }
        // Is not an IP address
        return filter_var($host, FILTER_VALIDATE_IP) === false;
    }

    /**
     * Validate URL scheme
     *
     * @param string $scheme
     * @return bool True if the scheme is one of http, https, ftp or sftp
     */
    protected static function urlValidateScheme($scheme)
    {
        // Strict comparison, so that non-string values never match.
        return in_array($scheme, ['http', 'https', 'ftp', 'sftp'], true);
    }

    /**
     * Base URL
     *
     * Builds "scheme://host:port" from the given URL. When the URL carries no
     * explicit port, the default port of the scheme is used.
     *
     * @param string $url
     * @return string
     * @throws ClientException If the URL is invalid
     */
    protected function urlBase($url)
    {
        if (!$this->urlValidate($url)) {
            throw new ClientException('Invalid URL');
        }
        // Default ports of the supported schemes. The system services database
        // is not reliable for these: it may be missing altogether, and it
        // commonly maps "sftp" to 115/tcp, which is not the SSH port.
        static $defaultPorts = [
            'http' => 80,
            'https' => 443,
            'ftp' => 21,
            'sftp' => 22,
        ];
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);
        $port = parse_url($url, PHP_URL_PORT);
        if (!is_int($port)) {
            $port = isset($defaultPorts[$scheme])
                ? $defaultPorts[$scheme]
                : getservbyname($scheme, 'tcp');
        }
        $base = $scheme . '://' . $host;
        // getservbyname() returns false for unknown services; leave the port
        // out rather than emit a dangling colon.
        return $port === false ? $base : $base . ':' . $port;
    }
}
137
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.