1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* vipnytt/RobotsTxtParser |
4
|
|
|
* |
5
|
|
|
* @link https://github.com/VIPnytt/RobotsTxtParser |
6
|
|
|
* @license https://github.com/VIPnytt/RobotsTxtParser/blob/master/LICENSE The MIT License (MIT) |
7
|
|
|
*/ |
8
|
|
|
|
9
|
|
|
namespace vipnytt\RobotsTxtParser\Parser; |
10
|
|
|
|
11
|
|
|
class UriParser
{
    /**
     * Scheme white-list
     *
     * Only URIs whose scheme is listed here pass validateScheme() / validate().
     *
     * @var string[]
     */
    protected $schemes = [
        'http',
        'https',
        'ftp',
        'ftps',
        'sftp',
    ];

    /**
     * URI currently held by the parser. It is rewritten in place by encode()
     * and convertToFull().
     *
     * @var string
     */
    private $uri;

    /**
     * UriParser constructor.
     *
     * @param string $uri Absolute URI, or a path starting with "/" that is to
     *                    be resolved later through convertToFull()
     */
    public function __construct($uri)
    {
        $this->uri = $uri;
    }

    /**
     * Convert relative to full
     *
     * Encodes the held URI. If the result is a valid absolute URI it is
     * returned as is. If it is a path starting with "/", it is appended to the
     * base (scheme://host:port) of $fallbackBase. In both cases the parser
     * holds the returned URI afterwards.
     *
     * @param string $fallbackBase Absolute URI supplying scheme, host and port for relative paths
     * @return string
     * @throws \InvalidArgumentException If the URI is neither valid nor a path starting with "/",
     *                                   or if $fallbackBase itself is not a valid URI
     */
    public function convertToFull($fallbackBase)
    {
        $this->encode();
        if ($this->validate()) {
            return $this->uri;
        }
        if (strpos($this->uri, '/') === 0) {
            $relative = $this->uri;
            // base() works on $this->uri, so temporarily hold the fallback base.
            $this->uri = $fallbackBase;
            // Store the converted URI so later calls on this object (validate(),
            // stripFragment(), ...) operate on the result, not on the fallback base.
            return $this->uri = $this->base() . $relative;
        }
        throw new \InvalidArgumentException("Invalid URI `$this->uri`");
    }

    /**
     * URI encoder according to RFC 3986
     * Returns a string containing the encoded URI with disallowed characters converted to their percentage encodings.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * The held URI is replaced by the encoded form, and everything up to and
     * including the host is lower-cased.
     *
     * @return string
     */
    public function encode()
    {
        // rawurlencode() encodes every reserved character; this map turns the
        // reserved ones back into their literal form, one pair after another.
        $reserved = [
            '%21' => '!',
            '%23' => '#',
            '%24' => '$',
            '%26' => '&',
            '%27' => "'",
            '%28' => '(',
            '%29' => ')',
            '%2A' => '*',
            '%2B' => '+',
            '%2C' => ',',
            '%2F' => '/',
            '%3A' => ':',
            '%3B' => ';',
            '%3D' => '=',
            '%3F' => '?',
            '%40' => '@',
            '%5B' => '[',
            '%5D' => ']',
            '%25' => '%',
        ];
        // The % character must be the last in the $reserved array.
        // This makes sure that the already encoded values are not lost or encoded again.
        $this->uri = str_ireplace(array_keys($reserved), array_values($reserved), rawurlencode($this->uri));
        return $this->baseToLowercase();
    }

    /**
     * Base uri to lowercase
     *
     * Lower-cases the held URI from its start up to the end of the first
     * occurrence of the host. The rest (path, query, fragment) is left untouched.
     *
     * @return string
     */
    private function baseToLowercase()
    {
        $host = parse_url($this->uri, PHP_URL_HOST);
        // parse_url() yields null when there is no host and false when the URI
        // is malformed; in both cases there is nothing to lower-case.
        if (!is_string($host) || $host === '') {
            return $this->uri;
        }
        $pos = strpos($this->uri, $host) + strlen($host);
        return $this->uri = substr_replace($this->uri, strtolower(substr($this->uri, 0, $pos)), 0, $pos);
    }

    /**
     * Validate
     *
     * A URI is valid when it is accepted by FILTER_VALIDATE_URL (or has an IP
     * address as host), has both a scheme and a host, the host is a valid host
     * name or IP address, and the scheme is white-listed.
     *
     * @return bool
     */
    public function validate()
    {
        $host = parse_url($this->uri, PHP_URL_HOST);
        if (
            !filter_var($this->uri, FILTER_VALIDATE_URL) &&
            // PHP 5.x bug fix: FILTER_VALIDATE_URL doesn't support IPv6 urls. IP check not needed in the future.
            !$this->validateIP(is_string($host) ? $host : '')
        ) {
            return false;
        }
        $parsed = parse_url($this->uri);
        // Without a scheme or a host the URI cannot be valid; checking here
        // also avoids reading undefined array keys below.
        if ($parsed === false || !isset($parsed['host'], $parsed['scheme'])) {
            return false;
        }
        return (
            (
                $this->validateHost($parsed['host']) ||
                $this->validateIP($parsed['host'])
            ) &&
            $this->validateScheme($parsed['scheme'])
        );
    }

    /**
     * Validate IPv4 or IPv6
     *
     * IPv6 addresses may be wrapped in square brackets, as they are in URIs.
     *
     * @param string|null $ipAddress Address to check; when null, the host of the held URI is used
     * @return bool
     */
    public function validateIP($ipAddress = null)
    {
        if ($ipAddress === null) {
            $ipAddress = parse_url($this->uri, PHP_URL_HOST);
        }
        // No host (null), malformed URI (false) or empty string: not an IP address.
        if (!is_string($ipAddress) || $ipAddress === '') {
            return false;
        }
        return (
            filter_var($ipAddress, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ||
            filter_var(trim($ipAddress, '[]'), FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) !== false
        );
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param string|null $host Host to check; when null, the host of the held URI is used,
     *                          falling back to its path (covers bare host names such as "example.com")
     * @return bool
     */
    public function validateHost($host = null)
    {
        if ($host === null) {
            $parsed = parse_url($this->uri);
            if ($parsed === false) {
                return false;
            }
            if (isset($parsed['host'])) {
                $host = $parsed['host'];
            } elseif (isset($parsed['path'])) {
                $host = $parsed['path'];
            } else {
                return false;
            }
        }
        if (!is_string($host)) {
            return false;
        }
        // The D modifier makes "$" match only at the very end, so a host with a
        // trailing newline is rejected.
        return (
            preg_match('/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/iD', $host) === 1 //valid chars check
            && preg_match('/^.{1,253}$/D', $host) === 1 //overall length check
            && preg_match('/^[^\.]{1,63}(\.[^\.]{1,63})*$/D', $host) === 1 //length of each label
            && !$this->validateIP($host)
        );
    }

    /**
     * Validate scheme
     *
     * @param string|null $scheme Scheme to check; when null, the scheme of the held URI is used
     * @return bool True when the scheme is in the white-list
     */
    public function validateScheme($scheme = null)
    {
        if ($scheme === null) {
            $scheme = parse_url($this->uri, PHP_URL_SCHEME);
        }
        return is_string($scheme) && in_array($scheme, $this->schemes, true);
    }

    /**
     * Base
     *
     * Builds the lower-cased "scheme://host:port" of the held URI. When the URI
     * carries no explicit port, the port is looked up with getservbyname(); if
     * that lookup fails, a built-in table of well-known ports is consulted. If
     * no port can be determined at all, the ":port" suffix is omitted.
     *
     * @return string
     * @throws \InvalidArgumentException If the held URI is not valid
     */
    public function base()
    {
        if (!$this->validate()) {
            throw new \InvalidArgumentException("Invalid URI: $this->uri");
        }
        $scheme = parse_url($this->uri, PHP_URL_SCHEME);
        $host = parse_url($this->uri, PHP_URL_HOST);
        $port = parse_url($this->uri, PHP_URL_PORT);
        if (!is_int($port)) {
            $port = getservbyname($scheme, 'tcp');
        }
        if (!is_int($port)) {
            // getservbyname() returns false when the service is unknown to the
            // system (e.g. no services database available).
            $wellKnown = [
                'http' => 80,
                'https' => 443,
                'ftp' => 21,
                'ftps' => 990,
                'sftp' => 22,
            ];
            $key = strtolower($scheme);
            $port = isset($wellKnown[$key]) ? $wellKnown[$key] : null;
        }
        $base = $scheme . '://' . $host;
        if ($port !== null) {
            $base .= ':' . $port;
        }
        return strtolower($base);
    }

    /**
     * Strip fragment
     *
     * Returns the held URI without the first "#" and everything after it.
     * The held URI itself is not modified.
     *
     * @return string
     */
    public function stripFragment()
    {
        return explode('#', $this->uri, 2)[0];
    }
}
215
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.