1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser\Parser; |
3
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\ClientException; |
5
|
|
|
|
6
|
|
|
class UriParser
{
    /**
     * Ports used by base() when the system services database has no entry for the scheme.
     * `sftp` is mapped to the SSH port, `ftps` to the implicit FTP-over-TLS port.
     *
     * @var int[]
     */
    private static $defaultPorts = [
        'http' => 80,
        'https' => 443,
        'ftp' => 21,
        'ftps' => 990,
        'sftp' => 22,
    ];

    /**
     * URI
     * @var string
     */
    private $uri;

    /**
     * UriParser constructor.
     *
     * @param string $uri Full URI, relative path, or a bare host/IP/scheme string
     */
    public function __construct($uri)
    {
        $this->uri = $uri;
    }

    /**
     * Convert relative to full
     *
     * Encodes the URI first. A valid full URI is returned as-is; a URI starting with `/`
     * is appended to the base (scheme://host:port) of $fallbackBase.
     *
     * Note: on the relative branch the instance's URI is replaced by $fallbackBase,
     * so later calls on this object operate on the fallback base.
     *
     * @param string $fallbackBase Full URI whose base is used for relative paths
     * @return string
     * @throws ClientException If the URI is neither valid nor root-relative, or the fallback base is invalid
     */
    public function convertToFull($fallbackBase)
    {
        $this->encode();
        if ($this->validate()) {
            return $this->uri;
        }
        if (strpos($this->uri, '/') === 0) {
            $relative = $this->uri;
            $this->uri = $fallbackBase;
            return $this->base() . $relative;
        }
        throw new ClientException("Invalid URI `$this->uri`");
    }

    /**
     * URI encoder according to RFC 3986
     * Returns a string containing the encoded URI with disallowed characters converted to their percentage encodings.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * The stored URI is updated with the encoded value, and its scheme/host part is lowercased.
     *
     * @return string
     */
    public function encode()
    {
        // rawurlencode() escapes everything; these patterns restore the reserved characters.
        // Order matters: `%25` is restored before the later entries are applied.
        $reserved = [
            '!%21!ui' => '!',
            '!%23!ui' => '#',
            '!%24!ui' => '$',
            '!%25!ui' => '%',
            '!%26!ui' => '&',
            '!%27!ui' => "'",
            '!%28!ui' => '(',
            '!%29!ui' => ')',
            '!%2A!ui' => '*',
            '!%2B!ui' => '+',
            '!%2C!ui' => ',',
            '!%2F!ui' => '/',
            '!%3A!ui' => ':',
            '!%3B!ui' => ';',
            '!%3D!ui' => '=',
            '!%3F!ui' => '?',
            '!%40!ui' => '@',
            '!%5B!ui' => '[',
            '!%5D!ui' => ']',
        ];
        $this->uri = preg_replace(array_keys($reserved), array_values($reserved), rawurlencode($this->uri));
        return $this->baseToLowercase();
    }

    /**
     * Base uri to lowercase
     *
     * Lowercases everything from the start of the URI up to the end of the host and
     * stores the result. URIs without a parsable host are returned unchanged.
     *
     * @return string
     */
    private function baseToLowercase()
    {
        $host = parse_url($this->uri, PHP_URL_HOST);
        // parse_url() yields null when there is no host and false on malformed input.
        if (!is_string($host) || $host === '') {
            return $this->uri;
        }
        // Look for the host after the `//` authority marker, so a host string that
        // also occurs inside the scheme (e.g. `HTTP://P/`) is not matched too early.
        $authority = strpos($this->uri, '//');
        $hostStart = strpos($this->uri, $host, $authority === false ? 0 : $authority + 2);
        if ($hostStart === false) {
            $hostStart = strpos($this->uri, $host);
        }
        if ($hostStart === false) {
            return $this->uri;
        }
        $end = $hostStart + strlen($host);
        return $this->uri = strtolower(substr($this->uri, 0, $end)) . substr($this->uri, $end);
    }

    /**
     * Validate
     *
     * A URI is valid when it has both a scheme and a host, the host is a valid
     * host name or IP address, and the scheme is one of the supported schemes.
     *
     * @return bool
     */
    public function validate()
    {
        $parsed = parse_url($this->uri);
        // Without an explicit scheme and host there is nothing to validate against.
        if (!is_array($parsed) || !isset($parsed['scheme'], $parsed['host'])) {
            return false;
        }
        $isIP = $this->validateIP($parsed['host']);
        return (
            // PHP 5.x bug fix: FILTER_VALIDATE_URL doesn't support IPv6 urls. IP check not needed in the future.
            (filter_var($this->uri, FILTER_VALIDATE_URL) !== false || $isIP) &&
            ($isIP || $this->validateHost($parsed['host'])) &&
            $this->validateScheme($parsed['scheme'])
        );
    }

    /**
     * Validate IPv4 or IPv6
     *
     * @param string|null $ipAddress Address to check; when null, the URI's host (or, failing that, its path) is used
     * @return bool
     */
    public function validateIP($ipAddress = null)
    {
        if ($ipAddress === null) {
            $ipAddress = $this->componentOrPath('host');
        }
        return (
            filter_var($ipAddress, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) ||
            // IPv6 hosts are wrapped in brackets inside URIs
            filter_var(trim($ipAddress, '[]'), FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)
        );
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param string|null $host Host to check; when null, the URI's host (or, failing that, its path) is used
     * @return bool False for IP addresses, even though they are usable as a host
     */
    public function validateHost($host = null)
    {
        if ($host === null) {
            $host = $this->componentOrPath('host');
        }
        return (
            preg_match("/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/i", $host) //valid chars check
            && preg_match("/^.{1,253}$/", $host) //overall length check
            && preg_match("/^[^\.]{1,63}(\.[^\.]{1,63})*$/", $host) //length of each label
            && !$this->validateIP($host)
        );
    }

    /**
     * Validate scheme
     *
     * @param string|null $scheme Scheme to check; when null, the URI's scheme (or, failing that, its path) is used
     * @return bool
     */
    public function validateScheme($scheme = null)
    {
        if ($scheme === null) {
            $scheme = $this->componentOrPath('scheme');
        }
        return in_array($scheme, [
            'http',
            'https',
            'ftp',
            'ftps',
            'sftp',
        ], true);
    }

    /**
     * Base
     *
     * Builds `scheme://host:port` in lowercase. The port is taken from the URI when
     * present, otherwise from the system services database, otherwise from the
     * built-in default for the scheme.
     *
     * @return string
     * @throws ClientException If the URI is invalid
     */
    public function base()
    {
        if (!$this->validate()) {
            throw new ClientException('Invalid URI');
        }
        $parsed = parse_url($this->uri);
        $scheme = $parsed['scheme'];
        if (isset($parsed['port'])) {
            $port = $parsed['port'];
        } else {
            $port = getservbyname($scheme, 'tcp');
            // getservbyname() returns false when the services database has no entry.
            if ($port === false && isset(self::$defaultPorts[$scheme])) {
                $port = self::$defaultPorts[$scheme];
            }
        }
        return strtolower($scheme . '://' . $parsed['host'] . ':' . $port);
    }

    /**
     * Read one component of the URI, falling back to the path so that instances
     * created from a bare string (e.g. just a host name) can still be checked.
     *
     * @param string $component Key as returned by parse_url(), e.g. `host` or `scheme`
     * @return string Empty string when neither the component nor a path is available
     */
    private function componentOrPath($component)
    {
        $parsed = parse_url($this->uri);
        if (!is_array($parsed)) {
            return '';
        }
        if (isset($parsed[$component])) {
            return $parsed[$component];
        }
        return isset($parsed['path']) ? $parsed['path'] : '';
    }
}
195
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.