1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Riimu\Kit\UrlParser; |
4
|
|
|
|
5
|
|
|
/**
 * Provides an RFC 3986 compliant solution to URL parsing.
 *
 * UriParser provides a method for parsing URLs that accurately complies with
 * the RFC specification. Unlike the built-in function `parse_url()`, the parser
 * in this library is based on the ABNF definition of the generic URI syntax. In
 * other words, this library does not allow any kind of invalid URLs and parses
 * them exactly as defined in the specification.
 *
 * While the intention of this library is to provide an accurate implementation
 * for URL parsing, it is possible to use this library for parsing any kind of
 * valid URIs, since the parsing is simply based on the generic URI syntax.
 * Some of the features are simply more suited to dealing with URLs. The parser,
 * however, does not provide any additional validation based on the URI scheme.
 *
 * While the RFC specification does not allow UTF-8 characters in URIs, these
 * are still commonly used, especially in user input. To accommodate this fact,
 * the parser provides two additional compatibility modes that permit UTF-8 in
 * some of the URI components in addition to providing a simple support for
 * international domain names.
 *
 * @see https://tools.ietf.org/html/rfc3986
 * @author Riikka Kalliomäki <[email protected]>
 * @copyright Copyright (c) 2015-2017 Riikka Kalliomäki
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */
31
|
|
|
class UriParser
{
    /** Parsing mode that conforms strictly to the RFC 3986 specification */
    const MODE_RFC3986 = 1;

    /** Parsing mode that allows UTF-8 characters in some URI components */
    const MODE_UTF8 = 2;

    /**
     * Parsing mode that also converts international domain names to ascii.
     * @deprecated Use MODE_IDNA instead
     * @see UriParser::MODE_IDNA
     */
    const MODE_IDNA2003 = 4;

    /** Parsing mode that also converts international domain names to ascii */
    const MODE_IDNA = 4;

    /**
     * Maps the names of parsed components to the `Uri` method that assigns them.
     *
     * All four path variants are routed to the same `withPath` method.
     *
     * @var array<string,string> List of methods used to assign the URI components
     */
    private static $setters = [
        'scheme'        => 'withScheme',
        'host'          => 'withHost',
        'port'          => 'withPort',
        'path_abempty'  => 'withPath',
        'path_absolute' => 'withPath',
        'path_noscheme' => 'withPath',
        'path_rootless' => 'withPath',
        'query'         => 'withQuery',
        'fragment'      => 'withFragment',
    ];

    /** @var int The current parsing mode */
    private $mode;

    /**
     * Creates a new instance of UriParser.
     *
     * The parser starts in the strict `MODE_RFC3986` mode.
     */
    public function __construct()
    {
        $this->mode = self::MODE_RFC3986;
    }

    /**
     * Sets the parsing mode.
     *
     * The parser supports three different parsing modes as indicated by the
     * available parsing mode constants. The modes are as follows:
     *
     * - `MODE_RFC3986` adheres strictly to the RFC specification and does not
     *   allow any non ascii characters in the URIs. This is the default mode.
     *
     * - `MODE_UTF8` allows UTF-8 characters in the user information, path,
     *   query and fragment components of the URI. These characters will be
     *   converted to appropriate percent encoded sequences.
     *
     * - `MODE_IDNA` also allows UTF-8 characters in the domain name and
     *   converts the international domain name to ascii according to the IDNA
     *   standard.
     *
     * @param int $mode One of the parsing mode constants
     */
    public function setMode($mode)
    {
        // Cast so that the strict (===) mode comparisons elsewhere in the
        // class also work when the mode is provided as a numeric string.
        $this->mode = (int) $mode;
    }

    /**
     * Parses the URL using the generic URI syntax.
     *
     * This method returns the `Uri` instance constructed from the components
     * parsed from the URL. The URL is parsed using either the absolute URI
     * pattern or the relative URI pattern based on which one matches the
     * provided string. If the URL cannot be parsed as a valid URI, null is
     * returned instead.
     *
     * @param string $uri The URL to parse
     * @return Uri|null The parsed URL or null if the URL is invalid
     */
    public function parse($uri)
    {
        if (!$this->isValidString($uri)) {
            return null;
        }

        $pattern = new UriPattern();
        $pattern->allowNonAscii($this->mode !== self::MODE_RFC3986);

        if (!$pattern->matchUri($uri, $match)) {
            return null;
        }

        try {
            return $this->buildUri($match);
        } catch (\InvalidArgumentException $exception) {
            // A component was rejected while building the Uri (for example a
            // host name that is not valid in the current mode), so the whole
            // URL is reported as invalid.
            return null;
        }
    }

    /**
     * Tells if the URI string is valid for the current parser mode.
     *
     * Plain ASCII strings are accepted in every mode. Any other string is
     * rejected in `MODE_RFC3986` and must be well formed UTF-8 in other modes.
     *
     * @param string $uri The URI to validate
     * @return bool True if the string is valid, false if not
     */
    private function isValidString($uri)
    {
        if ($this->isAscii($uri)) {
            return true;
        }

        if ($this->mode === self::MODE_RFC3986) {
            return false;
        }

        // Validate UTF-8 via regular expression to avoid mbstring dependency
        $pattern =
            '/^(?>
                [\x00-\x7F]+                       # ASCII
              | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
              |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding over longs
              | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
              |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
              |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
              | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
              |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
            )*$/x';

        return (bool) preg_match($pattern, $uri);
    }

    /**
     * Tells if the string consists solely of 7-bit ASCII bytes.
     *
     * @param string $string The string to test
     * @return bool True if every byte is in the range 0x00-0x7F, false if not
     */
    private function isAscii($string)
    {
        return (bool) preg_match('/^[\\x00-\\x7F]*$/', $string);
    }

    /**
     * Builds the Uri instance from the parsed components.
     *
     * @param array<string, string> $components Components parsed from the URI
     * @return Uri The constructed URI representation
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function buildUri(array $components)
    {
        $uri = new Uri();

        // Only registered names are passed through the host decoding; when
        // the 'reg_name' key is absent the host is assigned as matched.
        if (isset($components['reg_name'])) {
            $components['host'] = $this->decodeHost($components['host']);
        }

        // Each setter returns the Uri that is carried to the next step.
        foreach (array_intersect_key(self::$setters, $components) as $key => $method) {
            $uri = $uri->$method($components[$key]);
        }

        if (isset($components['userinfo'])) {
            // The "|$" alternative makes the split always produce two parts,
            // so the password is an empty string when there is no colon.
            list($username, $password) = preg_split('/:|$/', $components['userinfo'], 2);
            $uri = $uri->withUserInfo(rawurldecode($username), rawurldecode($password));
        }

        return $uri;
    }

    /**
     * Decodes the hostname component according to parser mode.
     *
     * ASCII hostnames are returned as they are. Other hostnames are accepted
     * only in `MODE_IDNA`, in which case they are converted to their ASCII
     * form using `idn_to_ascii()`.
     *
     * @param string $hostname The parsed hostname
     * @return string The decoded hostname
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function decodeHost($hostname)
    {
        if ($this->isAscii($hostname)) {
            return $hostname;
        }

        if ($this->mode !== self::MODE_IDNA) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        // Use the UTS #46 variant when the constant is defined and fall back
        // to the IDNA 2003 variant otherwise.
        $variant = defined('INTL_IDNA_VARIANT_UTS46') ? INTL_IDNA_VARIANT_UTS46 : INTL_IDNA_VARIANT_2003;

        // The result is stored in a separate variable so that the original
        // hostname is still available for the error message on failure.
        $ascii = idn_to_ascii($hostname, IDNA_DEFAULT, $variant);

        if ($ascii === false) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        return $ascii;
    }
}
207
|
|
|
|