<?php

namespace Riimu\Kit\UrlParser;

/**
 * Provides a RFC 3986 compliant solution to URL parsing.
 *
 * UriParser provides a method for parsing URLs that accurately complies with
 * the RFC specification. Unlike the built-in function `parse_url()`, the parser
 * in this library is based on the ABNF definition of the generic URI syntax. In
 * other words, this library does not allow any kind of invalid URLs and parses
 * them exactly as defined in the specification.
 *
 * While the intention of this library is to provide an accurate implementation
 * for URL parsing, it is possible to use this library for parsing any kind of
 * valid URIs, since the parsing is simply based on the generic URI syntax.
 * Some of the features are simply more suited to dealing with URLs. The parser,
 * however, does not provide any additional validation based on the URI scheme.
 *
 * While the RFC specification does not allow UTF-8 characters in URIs, these
 * are still commonly used, especially in user input. To accommodate this fact,
 * the parser provides two additional compatibility modes that permit UTF-8 in
 * some of the URI components in addition to providing a simple support for
 * international domain names.
 *
 * @see https://tools.ietf.org/html/rfc3986
 * @author Riikka Kalliomäki <[email protected]>
 * @copyright Copyright (c) 2015-2017 Riikka Kalliomäki
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */
class UriParser
{
    /** Parsing mode that conforms strictly to the RFC 3986 specification */
    const MODE_RFC3986 = 1;

    /** Parsing mode that allows UTF-8 characters in some URI components */
    const MODE_UTF8 = 2;

    /**
     * Parsing mode that also converts international domain names to ascii.
     * @deprecated Use MODE_IDNA instead
     * @see UriParser::MODE_IDNA
     */
    const MODE_IDNA2003 = 4;

    /** Parsing mode that also converts international domain names to ascii */
    const MODE_IDNA = 4;

    /** @var array<string,string> List of methods used to assign the URI components */
    private static $setters = [
        'scheme'        => 'withScheme',
        'host'          => 'withHost',
        'port'          => 'withPort',
        'path_abempty'  => 'withPath',
        'path_absolute' => 'withPath',
        'path_noscheme' => 'withPath',
        'path_rootless' => 'withPath',
        'query'         => 'withQuery',
        'fragment'      => 'withFragment',
    ];

    /** @var int The current parsing mode */
    private $mode;

    /**
     * Creates a new instance of UriParser.
     *
     * The parser starts in the strict `MODE_RFC3986` mode.
     */
    public function __construct()
    {
        $this->mode = self::MODE_RFC3986;
    }

    /**
     * Sets the parsing mode.
     *
     * The parser supports three different parsing modes as indicated by the
     * available parsing mode constants. The modes are as follows:
     *
     * - `MODE_RFC3986` adheres strictly to the RFC specification and does not
     *   allow any non ascii characters in the URIs. This is the default mode.
     *
     * - `MODE_UTF8` allows UTF-8 characters in the user information, path,
     *   query and fragment components of the URI. These characters will be
     *   converted to appropriate percent encoded sequences.
     *
     * - `MODE_IDNA` also allows UTF-8 characters in the domain name and
     *   converts the international domain name to ascii according to the IDNA
     *   standard.
     *
     * @param int $mode One of the parsing mode constants
     */
    public function setMode($mode)
    {
        // The value is cast so that the strict comparisons against the mode
        // constants elsewhere in the class behave for numeric strings too.
        $this->mode = (int) $mode;
    }

    /**
     * Parses the URL using the generic URI syntax.
     *
     * This method returns the `Uri` instance constructed from the components
     * parsed from the URL. The URL is parsed using either the absolute URI
     * pattern or the relative URI pattern based on which one matches the
     * provided string. If the URL cannot be parsed as a valid URI, null is
     * returned instead.
     *
     * @param string $uri The URL to parse
     * @return Uri|null The parsed URL or null if the URL is invalid
     */
    public function parse($uri)
    {
        if (!$this->isValidString($uri)) {
            return null;
        }

        $pattern = new UriPattern();
        $pattern->allowNonAscii($this->mode !== self::MODE_RFC3986);

        // $match is filled in by matchUri() with the parsed components
        if (!$pattern->matchUri($uri, $match)) {
            return null;
        }

        try {
            return $this->buildUri($match);
        } catch (\InvalidArgumentException $exception) {
            // A component was rejected while building the Uri (for example an
            // invalid hostname), so the URL as a whole is reported as invalid.
            return null;
        }
    }

    /**
     * Tells if the URI string is valid for the current parser mode.
     *
     * Pure ASCII strings are accepted in every mode. Strings with other bytes
     * are rejected in `MODE_RFC3986` and must be well formed UTF-8 otherwise.
     *
     * @param string $uri The URI to validate
     * @return bool True if the string is valid, false if not
     */
    private function isValidString($uri)
    {
        if (preg_match('/^[\\x00-\\x7F]*$/', $uri)) {
            return true;
        }

        if ($this->mode === self::MODE_RFC3986) {
            return false;
        }

        // Validate UTF-8 via regular expression to avoid mbstring dependency
        $pattern =
            '/^(?>
                [\x00-\x7F]+                       # ASCII
              | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
              | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding over longs
              | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
              | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
              | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
              | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
              | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
            )*$/x';

        return (bool) preg_match($pattern, $uri);
    }

    /**
     * Builds the Uri instance from the parsed components.
     *
     * @param array<string, string> $components Components parsed from the URI
     * @return Uri The constructed URI representation
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function buildUri(array $components)
    {
        $uri = new Uri();

        // Only registered names are passed through decodeHost()
        if (isset($components['reg_name'])) {
            $components['host'] = $this->decodeHost($components['host']);
        }

        // Apply the setters in the order they are listed in self::$setters,
        // limited to the components that are present in the match.
        foreach (array_intersect_key(self::$setters, $components) as $key => $method) {
            $uri = $uri->{$method}($components[$key]);
        }

        if (isset($components['userinfo'])) {
            // The pattern matches either the first colon or the end of the
            // string, so a value without a colon still yields a second
            // (empty) element for the password.
            list($username, $password) = preg_split('/:|$/', $components['userinfo'], 2);
            $uri = $uri->withUserInfo(rawurldecode($username), rawurldecode($password));
        }

        return $uri;
    }

    /**
     * Decodes the hostname component according to parser mode.
     *
     * ASCII hostnames are returned unchanged. Other hostnames are accepted
     * only in `MODE_IDNA`, where they are converted with `idn_to_ascii()`.
     *
     * @param string $hostname The parsed hostname
     * @return string The decoded hostname
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function decodeHost($hostname)
    {
        if (preg_match('/^[\\x00-\\x7F]*$/', $hostname)) {
            return $hostname;
        }

        if ($this->mode !== self::MODE_IDNA) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        // Prefer the UTS #46 variant when the intl extension provides it
        $variant = defined('INTL_IDNA_VARIANT_UTS46') ? INTL_IDNA_VARIANT_UTS46 : INTL_IDNA_VARIANT_2003;

        // Keep the result separate from the input so the original hostname is
        // still available for the error message when the conversion fails.
        $ascii = idn_to_ascii($hostname, IDNA_DEFAULT, $variant);

        if ($ascii === false) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        return $ascii;
    }
}