1 | <?php |
||
2 | |||
3 | namespace Riimu\Kit\UrlParser; |
||
4 | |||
5 | /** |
||
6 | * Provides a RFC 3986 compliant solution to URL parsing. |
||
7 | * |
||
8 | * UriParser provides a method for parsing URLs that accurately complies with |
||
9 | * the RFC specification. Unlike the built-in function `parse_url()`, the parser in |
||
10 | * this library is based on the ABNF definition of the generic URI syntax. In |
||
11 | * other words, this library does not allow any kind of invalid URLs and parses |
||
12 | * them exactly as defined in the specification. |
||
13 | * |
||
14 | * While the intention of this library is to provide an accurate implementation |
||
15 | * for URL parsing, it is possible to use this library for parsing any kind of |
||
16 | * valid URIs, since the parsing is simply based on the generic URI syntax. |
||
17 | * Some of the features are simply more suited to dealing with URLs. The parser, |
||
18 | * however, does not provide any additional validation based on the URI scheme. |
||
19 | * |
||
20 | * While the RFC specification does not allow UTF-8 characters in URIs, these |
||
21 | * are still commonly used, especially in user input. To accommodate this fact, |
||
22 | * the parser provides two additional compatibility modes that permit UTF-8 in |
||
23 | * some of the URI components in addition to providing a simple support for |
||
24 | * international domain names. |
||
25 | * |
||
26 | * @see https://tools.ietf.org/html/rfc3986 |
||
27 | * @author Riikka Kalliomäki <[email protected]> |
||
28 | * @copyright Copyright (c) 2015-2017 Riikka Kalliomäki |
||
29 | * @license http://opensource.org/licenses/mit-license.php MIT License |
||
30 | */ |
||
class UriParser
{
    /** Parsing mode that conforms strictly to the RFC 3986 specification */
    const MODE_RFC3986 = 1;

    /** Parsing mode that allows UTF-8 characters in some URI components */
    const MODE_UTF8 = 2;

    /**
     * Parsing mode that also converts international domain names to ascii.
     * @deprecated Use MODE_IDNA instead
     * @see UriParser::MODE_IDNA
     */
    const MODE_IDNA2003 = 4;

    /** Parsing mode that also converts international domain names to ascii */
    const MODE_IDNA = 4;

    /**
     * Maps the name of a matched URI component to the name of the `Uri`
     * method that is called with that component's value.
     *
     * The four path variants all map to the same method, since at most one
     * path form is assigned to the resulting URI.
     *
     * @var array<string,string>
     */
    private static $setters = [
        'scheme'        => 'withScheme',
        'host'          => 'withHost',
        'port'          => 'withPort',
        'path_abempty'  => 'withPath',
        'path_absolute' => 'withPath',
        'path_noscheme' => 'withPath',
        'path_rootless' => 'withPath',
        'query'         => 'withQuery',
        'fragment'      => 'withFragment',
    ];

    /** @var int The current parsing mode */
    private $mode;

    /**
     * Creates a new instance of UriParser.
     *
     * The parser starts in the strict `MODE_RFC3986` mode.
     */
    public function __construct()
    {
        $this->mode = self::MODE_RFC3986;
    }

    /**
     * Sets the parsing mode.
     *
     * The parser supports three different parsing modes as indicated by the
     * available parsing mode constants. The modes are as follows:
     *
     * - `MODE_RFC3986` adheres strictly to the RFC specification and does not
     *   allow any non ascii characters in the URIs. This is the default mode.
     *
     * - `MODE_UTF8` allows UTF-8 characters in the user information, path,
     *   query and fragment components of the URI. These characters will be
     *   converted to appropriate percent encoded sequences.
     *
     * - `MODE_IDNA` also allows UTF-8 characters in the domain name and
     *   converts the international domain name to ascii according to the IDNA
     *   standard.
     *
     * The value is cast to an integer and stored as is; it is not checked
     * against the list of known mode constants.
     *
     * @param int $mode One of the parsing mode constants
     */
    public function setMode($mode)
    {
        $this->mode = (int) $mode;
    }

    /**
     * Parses the URL using the generic URI syntax.
     *
     * The string is first checked against the character set permitted by the
     * current mode, then matched with `UriPattern` and finally turned into a
     * `Uri` instance. If any of these steps fails, null is returned instead.
     *
     * @param string $uri The URL to parse
     * @return Uri|null The parsed URL or null if the URL is invalid
     */
    public function parse($uri)
    {
        if (!$this->isValidString($uri)) {
            return null;
        }

        $pattern = new UriPattern();
        $pattern->allowNonAscii($this->mode !== self::MODE_RFC3986);

        if (!$pattern->matchUri($uri, $match)) {
            return null;
        }

        try {
            return $this->buildUri($match);
        } catch (\InvalidArgumentException $exception) {
            // A component was rejected while building the Uri (for example,
            // a hostname that cannot be decoded), so the URL is invalid.
            return null;
        }
    }

    /**
     * Tells if the URI string is valid for the current parser mode.
     *
     * Pure ASCII strings are accepted in every mode. Strings containing other
     * bytes are rejected in `MODE_RFC3986` and must be well-formed UTF-8 in
     * the remaining modes.
     *
     * @param string $uri The URI to validate
     * @return bool True if the string is valid, false if not
     */
    private function isValidString($uri)
    {
        if ($this->isAscii($uri)) {
            return true;
        }

        if ($this->mode === self::MODE_RFC3986) {
            return false;
        }

        // An empty pattern with the "u" modifier makes PCRE validate the
        // subject as UTF-8 (rejecting overlong forms, surrogates and code
        // points above U+10FFFF) without requiring mbstring and without the
        // per-character group iterations that could exhaust the PCRE limits
        // on long inputs.
        return preg_match('//u', $uri) === 1;
    }

    /**
     * Tells if the string consists solely of 7-bit ASCII bytes.
     * @param string $string The string to inspect
     * @return bool True if every byte is in the range 0x00-0x7F, false if not
     */
    private function isAscii($string)
    {
        return preg_match('/^[\\x00-\\x7F]*$/', $string) === 1;
    }

    /**
     * Builds the Uri instance from the parsed components.
     *
     * @param array<string, string> $components Components parsed from the URI
     * @return Uri The constructed URI representation
     * @throws \InvalidArgumentException If the hostname cannot be decoded
     */
    private function buildUri(array $components)
    {
        $uri = new Uri();

        // Only hosts matched as registered names go through host decoding.
        if (isset($components['reg_name'])) {
            $components['host'] = $this->decodeHost($components['host']);
        }

        // array_intersect_key() keeps the order of self::$setters, and the
        // value returned by each setter replaces the working instance.
        foreach (array_intersect_key(self::$setters, $components) as $key => $method) {
            $uri = $uri->$method($components[$key]);
        }

        if (isset($components['userinfo'])) {
            // Split on the first colon only; a missing password becomes ''.
            $parts = array_pad(explode(':', $components['userinfo'], 2), 2, '');
            $uri = $uri->withUserInfo(rawurldecode($parts[0]), rawurldecode($parts[1]));
        }

        return $uri;
    }

    /**
     * Decodes the hostname component according to parser mode.
     *
     * ASCII hostnames are returned unchanged. Other hostnames are accepted
     * only in `MODE_IDNA`, in which case they are converted with
     * `idn_to_ascii()` using the UTS #46 variant when that constant is
     * defined and the IDNA 2003 variant otherwise.
     *
     * @param string $hostname The parsed hostname
     * @return string The decoded hostname
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function decodeHost($hostname)
    {
        if ($this->isAscii($hostname)) {
            return $hostname;
        }

        if ($this->mode !== self::MODE_IDNA) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        $variant = defined('INTL_IDNA_VARIANT_UTS46') ? INTL_IDNA_VARIANT_UTS46 : INTL_IDNA_VARIANT_2003;

        // Keep the conversion result separate from the input so that the
        // error message below still shows the hostname that was rejected.
        $ascii = idn_to_ascii($hostname, IDNA_DEFAULT, $variant);

        if ($ascii === false) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        return $ascii;
    }
}
||
207 |