| 1 | <?php |
||
| 2 | |||
| 3 | namespace Riimu\Kit\UrlParser; |
||
| 4 | |||
/**
 * Provides a RFC 3986 compliant solution to URL parsing.
 *
 * UriParser provides a method for parsing URLs that accurately complies with
 * the RFC specification. Unlike the built-in function `parse_url()`, the parser
 * in this library is based on the ABNF definition of the generic URI syntax. In
 * other words, this library does not allow any kind of invalid URLs and parses
 * them exactly as defined in the specification.
 *
 * While the intention of this library is to provide an accurate implementation
 * for URL parsing, it is possible to use this library for parsing any kind of
 * valid URIs, since the parsing is simply based on the generic URI syntax.
 * Some of the features are simply more suited to dealing with URLs. The parser,
 * however, does not provide any additional validation based on the URI scheme.
 *
 * While the RFC specification does not allow UTF-8 characters in URIs, these
 * are still commonly used, especially in user input. To accommodate this fact,
 * the parser provides two additional compatibility modes that permit UTF-8 in
 * some of the URI components in addition to providing a simple support for
 * international domain names.
 *
 * @see https://tools.ietf.org/html/rfc3986
 * @author Riikka Kalliomäki <[email protected]>
 * @copyright Copyright (c) 2015-2017 Riikka Kalliomäki
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */
class UriParser
{
    /** Parsing mode that conforms strictly to the RFC 3986 specification */
    const MODE_RFC3986 = 1;

    /** Parsing mode that allows UTF-8 characters in some URI components */
    const MODE_UTF8 = 2;

    /**
     * Parsing mode that also converts international domain names to ascii.
     * @deprecated Use MODE_IDNA instead
     * @see UriParser::MODE_IDNA
     */
    const MODE_IDNA2003 = 4;

    /** Parsing mode that also converts international domain names to ascii */
    const MODE_IDNA = 4;

    /** @var array<string,string> List of methods used to assign the URI components */
    private static $setters = [
        'scheme'        => 'withScheme',
        'host'          => 'withHost',
        'port'          => 'withPort',
        'path_abempty'  => 'withPath',
        'path_absolute' => 'withPath',
        'path_noscheme' => 'withPath',
        'path_rootless' => 'withPath',
        'query'         => 'withQuery',
        'fragment'      => 'withFragment',
    ];

    /** @var int The current parsing mode */
    private $mode;

    /**
     * Creates a new instance of UriParser.
     *
     * The parser starts in the strict `MODE_RFC3986` mode.
     */
    public function __construct()
    {
        $this->mode = self::MODE_RFC3986;
    }

    /**
     * Sets the parsing mode.
     *
     * The parser supports three different parsing modes as indicated by the
     * available parsing mode constants. The modes are as follows:
     *
     * - `MODE_RFC3986` adheres strictly to the RFC specification and does not
     *   allow any non ascii characters in the URIs. This is the default mode.
     *
     * - `MODE_UTF8` allows UTF-8 characters in the user information, path,
     *   query and fragment components of the URI. These characters will be
     *   converted to appropriate percent encoded sequences.
     *
     * - `MODE_IDNA` also allows UTF-8 characters in the domain name and
     *   converts the international domain name to ascii according to the IDNA
     *   standard.
     *
     * The value is cast to an integer and stored without further validation.
     *
     * @param int $mode One of the parsing mode constants
     */
    public function setMode($mode)
    {
        $this->mode = (int) $mode;
    }

    /**
     * Parses the URL using the generic URI syntax.
     *
     * This method returns the `Uri` instance constructed from the components
     * parsed from the URL. The URL is parsed using either the absolute URI
     * pattern or the relative URI pattern based on which one matches the
     * provided string. If the URL cannot be parsed as a valid URI, null is
     * returned instead.
     *
     * @param string $uri The URL to parse
     * @return Uri|null The parsed URL or null if the URL is invalid
     */
    public function parse($uri)
    {
        // Reject strings with bytes the current mode does not permit before
        // running the (more expensive) URI pattern against them.
        if (!$this->isValidString($uri)) {
            return null;
        }

        $pattern = new UriPattern();
        $pattern->allowNonAscii($this->mode !== self::MODE_RFC3986);

        if (!$pattern->matchUri($uri, $match)) {
            return null;
        }

        try {
            return $this->buildUri($match);
        } catch (\InvalidArgumentException $exception) {
            // A component was rejected while building the Uri, which makes
            // the whole input invalid from the caller's point of view.
            return null;
        }
    }

    /**
     * Tells if the URI string is valid for the current parser mode.
     *
     * Pure ASCII strings are always accepted. Any other string is rejected in
     * `MODE_RFC3986` and must be well formed UTF-8 in the other modes.
     *
     * @param string $uri The URI to validate
     * @return bool True if the string is valid, false if not
     */
    private function isValidString($uri)
    {
        if (preg_match('/^[\\x00-\\x7F]*$/', $uri)) {
            return true;
        }

        if ($this->mode === self::MODE_RFC3986) {
            return false;
        }

        // Validate UTF-8 via regular expression to avoid mbstring dependency
        $pattern =
            '/^(?>
                [\x00-\x7F]+                       # ASCII
              | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
              | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding over longs
              | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
              | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
              | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
              | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
              | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
            )*$/x';

        return (bool) preg_match($pattern, $uri);
    }

    /**
     * Builds the Uri instance from the parsed components.
     *
     * @param array<string, string> $components Components parsed from the URI
     * @return Uri The constructed URI representation
     * @throws \InvalidArgumentException If the hostname cannot be decoded in
     *   the current mode (presumably also if a Uri setter rejects a value;
     *   the setters are not visible here)
     */
    private function buildUri(array $components)
    {
        $uri = new Uri();

        // Only registered names go through hostname decoding; other host
        // forms are passed to the setter as they were matched.
        if (isset($components['reg_name'])) {
            $components['host'] = $this->decodeHost($components['host']);
        }

        // Iterate in the order of self::$setters, restricted to the
        // components that were actually present in the match.
        foreach (array_intersect_key(self::$setters, $components) as $key => $method) {
            $uri = $uri->{$method}($components[$key]);
        }

        if (isset($components['userinfo'])) {
            // Splitting on ":" or end of string always yields two parts, so
            // the password is an empty string when no ":" is present.
            list($username, $password) = preg_split('/:|$/', $components['userinfo'], 2);
            $uri = $uri->withUserInfo(rawurldecode($username), rawurldecode($password));
        }

        return $uri;
    }

    /**
     * Decodes the hostname component according to parser mode.
     *
     * ASCII hostnames are returned unchanged. Hostnames containing other
     * bytes are only accepted in `MODE_IDNA`, in which case they are converted
     * to their ASCII form using `idn_to_ascii()`.
     *
     * @param string $hostname The parsed hostname
     * @return string The decoded hostname
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function decodeHost($hostname)
    {
        if (preg_match('/^[\\x00-\\x7F]*$/', $hostname)) {
            return $hostname;
        }

        if ($this->mode !== self::MODE_IDNA) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        // Prefer the UTS #46 variant whenever the intl extension provides it.
        $variant = defined('INTL_IDNA_VARIANT_UTS46') ? INTL_IDNA_VARIANT_UTS46 : INTL_IDNA_VARIANT_2003;

        // Keep the converted value separate from the input so that the error
        // message below can still report the hostname that failed.
        $ascii = idn_to_ascii($hostname, IDNA_DEFAULT, $variant);

        if ($ascii === false) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        return $ascii;
    }
}
||
| 207 |