UriParser   A
last analyzed

Complexity

Total Complexity 18

Size/Duplication

Total Lines 174
Duplicated Lines 0 %

Importance

Changes 8
Bugs 0 Features 0
Metric Value
eloc 62
dl 0
loc 174
rs 10
c 8
b 0
f 0
wmc 18

6 Methods

Rating   Name   Duplication   Size   Complexity  
A isValidString() 0 22 3
A buildUri() 0 18 4
A __construct() 0 3 1
A parse() 0 18 4
A setMode() 0 3 1
A decodeHost() 0 16 5
1
<?php
2
3
namespace Riimu\Kit\UrlParser;
4
5
/**
6
 * Provides a RFC 3986 compliant solution to URL parsing.
7
 *
8
 * UriParser provides a method for parsing URLs that accurately complies with
9
 * the RFC specification. Unlike the built-in function `parse_url()`, the parser in
10
 * this library is based on the ABNF definition of the generic URI syntax. In
11
 * other words, this library does not allow any kind of invalid URLs and parses
12
 * them exactly as defined in the specification.
13
 *
14
 * While the intention of this library is to provide an accurate implementation
15
 * for URL parsing, it is possible to use this library for parsing any kind of
16
 * valid URIs, since the parsing is simply based on the generic URI syntax.
17
 * Some of the features are simply more suited to dealing with URLs. The parser,
18
 * however, does not provide any additional validation based on the URI scheme.
19
 *
20
 * While the RFC specification does not allow UTF-8 characters in URIs, these
21
 * are still commonly used, especially in user input. To accommodate this fact,
22
 * the parser provides two additional compatibility modes that permit UTF-8 in
23
 * some of the URI components in addition to providing a simple support for
24
 * international domain names.
25
 *
26
 * @see https://tools.ietf.org/html/rfc3986
27
 * @author Riikka Kalliomäki <[email protected]>
28
 * @copyright Copyright (c) 2015-2017 Riikka Kalliomäki
29
 * @license http://opensource.org/licenses/mit-license.php MIT License
30
 */
31
class UriParser
{
    /** Parsing mode that conforms strictly to the RFC 3986 specification */
    const MODE_RFC3986 = 1;

    /** Parsing mode that allows UTF-8 characters in some URI components */
    const MODE_UTF8 = 2;

    /**
     * Parsing mode that also converts international domain names to ascii.
     * @deprecated Use MODE_IDNA instead
     * @see UriParser::MODE_IDNA
     */
    const MODE_IDNA2003 = 4;

    /** Parsing mode that also converts international domain names to ascii */
    const MODE_IDNA = 4;

    /**
     * Maps the names of matched URI components to the `Uri` method that
     * assigns them. All four path variants are assigned via `withPath`.
     *
     * @var array<string,string>
     */
    private static $setters = [
        'scheme' => 'withScheme',
        'host' => 'withHost',
        'port' => 'withPort',
        'path_abempty' => 'withPath',
        'path_absolute' => 'withPath',
        'path_noscheme' => 'withPath',
        'path_rootless' => 'withPath',
        'query' => 'withQuery',
        'fragment' => 'withFragment',
    ];

    /** @var int The current parsing mode */
    private $mode;

    /**
     * Creates a new instance of UriParser.
     *
     * The parser starts in the strict `MODE_RFC3986` mode.
     */
    public function __construct()
    {
        $this->mode = self::MODE_RFC3986;
    }

    /**
     * Sets the parsing mode.
     *
     * The parser supports three different parsing modes as indicated by the
     * available parsing mode constants. The modes are as follows:
     *
     * - `MODE_RFC3986` adheres strictly to the RFC specification and does not
     *   allow any non ascii characters in the URIs. This is the default mode.
     *
     * - `MODE_UTF8` allows UTF-8 characters in the user information, path,
     *   query and fragment components of the URI. These characters will be
     *   converted to appropriate percent encoded sequences.
     *
     * - `MODE_IDNA` also allows UTF-8 characters in the domain name and
     *   converts the international domain name to ascii according to the IDNA
     *   standard.
     *
     * The value is cast to an integer before it is stored.
     *
     * @param int $mode One of the parsing mode constants
     */
    public function setMode($mode)
    {
        $this->mode = (int) $mode;
    }

    /**
     * Parses the URL using the generic URI syntax.
     *
     * This method returns the `Uri` instance constructed from the components
     * parsed from the URL. The URL is parsed using either the absolute URI
     * pattern or the relative URI pattern based on which one matches the
     * provided string. If the URL cannot be parsed as a valid URI, null is
     * returned instead.
     *
     * @param string $uri The URL to parse
     * @return Uri|null The parsed URL or null if the URL is invalid
     */
    public function parse($uri)
    {
        // Reject strings with bytes the current mode does not permit before
        // any pattern matching is attempted.
        if (!$this->isValidString($uri)) {
            return null;
        }

        $pattern = new UriPattern();
        $pattern->allowNonAscii($this->mode !== self::MODE_RFC3986);

        if (!$pattern->matchUri($uri, $match)) {
            return null;
        }

        try {
            return $this->buildUri($match);
        } catch (\InvalidArgumentException $exception) {
            // An invalid component (e.g. a hostname that cannot be decoded)
            // makes the whole URI invalid.
            return null;
        }
    }

    /**
     * Tells if the URI string is valid for the current parser mode.
     *
     * Pure ASCII strings are always accepted. Other strings are rejected in
     * `MODE_RFC3986` and must be well formed UTF-8 in any other mode.
     *
     * @param string $uri The URI to validate
     * @return bool True if the string is valid, false if not
     */
    private function isValidString($uri)
    {
        if ($this->isAscii($uri)) {
            return true;
        }

        if ($this->mode === self::MODE_RFC3986) {
            return false;
        }

        // Validate UTF-8 via regular expression to avoid mbstring dependency
        $pattern =
            '/^(?>
                [\x00-\x7F]+                       # ASCII
              | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
              |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding over longs
              | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
              |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
              |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
              | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
              |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
            )*$/x';

        return (bool) preg_match($pattern, $uri);
    }

    /**
     * Tells if the string consists solely of 7-bit ASCII bytes.
     * @param string $string The string to test
     * @return bool True if every byte is in the range 0x00-0x7F, false if not
     */
    private function isAscii($string)
    {
        return (bool) preg_match('/^[\\x00-\\x7F]*$/', $string);
    }

    /**
     * Builds the Uri instance from the parsed components.
     *
     * Each matched component that has an entry in the setter map is applied
     * to the URI via the mapped method. The user information is split into
     * username and password on the first colon and percent decoded.
     *
     * @param array<string, string> $components Components parsed from the URI
     * @return Uri The constructed URI representation
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function buildUri(array $components)
    {
        $uri = new Uri();

        // Only hosts matched as a registered name are decoded.
        if (isset($components['reg_name'])) {
            $components['host'] = $this->decodeHost($components['host']);
        }

        foreach (array_intersect_key(self::$setters, $components) as $key => $method) {
            $uri = $uri->$method($components[$key]);
        }

        if (isset($components['userinfo'])) {
            // explode + array_pad always yields exactly two entries, so a
            // missing password becomes an empty string without depending on
            // a regular expression matching the end of the string.
            list($username, $password) = array_pad(explode(':', $components['userinfo'], 2), 2, '');
            $uri = $uri->withUserInfo(rawurldecode($username), rawurldecode($password));
        }

        return $uri;
    }

    /**
     * Decodes the hostname component according to parser mode.
     *
     * ASCII hostnames are returned unchanged. Other hostnames are accepted
     * only in `MODE_IDNA`, in which case they are converted to ascii using
     * `idn_to_ascii()`.
     *
     * @param string $hostname The parsed hostname
     * @return string The decoded hostname
     * @throws \InvalidArgumentException If the hostname is not valid
     */
    private function decodeHost($hostname)
    {
        if ($this->isAscii($hostname)) {
            return $hostname;
        }

        if ($this->mode !== self::MODE_IDNA) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        // Prefer the UTS #46 variant when the constant is available and fall
        // back to the older IDNA 2003 variant otherwise.
        $variant = defined('INTL_IDNA_VARIANT_UTS46') ? INTL_IDNA_VARIANT_UTS46 : INTL_IDNA_VARIANT_2003;

        // Keep the result in its own variable so the original hostname is
        // still available for the error message when the conversion fails.
        $ascii = idn_to_ascii($hostname, IDNA_DEFAULT, $variant);

        if ($ascii === false) {
            throw new \InvalidArgumentException("Invalid hostname '$hostname'");
        }

        return $ascii;
    }
}
207