Completed
Pull Request — master (#132)
by Paweł
62:51
created

UriString::checkSchemePath()   A

Complexity

Conditions 5
Paths 3

Size

Total Lines 15
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 5

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 5
eloc 7
c 1
b 0
f 0
nc 3
nop 1
dl 0
loc 15
ccs 9
cts 9
cp 1
crap 5
rs 9.6111
1
<?php
2
3
/**
4
 * League.Uri (https://uri.thephpleague.com)
5
 *
6
 * (c) Ignace Nyamagana Butera <[email protected]>
7
 *
8
 * For the full copyright and license information, please view the LICENSE
9
 * file that was distributed with this source code.
10
 */
11
12
declare(strict_types=1);
13
14
namespace League\Uri;
15
16
use League\Uri\Exceptions\SyntaxError;
use TypeError;
use function array_merge;
use function explode;
use function filter_var;
use function gettype;
use function in_array;
use function inet_pton;
use function is_object;
use function is_scalar;
use function method_exists;
use function preg_match;
use function rawurldecode;
use function sprintf;
use function strpos;
use function substr;
use const FILTER_FLAG_IPV6;
use const FILTER_VALIDATE_IP;
32
33
/**
34
 * A class to parse a URI string according to RFC3986.
35
 *
36
 * @see     https://tools.ietf.org/html/rfc3986
37
 * @package League\Uri
38
 * @author  Ignace Nyamagana Butera <[email protected]>
39
 * @since   0.1.0
40
 */
41
final class UriString
{
    /**
     * Default URI component values.
     *
     * Every component is undefined (`null`) by default except the path,
     * which is always defined and defaults to the empty string.
     */
    private const URI_COMPONENTS = [
        'scheme' => null,
        'user' => null,
        'pass' => null,
        'host' => null,
        'port' => null,
        'path' => '',
        'query' => null,
        'fragment' => null,
    ];

    /**
     * Simple URIs which do not need any parsing.
     *
     * Each key is a complete URI string; the value lists the components
     * that differ from the defaults in URI_COMPONENTS.
     */
    private const URI_SHORTCUTS = [
        '' => [],
        '#' => ['fragment' => ''],
        '?' => ['query' => ''],
        '?#' => ['query' => '', 'fragment' => ''],
        '/' => ['path' => '/'],
        '//' => ['host' => ''],
    ];

    /**
     * RFC3986 regular expression URI splitter.
     *
     * Each component is captured twice: once with its delimiter (`scheme`,
     * `authority`, `query`, `fragment`) to tell "undefined" from "empty",
     * and once without it (`scontent`, `acontent`, `qcontent`, `fcontent`).
     *
     * @see https://tools.ietf.org/html/rfc3986#appendix-B
     */
    private const REGEXP_URI_PARTS = ',^
        (?<scheme>(?<scontent>[^:/?\#]+):)?    # URI scheme component
        (?<authority>//(?<acontent>[^/?\#]*))? # URI authority part
        (?<path>[^?\#]*)                       # URI path component
        (?<query>\?(?<qcontent>[^\#]*))?       # URI query component
        (?<fragment>\#(?<fcontent>.*))?        # URI fragment component
    ,x';

    /**
     * URI scheme regular expression.
     *
     * The empty string is accepted so that an undefined scheme passes the check.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.1
     */
    private const REGEXP_URI_SCHEME = '/^([a-z][a-z\d\+\.\-]*)?$/i';

    /**
     * Invalid path for URI without scheme and authority regular expression.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.3
     */
    private const REGEXP_INVALID_PATH = ',^(([^/]*):)(.*)?/,';

    /**
     * Host and Port splitter regular expression.
     */
    private const REGEXP_HOST_PORT = ',^(?<host>\[.*\]|[^:]*)(:(?<port>.*))?$,';

    /**
     * IDN Host detector regular expression.
     */
    private const REGEXP_IDN_PATTERN = '/[^\x20-\x7f]/';

    /**
     * Generate an URI string representation from its parsed representation
     * returned by UriString::parse() or PHP's parse_url.
     *
     * If you supply your own array, you are responsible for providing
     * valid components without their URI delimiters.
     *
     * Components that are missing or `null` are treated as undefined and
     * omitted together with their delimiter. The user information is only
     * emitted when a host is defined, and the password only when a user is.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-5.3
     * @see https://tools.ietf.org/html/rfc3986#section-7.5
     *
     * @param array $components URI components keyed as in UriString::parse()
     *
     * @return string the recomposed URI
     */
    public static function build(array $components): string
    {
        $result = $components['path'] ?? '';

        if (isset($components['query'])) {
            $result .= '?'.$components['query'];
        }

        if (isset($components['fragment'])) {
            $result .= '#'.$components['fragment'];
        }

        $scheme = isset($components['scheme']) ? $components['scheme'].':' : '';

        // Without a host there is no authority part: port and user info are ignored.
        if (!isset($components['host'])) {
            return $scheme.$result;
        }

        $authority = $components['host'];

        if (isset($components['port'])) {
            $authority .= ':'.$components['port'];
        }

        if (isset($components['user'])) {
            $userInfo = $components['user'];

            if (isset($components['pass'])) {
                $userInfo .= ':'.$components['pass'];
            }

            $authority = $userInfo.'@'.$authority;
        }

        return $scheme.'//'.$authority.$result;
    }

    /**
     * Parse an URI string into its components.
     *
     * This method parses a URI and returns an associative array containing any
     * of the various components of the URI that are present.
     *
     * <code>
     * $components = UriString::parse('http://foo@test.example.com:42?query#');
     * var_export($components);
     * //will display
     * array(
     *   'scheme' => 'http',           // the URI scheme component
     *   'user' => 'foo',              // the URI user component
     *   'pass' => null,               // the URI pass component
     *   'host' => 'test.example.com', // the URI host component
     *   'port' => 42,                 // the URI port component
     *   'path' => '',                 // the URI path component
     *   'query' => 'query',           // the URI query component
     *   'fragment' => '',             // the URI fragment component
     * );
     * </code>
     *
     * The returned array is similar to PHP's parse_url return value with the following
     * differences:
     *
     * <ul>
     * <li>All components are always present in the returned array</li>
     * <li>Empty and undefined components are treated differently. An empty component is
     *   set to the empty string while an undefined component is set to the `null` value.</li>
     * <li>The path component is never undefined</li>
     * <li>The method parses the URI following the RFC3986 rules but you are still
     *   required to validate the returned components against its related scheme specific rules.</li>
     * </ul>
     *
     * @see https://tools.ietf.org/html/rfc3986
     *
     * @param mixed $uri any scalar or stringable object
     *
     * @throws TypeError   if the URI is neither a scalar nor a stringable object
     * @throws SyntaxError if the URI contains invalid characters
     * @throws SyntaxError if the URI contains an invalid scheme
     * @throws SyntaxError if the URI contains an invalid path
     *
     * @return array the URI components, keyed as in URI_COMPONENTS
     */
    public static function parse($uri): array
    {
        // The is_object() guard matters: on PHP 8 method_exists() raises its own
        // engine-level TypeError when given a value that is neither an object nor
        // a string (null, array, ...), which would hide the message below.
        if (!is_scalar($uri) && (!is_object($uri) || !method_exists($uri, '__toString'))) {
            throw new TypeError(sprintf('The uri must be a scalar or a stringable object `%s` given', gettype($uri)));
        }

        $uri = (string) $uri;
        $components = self::parseSimply($uri);

        if (null !== $components) {
            return $components;
        }

        $parts = self::checkSchemePath($uri);

        // A delimiter-inclusive capture equal to '' means the component is undefined.
        return array_merge(
            self::URI_COMPONENTS,
            '' === $parts['authority'] ? [] : self::parseAuthority($parts['acontent']),
            [
                'path' => $parts['path'],
                'scheme' => '' === $parts['scheme'] ? null : $parts['scontent'],
                'query' => '' === $parts['query'] ? null : $parts['qcontent'],
                'fragment' => '' === $parts['fragment'] ? null : $parts['fcontent'],
            ]
        );
    }

    /**
     * Parse URI when it can be simply parsed.
     *
     * Handles the hard-coded shortcuts and URIs starting with `#` or `?`.
     * Returns `null` when the URI requires the full RFC3986 splitter.
     *
     * @throws SyntaxError if the URI contains invalid characters
     */
    private static function parseSimply(string $uri): ?array
    {
        if (isset(self::URI_SHORTCUTS[$uri])) {
            return array_merge(self::URI_COMPONENTS, self::URI_SHORTCUTS[$uri]);
        }

        if (1 === preg_match(Common::REGEXP_INVALID_URI_CHARS, $uri)) {
            throw new SyntaxError(sprintf('The uri `%s` contains invalid characters', $uri));
        }

        // If the first character is a known URI delimiter parsing can be simplified.
        // The empty string is one of the shortcuts above, so offset 0 always exists here.
        $first_char = $uri[0];

        // The URI is made of the fragment only
        if ('#' === $first_char) {
            [, $fragment] = explode('#', $uri, 2);
            $components = self::URI_COMPONENTS;
            $components['fragment'] = $fragment;

            return $components;
        }

        // The URI is made of the query and, optionally, the fragment
        if ('?' === $first_char) {
            [, $partial] = explode('?', $uri, 2);
            // Pad with null so a missing fragment stays undefined.
            [$query, $fragment] = explode('#', $partial, 2) + [1 => null];
            $components = self::URI_COMPONENTS;
            $components['query'] = $query;
            $components['fragment'] = $fragment;

            return $components;
        }

        return null;
    }

    /**
     * Split the URI with the RFC3986 regular expression and check its
     * scheme and path parts.
     *
     * @throws SyntaxError if the scheme is invalid
     * @throws SyntaxError if the URI has neither scheme nor authority and its path is invalid
     *
     * @return array the named captures of REGEXP_URI_PARTS, with `query`
     *               and `fragment` defaulting to the empty string
     */
    private static function checkSchemePath(string $uri): array
    {
        // Use the RFC3986 URI regexp to split the URI
        preg_match(self::REGEXP_URI_PARTS, $uri, $parts);
        // preg_match() drops trailing groups that did not participate in the match.
        $parts += ['query' => '', 'fragment' => ''];

        if (':' === $parts['scheme'] || 1 !== preg_match(self::REGEXP_URI_SCHEME, $parts['scontent'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid scheme', $uri));
        }

        if ('' === $parts['scheme'].$parts['authority'] && 1 === preg_match(self::REGEXP_INVALID_PATH, $parts['path'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid path.', $uri));
        }

        return $parts;
    }

    /**
     * Parses the URI authority part.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.2
     *
     * @throws SyntaxError If the port component is invalid
     * @throws SyntaxError If the host component is invalid
     *
     * @return array the `user`, `pass`, `host` and `port` components
     */
    private static function parseAuthority(string $authority): array
    {
        $components = ['user' => null, 'pass' => null, 'host' => '', 'port' => null];

        if ('' === $authority) {
            return $components;
        }

        // Split on the first '@': user information on the left, host and port on the right.
        $parts = explode('@', $authority, 2);

        if (isset($parts[1])) {
            [$components['user'], $components['pass']] = explode(':', $parts[0], 2) + [1 => null];
        }

        preg_match(self::REGEXP_HOST_PORT, $parts[1] ?? $parts[0], $matches);
        $matches += ['port' => ''];

        $components['port'] = self::filterPort($matches['port']);
        $components['host'] = self::filterHost($matches['host']);

        return $components;
    }

    /**
     * Filter and format the port component.
     *
     * An empty port is considered undefined and yields `null`; a port made
     * of digits only is cast to an integer.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.2.3
     *
     * @throws SyntaxError if the port is invalid
     */
    private static function filterPort(string $port): ?int
    {
        if ('' === $port) {
            return null;
        }

        if (1 === preg_match('/^\d*$/', $port)) {
            return (int) $port;
        }

        throw new SyntaxError(sprintf('The port `%s` is invalid', $port));
    }

    /**
     * Filter and validate the host component.
     *
     * An empty host is returned as is. A host enclosed in square brackets
     * must be a valid IP literal; any other host is delegated to
     * Common::filterRegisteredName().
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.2.2
     *
     * @throws SyntaxError if the IP host is malformed
     * @throws SyntaxError if the registered name is invalid
     */
    private static function filterHost(string $host): string
    {
        if ('' === $host) {
            return $host;
        }

        if ('[' !== $host[0] || ']' !== substr($host, -1)) {
            return Common::filterRegisteredName($host, false);
        }

        // Validate the content between the enclosing brackets.
        if (!self::isIpHost(substr($host, 1, -1))) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the IP host is malformed', $host));
        }

        return $host;
    }

    /**
     * Validates a IPv6/IPvfuture host.
     *
     * The host, given without its enclosing brackets, is accepted when it is:
     * a plain IPv6 address; an IPvFuture literal whose version is neither
     * `4` nor `6`; or an IPv6 address followed by a `%`-introduced zone
     * identifier free of invalid host characters, provided the binary address
     * starts with Common::ZONE_ID_ADDRESS_BLOCK.
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.2.2
     * @see https://tools.ietf.org/html/rfc6874#section-2
     * @see https://tools.ietf.org/html/rfc6874#section-4
     */
    private static function isIpHost(string $ip_host): bool
    {
        if (false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
            return true;
        }

        if (1 === preg_match(Common::REGEXP_HOST_IPFUTURE, $ip_host, $matches)) {
            return !in_array($matches['version'], ['4', '6'], true);
        }

        // From here on, the only valid form left is an IPv6 address with a zone identifier.
        $pos = strpos($ip_host, '%');

        if (false === $pos || 1 === preg_match(
            Common::REGEXP_INVALID_HOST_CHARS,
            rawurldecode(substr($ip_host, $pos))
        )) {
            return false;
        }

        $ip_host = substr($ip_host, 0, $pos);

        return false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)
            && 0 === strpos((string) inet_pton($ip_host), Common::ZONE_ID_ADDRESS_BLOCK);
    }
}
397