Parser::parseUrl()   B
last analyzed

Complexity

Conditions 5
Paths 6

Size

Total Lines 46

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 5

Importance

Changes 0
Metric Value
cc 5
nc 6
nop 1
dl 0
loc 46
ccs 21
cts 21
cp 1
crap 5
rs 8.867
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
/**
6
 * PHP Domain Parser: Public Suffix List based URL parsing.
7
 *
8
 * @link      http://github.com/jeremykendall/php-domain-parser for the canonical source repository
9
 *
10
 * @copyright Copyright (c) 2014 Jeremy Kendall (http://about.me/jeremykendall)
11
 * @license   http://github.com/jeremykendall/php-domain-parser/blob/master/LICENSE MIT License
12
 */
13
14
namespace Pdp;
15
16
use Pdp\Exception\SeriouslyMalformedUrlException;
17
use Pdp\Uri\Url;
18
use Pdp\Uri\Url\Host;
19
use voku\helper\UTF8;
20
21
/**
 * Parser.
 *
 * This class is responsible for Public Suffix List based url parsing: it
 * splits a url into its components and splits a host into subdomain,
 * registrable domain and public suffix.
 */
class Parser
{
  /**
   * @var string RFC 3986 compliant scheme regex pattern
   *
   * @see https://tools.ietf.org/html/rfc3986#section-3.1
   */
  const SCHEME_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*)://#';

  /**
   * @var string IP address regex pattern (dotted-quad IPv4, each octet 0-255)
   */
  const IP_ADDRESS_PATTERN = '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/';

  /**
   * @var PublicSuffixList Public Suffix List, walked as a nested map keyed by
   *                       domain label (right-most label first)
   */
  protected $publicSuffixList;

  /**
   * @var bool Whether or not a host part has been normalized (punycode
   *           encoded) by the most recent call to normalize()
   */
  protected $isNormalized = false;

  /**
   * @var PunycodeWrapper
   */
  private $punycodeWrapper;

  /**
   * Public constructor.
   *
   * @codeCoverageIgnore
   *
   * @param PublicSuffixList $publicSuffixList Instance of PublicSuffixList
   */
  public function __construct(PublicSuffixList $publicSuffixList)
  {
    $this->publicSuffixList = $publicSuffixList;
    $this->punycodeWrapper = new PunycodeWrapper();
  }

  /**
   * Parses url.
   *
   * @param string $url Url to parse
   *
   * @throws SeriouslyMalformedUrlException if the url cannot be parsed or has no host
   *
   * @return Url Object representation of url
   */
  public function parseUrl($url): Url
  {
    $rawUrl = $url;
    $defaults = [
        'scheme'   => null,
        'user'     => null,
        'pass'     => null,
        'host'     => null,
        'port'     => null,
        'path'     => null,
        'query'    => null,
        'fragment' => null,
    ];

    if (\preg_match(self::SCHEME_PATTERN, $url) === 0) {
      // Wacky scheme required to overcome parse_url behavior in PHP
      // See https://github.com/jeremykendall/php-domain-parser/issues/49
      $url = 'php-hack://' . \preg_replace('#^//#', '', $url, 1);
    }

    $parts = pdp_parse_url($url);

    if ($parts === false || !isset($parts['host'])) {
      throw new SeriouslyMalformedUrlException($rawUrl);
    }

    // Strip the placeholder scheme again; the "??" guards against a result
    // that carries no scheme key at all.
    if (($parts['scheme'] ?? null) === 'php-hack') {
      $parts['scheme'] = null;
    }

    // Array union: parsed components win, missing ones fall back to null.
    /** @noinspection AdditionOperationOnArraysInspection */
    $elem = (array)$parts + $defaults;

    $host = $this->parseHost($parts['host']);

    return new Url(
        $elem['scheme'],
        $elem['user'],
        $elem['pass'],
        $host,
        $elem['port'],
        $elem['path'],
        $elem['query'],
        $elem['fragment']
    );
  }

  /**
   * Parses host part of url.
   *
   * The host is lower-cased first; subdomain, registrable domain and public
   * suffix are then each derived from the lower-cased host.
   *
   * @param string $host Host part of url
   *
   * @return Host Object representation of host portion of url
   */
  public function parseHost(string $host): Host
  {
    $host = UTF8::strtolower($host);

    return new Host(
        $this->getSubdomain($host),
        $this->getRegistrableDomain($host),
        $this->getPublicSuffix($host),
        $host
    );
  }

  /**
   * Get the raw public suffix based on the cached public suffix list file.
   * Return false if the provided suffix is not included in the PSL.
   *
   * The host labels are walked from right to left, descending one level
   * into the suffix list for every label that matches.
   *
   * @param string $host The host to process
   *
   * @return string|false The suffix or false if suffix not included in the PSL
   */
  protected function getRawPublicSuffix(string $host)
  {
    $host = $this->normalize($host);

    $labels = \array_reverse(\explode('.', $host));
    $publicSuffix = [];
    // NOTE(review): array_key_exists() on an object is deprecated since
    // PHP 7.4 and rejected by PHP 8 - confirm how PublicSuffixList is meant
    // to be accessed before running this on newer PHP versions.
    $rules = $this->publicSuffixList;

    foreach ($labels as $label) {
      if (\array_key_exists($label, $rules)) {
        // A "!" entry below the matching label ends the walk *before* the
        // label is added to the suffix.
        if (\array_key_exists('!', $rules[$label])) {
          break;
        }

        \array_unshift($publicSuffix, $label);
        $rules = $rules[$label];
        continue;
      }

      // Wildcard entry: any label is accepted at this level.
      if (\array_key_exists('*', $rules)) {
        \array_unshift($publicSuffix, $label);
        $rules = $rules['*'];
        continue;
      }

      // Avoids improper parsing when $host's subdomain + public suffix ===
      // a valid public suffix (e.g. host 'us.example.com' and public suffix 'us.com')
      //
      // Added by @goodhabit in https://github.com/jeremykendall/php-domain-parser/pull/15
      // Resolves https://github.com/jeremykendall/php-domain-parser/issues/16
      break;
    }

    // If empty, then the suffix is not included in the PSL and is
    // considered "invalid". This also triggers algorithm rule #2: If no
    // rules match, the prevailing rule is "*".
    if (empty($publicSuffix)) {
      // denormalize() is skipped on this path, so clear the flag here;
      // otherwise it would leak into the next, unrelated call.
      $this->isNormalized = false;

      return false;
    }

    $suffix = \implode('.', \array_filter($publicSuffix, '\strlen'));

    return $this->denormalize($suffix);
  }

  /**
   * Returns the public suffix portion of provided host.
   *
   * @param string $host host
   *
   * @return string|null public suffix or null if host does not contain a public suffix
   */
  public function getPublicSuffix(string $host)
  {
    // A host starting with "." has no usable public suffix.
    if (\strpos($host, '.') === 0) {
      return null;
    }

    // Fixes #22: If a single label domain makes it this far (e.g.,
    // localhost, foo, etc.), this stops it from incorrectly being set as
    // the public suffix.
    if (!$this->isMultiLabelDomain($host)) {
      return null;
    }

    // Fixes #43
    if ($this->isIpv4Address($host)) {
      return null;
    }

    $suffix = $this->getRawPublicSuffix($host);

    // Apply algorithm rule #2: If no rules match, the prevailing rule is "*",
    // i.e. the right-most label of the host is the public suffix.
    if ($suffix === false) {
      $labels = \explode('.', $host);
      $suffix = \array_pop($labels);
    }

    return $suffix;
  }

  /**
   * Is suffix valid?
   *
   * Validity determined by whether or not the suffix is included in the PSL.
   *
   * @param string $host Host part
   *
   * @return bool True is suffix is valid, false otherwise
   */
  public function isSuffixValid(string $host): bool
  {
    return $this->getRawPublicSuffix($host) !== false;
  }

  /**
   * Returns registrable domain portion of provided host.
   *
   * Per the test cases provided by Mozilla
   * (http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1),
   * this method should return null if the domain provided is a public suffix.
   *
   * @param string|null $host host
   *
   * @return string|null registrable domain
   */
  public function getRegistrableDomain($host)
  {
    if (!$this->isMultiLabelDomain($host)) {
      return null;
    }

    $publicSuffix = $this->getPublicSuffix($host);

    // Strict comparison on purpose: with "==" two numeric-looking strings
    // such as '0.0' and '0' are compared as numbers and count as equal.
    if ($publicSuffix === null || $host === $publicSuffix) {
      return null;
    }

    $publicSuffixParts = \array_reverse(\explode('.', $publicSuffix));
    $hostParts = \array_reverse(\explode('.', $host));

    // Array union on the reversed label lists: the suffix labels keep their
    // positions and the host contributes exactly one additional label.
    $registrableDomainParts = $publicSuffixParts + \array_slice($hostParts, 0, \count($publicSuffixParts) + 1);

    return \implode('.', \array_reverse($registrableDomainParts));
  }

  /**
   * Returns the subdomain portion of provided host.
   *
   * @param string $host host
   *
   * @return string|null subdomain or null if the host has no registrable
   *                     domain or consists of the registrable domain only
   */
  public function getSubdomain(string $host)
  {
    $registrableDomain = $this->getRegistrableDomain($host);

    if ($registrableDomain === null || $host === $registrableDomain) {
      return null;
    }

    $registrableDomainParts = \array_reverse(\explode('.', $registrableDomain));

    $host = $this->normalize($host);

    // Everything left of the registrable domain's labels is the subdomain.
    $hostParts = \array_reverse(\explode('.', $host));
    $subdomainParts = \array_slice($hostParts, \count($registrableDomainParts));

    $subdomain = \implode('.', \array_reverse($subdomainParts));

    return $this->denormalize($subdomain);
  }

  /**
   * If a URL is not punycoded, then it may be an IDNA URL, so it must be
   * converted to ASCII. Performs conversion and sets flag.
   *
   * The flag is set on every call (false for input that already contains
   * "xn--"), so a value left over from an earlier call can never cause an
   * unrelated host part to be decoded by denormalize().
   *
   * @param string $part Host part
   *
   * @return string Host part, transformed if not punycoded
   */
  protected function normalize(string $part): string
  {
    $punycoded = (\strpos($part, 'xn--') !== false);

    $this->isNormalized = false;

    if ($punycoded === false) {
      $part = $this->punycodeWrapper->encode($part);
      $this->isNormalized = true;
    }

    return \strtolower($part);
  }

  /**
   * Converts any normalized part back to IDNA. Performs conversion and
   * resets flag.
   *
   * @param string $part Host part
   *
   * @return string Denormalized host part
   */
  protected function denormalize(string $part): string
  {
    if ($this->isNormalized === true) {
      $part = $this->punycodeWrapper->decode($part);
      $this->isNormalized = false;
    }

    return $part;
  }

  /**
   * Tests host for presence of '.'.
   *
   * Related to #22
   *
   * @param string|null $host Host part of url
   *
   * @return bool True if multi-label domain, false otherwise
   */
  protected function isMultiLabelDomain($host): bool
  {
    // Explicit type check instead of a loose truthiness test: null (and any
    // other non-string) is never multi-label, '' and '0' simply contain no dot.
    return \is_string($host) && \strpos($host, '.') !== false;
  }

  /**
   * Tests host to determine if it is an IP address.
   *
   * Related to #43
   *
   * @param string $host Host part of url
   *
   * @return bool True if host is an ip address, false otherwise
   */
  protected function isIpv4Address(string $host): bool
  {
    return \preg_match(self::IP_ADDRESS_PATTERN, $host) === 1;
  }
}
375