Completed
Push — develop ( 1f8417...7020e4 )
by Lars
03:18
created

Parser::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 6
Bugs 3 Features 1
Metric Value
cc 1
eloc 3
c 6
b 3
f 1
nc 1
nop 1
dl 0
loc 5
ccs 0
cts 0
cp 0
crap 2
rs 9.4285
1
<?php
2
3
/**
4
 * PHP Domain Parser: Public Suffix List based URL parsing.
5
 *
6
 * @link      http://github.com/jeremykendall/php-domain-parser for the canonical source repository
7
 *
8
 * @copyright Copyright (c) 2014 Jeremy Kendall (http://about.me/jeremykendall)
9
 * @license   http://github.com/jeremykendall/php-domain-parser/blob/master/LICENSE MIT License
10
 */
11
namespace Pdp;
12
13
use Pdp\Exception\SeriouslyMalformedUrlException;
14
use Pdp\Uri\Url;
15
use Pdp\Uri\Url\Host;
16
use voku\helper\UTF8;
17
18
/**
19
 * Parser.
20
 *
21
 * This class is responsible for Public Suffix List based URL parsing
22
 */
23
class Parser
{
  /**
   * @var string RFC 3986 compliant scheme regex pattern
   *
   * @see https://tools.ietf.org/html/rfc3986#section-3.1
   */
  const SCHEME_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*)://#';

  /**
   * @var string IPv4 address regex pattern (four dot-separated octets, each 0-255)
   */
  const IP_ADDRESS_PATTERN = '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/';

  /**
   * @var PublicSuffixList Public Suffix List
   */
  protected $publicSuffixList;

  /**
   * @var bool Whether the host part handled by the last normalize() call was
   *           punycode-encoded by this class and therefore has to be decoded
   *           again by denormalize()
   */
  protected $isNormalized = false;

  /**
   * @var PunycodeWrapper Used by normalize() / denormalize() to encode and decode host parts
   */
  private $punycodeWrapper;

  /**
   * Public constructor.
   *
   * @codeCoverageIgnore
   *
   * @param PublicSuffixList $publicSuffixList Instance of PublicSuffixList
   */
  public function __construct(PublicSuffixList $publicSuffixList)
  {
    $this->publicSuffixList = $publicSuffixList;
    $this->punycodeWrapper = new PunycodeWrapper();
  }

  /**
   * Parses url.
   *
   * @param string $url Url to parse
   *
   * @return Url Object representation of url
   *
   * @throws SeriouslyMalformedUrlException If pdp_parse_url() reports the url as unparsable
   */
  public function parseUrl($url)
  {
    $rawUrl = $url;
    $hackScheme = 'php-lt-5.4.7-hack';

    // Defaults for every component; components missing from the parse
    // result stay null.
    $elem = array(
        'scheme'   => null,
        'user'     => null,
        'pass'     => null,
        'host'     => null,
        'port'     => null,
        'path'     => null,
        'query'    => null,
        'fragment' => null,
    );

    if (preg_match(self::SCHEME_PATTERN, $url) === 0) {
      // Wacky scheme required to overcome parse_url behavior in PHP lt 5.4.7
      // See https://github.com/jeremykendall/php-domain-parser/issues/49
      $url = $hackScheme . '://' . preg_replace('#^//#', '', $url, 1);
    }

    $parts = pdp_parse_url($url);

    if ($parts === false) {
      throw new SeriouslyMalformedUrlException($rawUrl);
    }

    /** @noinspection AdditionOperationOnArraysInspection */
    $elem = (array)$parts + $elem;

    if ($elem['scheme'] === $hackScheme) {
      // Remove wacky scheme required to overcome parse_url behavior in PHP lt 5.4.7
      // See https://github.com/jeremykendall/php-domain-parser/issues/49
      $elem['scheme'] = null;
    }

    // Read the host from the merged defaults, not from the raw parse result:
    // urls without an authority (e.g. "file:///path") have no 'host' key and
    // would otherwise raise an undefined-index notice.
    $host = $this->parseHost($elem['host']);

    return new Url(
        $elem['scheme'],
        $elem['user'],
        $elem['pass'],
        $host,
        $elem['port'],
        $elem['path'],
        $elem['query'],
        $elem['fragment']
    );
  }

  /**
   * Parses host part of url.
   *
   * The host is lower-cased before it is split into subdomain, registrable
   * domain and public suffix.
   *
   * @param string $host Host part of url
   *
   * @return Host Object representation of host portion of url
   */
  public function parseHost($host)
  {
    $host = UTF8::strtolower($host);

    return new Host(
        $this->getSubdomain($host),
        $this->getRegistrableDomain($host),
        $this->getPublicSuffix($host),
        $host
    );
  }

  /**
   * Get the raw public suffix based on the cached public suffix list file.
   * Return false if the provided suffix is not included in the PSL.
   *
   * The host labels are walked from right to left through the nested rule
   * tree: an exact label match descends one level, a '*' entry matches any
   * label, and a label whose rule entry contains '!' (exception rule) stops
   * the walk without becoming part of the suffix.
   *
   * @param string $host The host to process
   *
   * @return string|false The suffix or false if suffix not included in the PSL
   */
  protected function getRawPublicSuffix($host)
  {
    $host = $this->normalize($host);

    $rules = $this->publicSuffixList;
    if ($rules instanceof \ArrayObject) {
      // array_key_exists() on objects is deprecated as of PHP 7.4 and a
      // TypeError as of PHP 8, so work on the plain array behind the list.
      // NOTE(review): assumes PublicSuffixList is an ArrayObject - confirm;
      // any other type is used unchanged, exactly as before.
      $rules = $rules->getArrayCopy();
    }

    $publicSuffix = array();

    foreach (array_reverse(explode('.', $host)) as $label) {
      if (array_key_exists($label, $rules)) {
        if (array_key_exists('!', $rules[$label])) {
          // Exception rule: this label is not part of the public suffix.
          break;
        }

        array_unshift($publicSuffix, $label);
        $rules = $rules[$label];
        continue;
      }

      if (array_key_exists('*', $rules)) {
        array_unshift($publicSuffix, $label);
        $rules = $rules['*'];
        continue;
      }

      // Avoids improper parsing when $host's subdomain + public suffix ===
      // a valid public suffix (e.g. host 'us.example.com' and public suffix 'us.com')
      //
      // Added by @goodhabit in https://github.com/jeremykendall/php-domain-parser/pull/15
      // Resolves https://github.com/jeremykendall/php-domain-parser/issues/16
      break;
    }

    // If empty, then the suffix is not included in the PSL and is
    // considered "invalid". This also triggers algorithm rule #2: If no
    // rules match, the prevailing rule is "*".
    if (empty($publicSuffix)) {
      // denormalize() is skipped on this path, so clear the flag here;
      // otherwise a later punycoded host would be decoded by mistake.
      $this->isNormalized = false;

      return false;
    }

    $suffix = implode('.', array_filter($publicSuffix, 'strlen'));

    return $this->denormalize($suffix);
  }

  /**
   * Returns the public suffix portion of provided host.
   *
   * @param string $host host
   *
   * @return string|null public suffix or null if host does not contain a public suffix
   */
  public function getPublicSuffix($host)
  {
    // A host starting with '.' has no usable public suffix.
    if (strpos($host, '.') === 0) {
      return null;
    }

    // Fixes #22: If a single label domain makes it this far (e.g.,
    // localhost, foo, etc.), this stops it from incorrectly being set as
    // the public suffix.
    if (!$this->isMultiLabelDomain($host)) {
      return null;
    }

    // Fixes #43
    if ($this->isIpv4Address($host)) {
      return null;
    }

    $suffix = $this->getRawPublicSuffix($host);

    // Apply algorithm rule #2: If no rules match, the prevailing rule is "*",
    // i.e. the right-most label is the public suffix.
    if (false === $suffix) {
      $labels = explode('.', $host);
      $suffix = end($labels);
    }

    return $suffix;
  }

  /**
   * Is suffix valid?
   *
   * Validity determined by whether or not the suffix is included in the PSL.
   *
   * @param string $host Host part
   *
   * @return bool True if suffix is valid, false otherwise
   */
  public function isSuffixValid($host)
  {
    return $this->getRawPublicSuffix($host) !== false;
  }

  /**
   * Returns registrable domain portion of provided host.
   *
   * Per the test cases provided by Mozilla
   * (http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1),
   * this method should return null if the domain provided is a public suffix.
   *
   * @param string $host host
   *
   * @return string|null registrable domain
   */
  public function getRegistrableDomain($host)
  {
    if (!$this->isMultiLabelDomain($host)) {
      return null;
    }

    $publicSuffix = $this->getPublicSuffix($host);

    // Strict comparison on purpose: with '==' numeric-looking strings such as
    // '0.0' and '0' compare equal and the host would be mistaken for a bare
    // public suffix.
    if ($publicSuffix === null || $host === $publicSuffix) {
      return null;
    }

    $publicSuffixParts = array_reverse(explode('.', $publicSuffix));
    $hostParts = array_reverse(explode('.', $host));

    // Array union keeps the suffix labels (keys 0..n-1) and adds only the
    // one extra host label at key n: the label directly left of the suffix.
    $registrableDomainParts = $publicSuffixParts + array_slice($hostParts, 0, count($publicSuffixParts) + 1);

    return implode('.', array_reverse($registrableDomainParts));
  }

  /**
   * Returns the subdomain portion of provided host.
   *
   * @param string $host host
   *
   * @return string|null subdomain, or null if the host has no registrable
   *                     domain or consists of the registrable domain only
   */
  public function getSubdomain($host)
  {
    $registrableDomain = $this->getRegistrableDomain($host);

    if ($registrableDomain === null || $host === $registrableDomain) {
      return null;
    }

    $registrableDomainParts = array_reverse(explode('.', $registrableDomain));

    $host = $this->normalize($host);

    // Everything left of the registrable domain is the subdomain.
    $hostParts = array_reverse(explode('.', $host));
    $subdomainParts = array_slice($hostParts, count($registrableDomainParts));

    $subdomain = implode('.', array_reverse($subdomainParts));

    return $this->denormalize($subdomain);
  }

  /**
   * If a URL is not punycoded, then it may be an IDNA URL, so it must be
   * converted to ASCII. Performs conversion and sets flag.
   *
   * The flag is written on every call (true if the part was encoded here,
   * false if it already contained 'xn--'), so a value left over from an
   * earlier call can never make denormalize() decode a host that was
   * punycoded to begin with.
   *
   * @param string $part Host part
   *
   * @return string Lower-cased host part, transformed if not punycoded
   */
  protected function normalize($part)
  {
    $punycoded = (strpos($part, 'xn--') !== false);

    if ($punycoded === false) {
      $part = $this->punycodeWrapper->encode($part);
    }

    $this->isNormalized = !$punycoded;

    return strtolower($part);
  }

  /**
   * Converts any normalized part back to IDNA. Performs conversion and
   * resets flag.
   *
   * @param string $part Host part
   *
   * @return string Denormalized host part
   */
  protected function denormalize($part)
  {
    if ($this->isNormalized === true) {
      $part = $this->punycodeWrapper->decode($part);
      $this->isNormalized = false;
    }

    return $part;
  }

  /**
   * Tests host for presence of '.'.
   *
   * Related to #22
   *
   * @param string $host Host part of url
   *
   * @return bool True if multi-label domain, false otherwise
   */
  protected function isMultiLabelDomain($host)
  {
    return strpos($host, '.') !== false;
  }

  /**
   * Tests host to determine if it is an IPv4 address.
   *
   * Related to #43
   *
   * @param string $host Host part of url
   *
   * @return bool True if host matches IP_ADDRESS_PATTERN, false otherwise
   */
  protected function isIpv4Address($host)
  {
    return preg_match(self::IP_ADDRESS_PATTERN, $host) === 1;
  }
}
368