Extract::fixQueryPart()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 2
eloc 5
c 1
b 1
f 0
nc 2
nop 1
dl 0
loc 10
rs 9.4285
1
<?php
2
/**
3
 * TLDExtract: Library for extraction of domain parts e.g. TLD. Domain parser that uses Public Suffix List.
4
 *
5
 * @link      https://github.com/layershifter/TLDExtract
6
 *
7
 * @copyright Copyright (c) 2016, Alexander Fedyashov
8
 * @license   https://raw.githubusercontent.com/layershifter/TLDExtract/master/LICENSE Apache 2.0 License
9
 */
10
11
namespace LayerShifter\TLDExtract;
12
13
use LayerShifter\TLDDatabase\Store;
14
use LayerShifter\TLDExtract\Exceptions\RuntimeException;
15
use LayerShifter\TLDSupport\Helpers\Arr;
16
use LayerShifter\TLDSupport\Helpers\IP;
17
use LayerShifter\TLDSupport\Helpers\Str;
18
19
/**
20
 * Extract class accurately extracts subdomain, domain and TLD components from URLs.
21
 *
22
 * @see Result for more information on the returned data structure.
23
 */
24
class Extract
25
{
26
27
    /**
28
     * @const int If this option is provided, extraction will consider ICANN suffixes.
29
     */
30
    const MODE_ALLOW_ICCAN = 2;
31
    /**
32
     * @const int If this option is provided, extraction will consider private suffixes.
33
     */
34
    const MODE_ALLOW_PRIVATE = 4;
35
    /**
36
     * @const int If this option is provided, extraction will accept suffixes that are not in the database (custom domains).
37
     */
38
    const MODE_ALLOW_NOT_EXISTING_SUFFIXES = 8;
39
    /**
40
     * @const string RFC 3986 compliant scheme regex pattern.
41
     *
42
     * @see   https://tools.ietf.org/html/rfc3986#section-3.1
43
     */
44
    const SCHEMA_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*:)?//#';
45
46
    /**
47
     * @var int Value of extraction options.
48
     */
49
    private $extractionMode;
50
    /**
51
     * @var string Name of class that will store results of parsing.
52
     */
53
    private $resultClassName;
54
    /**
55
     * @var Store Object of TLDDatabase\Store class.
56
     */
57
    private $suffixStore;
58
59
    /**
     * Creates an extractor bound to a suffix database, a result class and an extraction mode.
     *
     * @param null|string $databaseFile    Optional, name of file with Public Suffix List database
     * @param null|string $resultClassName Optional, name of class that will store results of parsing;
     *                                     when given, it must exist and implement ResultInterface
     * @param null|int    $extractionMode  Optional, option that will control extraction process
     *
     * @throws RuntimeException If the result class is not defined or does not implement ResultInterface,
     *                          or if setExtractionMode() rejects the extraction mode
     */
    public function __construct($databaseFile = null, $resultClassName = null, $extractionMode = null)
    {
        $this->suffixStore = new Store($databaseFile);

        // Result is the default holder; a custom class replaces it only after validation.
        $className = Result::class;

        if (null !== $resultClassName) {
            if (!class_exists($resultClassName)) {
                throw new RuntimeException(sprintf('Class "%s" is not defined', $resultClassName));
            }

            $implementedInterfaces = class_implements($resultClassName);

            if (!in_array(ResultInterface::class, $implementedInterfaces, true)) {
                throw new RuntimeException(sprintf('Class "%s" not implements ResultInterface', $resultClassName));
            }

            $className = $resultClassName;
        }

        $this->resultClassName = $className;
        $this->setExtractionMode($extractionMode);
    }
89
90
    /**
     * Sets extraction mode, the option that controls which kinds of suffixes are recognised.
     *
     * Passing null enables every mode (ICCAN, private and not-existing suffixes).
     *
     * @param null|int $extractionMode One of the MODE_* constants or a bitwise combination of them
     *
     * @throws RuntimeException If the value is not an integer or is not a combination of MODE_* constants
     */
    public function setExtractionMode($extractionMode = null)
    {
        $allModes = static::MODE_ALLOW_ICCAN
            | static::MODE_ALLOW_PRIVATE
            | static::MODE_ALLOW_NOT_EXISTING_SUFFIXES;

        if (null === $extractionMode) {
            $this->extractionMode = $allModes;

            return;
        }

        if (!is_int($extractionMode)) {
            throw new RuntimeException('Invalid argument type, extractionMode must be integer');
        }

        // Every non-empty combination of the three mode flags.
        $acceptedModes = [
            static::MODE_ALLOW_ICCAN,
            static::MODE_ALLOW_PRIVATE,
            static::MODE_ALLOW_NOT_EXISTING_SUFFIXES,
            static::MODE_ALLOW_ICCAN | static::MODE_ALLOW_PRIVATE,
            static::MODE_ALLOW_ICCAN | static::MODE_ALLOW_NOT_EXISTING_SUFFIXES,
            static::MODE_ALLOW_PRIVATE | static::MODE_ALLOW_NOT_EXISTING_SUFFIXES,
            $allModes
        ];

        if (!in_array($extractionMode, $acceptedModes, true)) {
            throw new RuntimeException(
                'Invalid argument type, extractionMode must be one of defined constants of their combination'
            );
        }

        $this->extractionMode = $extractionMode;
    }
128
129
    /**
     * Extracts the subdomain, host and suffix components from a URL.
     *
     * When the extracted hostname is accepted by IP::isValid(), the result carries it as the host
     * with null subdomain and null suffix.
     *
     * @param string $url URL that will be extracted
     *
     * @return ResultInterface Instance of the configured result class
     */
    public function parse($url)
    {
        $resultClass = $this->resultClassName;
        $hostname = $this->extractHostname($url);

        // IP addresses have no subdomain or suffix, so they are returned as a bare host.
        if (IP::isValid($hostname)) {
            return new $resultClass(null, $hostname, null);
        }

        $parts = $this->extractParts($hostname);

        return new $resultClass($parts[0], $parts[1], $parts[2]);
    }
150
151
    /**
     * Method that extracts the hostname or IP address from a URL.
     *
     * The URL is lower-cased and trimmed, then scheme, path/query, user information and port
     * are stripped in that order.
     *
     * @param string $url URL for extraction
     *
     * @return null|string Hostname or IP address, null when nothing is left after stripping
     */
    private function extractHostname($url)
    {
        $normalized = trim(Str::lower($url));

        // Removes scheme i.e. "https://github.com/layershifter" to "github.com/layershifter".
        $withoutScheme = preg_replace(static::SCHEMA_PATTERN, '', $normalized);

        // Removes path and query part of URL i.e. "github.com/layershifter" to "github.com".
        $segments = explode('/', $this->fixQueryPart($withoutScheme), 2);
        $authority = Arr::first($segments);

        // Removes username from URL i.e. "user@github.com" to "github.com".
        $host = Arr::last(explode('@', $authority));

        // IPv6 literals like "[3ffe:2a00:100:7031::1]" are returned without brackets; anything
        // after the closing bracket (the port) is dropped.
        //
        // @see http://www.ietf.org/rfc/rfc2732.txt
        $closingBracket = Str::strrpos($host, ']');

        if (false !== $closingBracket && Str::startsWith($host, '[')) {
            return Str::substr($host, 1, $closingBracket - 1);
        }

        // This is either a normal hostname or an IPv4 address, just remove the port.
        $host = Arr::first(explode(':', $host));

        // Empty string means there is no hostname at all.
        return '' !== $host ? $host : null;
    }
193
194
    /**
     * Splits a hostname into subdomain, host and suffix. Based on the algorithm described in
     * https://publicsuffix.org/list/.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return array|string[] Array of three elements: subdomain, host and suffix (each may be null)
     */
    public function extractParts($hostname)
    {
        $suffix = $this->extractSuffix($hostname);

        // The whole hostname is a suffix: report it as the host, without subdomain or suffix.
        if ($hostname === $suffix) {
            return [null, $hostname, null];
        }

        $remainder = $hostname;

        if (null !== $suffix) {
            // Cuts off the suffix together with the dot that precedes it.
            $remainder = Str::substr($hostname, 0, -Str::length($suffix) - 1);
        }

        $lastDot = Str::strrpos($remainder, '.');

        // No dot left means there is no subdomain.
        if (false === $lastDot) {
            return [null, $remainder, $suffix];
        }

        // $lastDot is not false past this point, because that case returned above.
        return [
            Str::substr($remainder, 0, $lastDot),
            Str::substr($remainder, $lastDot + 1),
            $suffix
        ];
    }
229
230
    /**
     * Extracts suffix from hostname using Public Suffix List database.
     *
     * Hostnames containing "xn--" are converted to UTF-8 before lookup and the found suffix is
     * converted back to ASCII, so the returned suffix is in the same form as the input.
     * If no database entry matches and MODE_ALLOW_NOT_EXISTING_SUFFIXES is enabled, the last label
     * of the hostname is used as suffix.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Suffix, or null when the hostname is invalid, no suffix was found, or
     *                     IDN conversion failed
     */
    private function extractSuffix($hostname)
    {
        // If hostname has leading dot, it's invalid.
        // If hostname is a single label domain, it's invalid.
        if (Str::startsWith($hostname, '.') || Str::strpos($hostname, '.') === false) {
            return null;
        }

        // If domain is in punycode, it will be converted to IDN.
        $isPunycoded = Str::strpos($hostname, 'xn--') !== false;

        if ($isPunycoded) {
            // The variant is passed explicitly: the default one was INTL_IDNA_VARIANT_2003 before
            // PHP 7.4, which is deprecated since PHP 7.2.
            $hostname = idn_to_utf8($hostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

            // idn_to_utf8() returns false on failure; such hostname cannot be matched at all.
            if (false === $hostname) {
                return null;
            }
        }

        $suffix = $this->parseSuffix($hostname);

        if (null === $suffix) {
            if (!($this->extractionMode & static::MODE_ALLOW_NOT_EXISTING_SUFFIXES)) {
                return null;
            }

            // Falls back to the last label, a dot is guaranteed to exist by the check above.
            $suffix = Str::substr($hostname, Str::strrpos($hostname, '.') + 1);
        }

        if (!$isPunycoded) {
            return $suffix;
        }

        // If domain is punycoded, suffix will be converted back to punycode.
        $asciiSuffix = idn_to_ascii($suffix, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

        // idn_to_ascii() returns false on failure, which must not leak to callers expecting null|string.
        return false === $asciiSuffix ? null : $asciiSuffix;
    }
268
269
    /**
     * Looks up the suffix of a hostname in the Public Suffix List database.
     *
     * Candidates are tried from the full hostname down to its last label, so the first (longest)
     * match wins. For every candidate three kinds of entries are checked, in this order:
     * an exception entry ("!candidate"), an exact entry, and a wildcard entry ("*.parent").
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Matched suffix or null if no entry matches
     */
    private function parseSuffix($hostname)
    {
        $labels = explode('.', $hostname);
        $labelCount = count($labels);

        for ($offset = 0; $offset < $labelCount; $offset++) {
            $candidate = implode('.', array_slice($labels, $offset));
            // Candidate without its leftmost label.
            $parent = implode('.', array_slice($labels, $offset + 1));

            // Exception entry: the suffix is the candidate without its leftmost label.
            if ($this->suffixExists('!' . $candidate)) {
                return $parent;
            }

            // Exact entry, or wildcard entry that covers the candidate's leftmost label.
            if ($this->suffixExists($candidate) || $this->suffixExists('*.' . $parent)) {
                return $candidate;
            }
        }

        return null;
    }
308
309
    /**
     * Checks that an entry exists in the Public Suffix List database and that its type is allowed
     * by the current extraction mode.
     *
     * @param string $entry Entry for check in Public Suffix List database
     *
     * @return bool True if the entry exists and its type (ICCAN or private) is enabled
     */
    private function suffixExists($entry)
    {
        if (!$this->suffixStore->isExists($entry)) {
            return false;
        }

        $type = $this->suffixStore->getType($entry);

        $allowedAsIccan = ($this->extractionMode & static::MODE_ALLOW_ICCAN)
            && Store::TYPE_ICCAN === $type;
        $allowedAsPrivate = ($this->extractionMode & static::MODE_ALLOW_PRIVATE)
            && Store::TYPE_PRIVATE === $type;

        return $allowedAsIccan || $allowedAsPrivate;
    }
330
331
    /**
     * Inserts a slash before the query or fragment part of a URL, i.e. fixes
     * "github.com?layershifter" to "github.com/?layershifter" and "github.com#top" to "github.com/#top".
     *
     * The caller takes everything before the first "/" as the host part, so a URL without a path
     * would otherwise keep its query or fragment attached to the hostname.
     *
     * @see https://github.com/layershifter/TLDExtract/issues/5
     * @see https://tools.ietf.org/html/rfc3986#section-3.2
     *
     * @param string $url
     *
     * @return string URL with "/" inserted before the first "?" or "#", unchanged if it has neither
     */
    private function fixQueryPart($url)
    {
        $queryPosition = Str::strpos($url, '?');
        $fragmentPosition = Str::strpos($url, '#');

        if (false === $queryPosition && false === $fragmentPosition) {
            return $url;
        }

        // Picks the earliest delimiter; at least one of the positions is not false at this point.
        if (false === $queryPosition) {
            $position = $fragmentPosition;
        } elseif (false === $fragmentPosition) {
            $position = $queryPosition;
        } else {
            $position = min($queryPosition, $fragmentPosition);
        }

        return Str::substr($url, 0, $position) . '/' . Str::substr($url, $position);
    }
350
}
351