Completed
Push — master ( c563e4...8f8f70 )
by Alexander
04:52
created

Extract::setExtractionMode()   B

Complexity

Conditions 4
Paths 4

Size

Total Lines 31
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
eloc 20
c 1
b 0
f 0
nc 4
nop 1
dl 0
loc 31
rs 8.5806
1
<?php
2
/**
3
 * TLDExtract: Library for extraction of domain parts e.g. TLD. Domain parser that uses Public Suffix List.
4
 *
5
 * @link      https://github.com/layershifter/TLDExtract
6
 *
7
 * @copyright Copyright (c) 2016, Alexander Fedyashov
8
 * @license   https://raw.githubusercontent.com/layershifter/TLDExtract/master/LICENSE Apache 2.0 License
9
 */
10
11
namespace LayerShifter\TLDExtract;
12
13
use LayerShifter\TLDDatabase\Store;
14
use LayerShifter\TLDExtract\Exceptions\RuntimeException;
15
use LayerShifter\TLDSupport\Helpers\Arr;
16
use LayerShifter\TLDSupport\Helpers\IP;
17
use LayerShifter\TLDSupport\Helpers\Str;
18
19
/**
20
 * Extract class accurately extracts subdomain, domain and TLD components from URLs.
21
 *
22
 * @see Result for more information on the returned data structure.
23
 */
24
class Extract
25
{
26
27
    /**
28
     * @const int If this option is provided, extract will consider ICANN suffixes.
29
     */
30
    const MODE_ALLOW_ICCAN = 2;
31
    /**
32
     * @const int If this option is provided, extract will consider private suffixes.
33
     */
34
    const MODE_ALLOW_PRIVATE = 4;
35
    /**
36
     * @const int If this option is provided, extract will also accept suffixes that are not in the database.
37
     */
38
    const MODE_ALLOW_NOT_EXISTING_SUFFIXES = 8;
39
    /**
40
     * @const string RFC 3986 compliant scheme regex pattern.
41
     *
42
     * @see   https://tools.ietf.org/html/rfc3986#section-3.1
43
     */
44
    const SCHEMA_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*:)?//#';
45
46
    /**
47
     * @var int Value of extraction options.
48
     */
49
    private $extractionMode;
50
    /**
51
     * @var string Name of class that will store results of parsing.
52
     */
53
    private $resultClassName;
54
    /**
55
     * @var Store Object of TLDDatabase\Store class.
56
     */
57
    private $suffixStore;
58
59
    /**
     * Builds an extractor: opens the suffix store, picks the result class and sets the extraction mode.
     *
     * @param null|string $databaseFile    Optional, name of file with Public Suffix List database; forwarded
     *                                     unchanged to the Store constructor
     * @param null|string $resultClassName Optional, name of the class used to hold parsing results; it must exist
     *                                     and implement ResultInterface. Result is used when omitted
     * @param null|int    $extractionMode  Optional, MODE_* constant or a bitwise combination of them; validated by
     *                                     setExtractionMode()
     *
     * @throws RuntimeException When the result class is not defined, does not implement ResultInterface, or the
     *                          extraction mode is rejected
     */
    public function __construct($databaseFile = null, $resultClassName = null, $extractionMode = null)
    {
        $this->suffixStore = new Store($databaseFile);

        if (null === $resultClassName) {
            // Nothing requested by the caller, fall back to the bundled result class.
            $this->resultClassName = Result::class;
        } else {
            if (!class_exists($resultClassName)) {
                throw new RuntimeException(sprintf('Class "%s" is not defined', $resultClassName));
            }

            $implementedInterfaces = class_implements($resultClassName);

            if (!in_array(ResultInterface::class, $implementedInterfaces, true)) {
                throw new RuntimeException(sprintf('Class "%s" not implements ResultInterface', $resultClassName));
            }

            $this->resultClassName = $resultClassName;
        }

        $this->setExtractionMode($extractionMode);
    }
89
90
    /**
     * Sets extraction mode, the bit mask that controls which kinds of suffixes are recognised.
     *
     * Passing null (or nothing) enables every MODE_* flag. Any other value must be an integer equal to a single
     * MODE_* constant or to a bitwise OR of several of them.
     *
     * @param null|int $extractionMode One of MODE_* constants, a combination of them, or null for all of them
     *
     * @throws RuntimeException When the value is not an integer or is not a valid combination of MODE_* constants
     */
    public function setExtractionMode($extractionMode = null)
    {
        $iccan = static::MODE_ALLOW_ICCAN;
        $private = static::MODE_ALLOW_PRIVATE;
        $notExisting = static::MODE_ALLOW_NOT_EXISTING_SUFFIXES;
        $everything = $iccan | $private | $notExisting;

        // No explicit mode means "allow everything".

        if (null === $extractionMode) {
            $this->extractionMode = $everything;

            return;
        }

        if (!is_int($extractionMode)) {
            throw new RuntimeException('Invalid argument type, extractionMode must be integer');
        }

        // Every non-empty combination of the three flags.

        $accepted = [
            $iccan,
            $private,
            $notExisting,
            $iccan | $private,
            $iccan | $notExisting,
            $private | $notExisting,
            $everything,
        ];

        if (!in_array($extractionMode, $accepted, true)) {
            throw new RuntimeException(
                'Invalid argument type, extractionMode must be one of defined constants of their combination'
            );
        }

        $this->extractionMode = $extractionMode;
    }
128
129
    /**
     * Splits a URL into subdomain, host and suffix and wraps them in the configured result class.
     *
     * The hostname is first isolated from the URL. If it is a valid IP address it is stored as the host with
     * no subdomain and no suffix; otherwise it is broken up by extractParts().
     *
     * @param string $url URL that will be extracted
     *
     * @return ResultInterface
     */
    public function parse($url)
    {
        $resultClass = $this->resultClassName;
        $hostname = $this->extractHostname($url);

        // IP addresses have no domain structure, so they are returned as a bare host.

        if (IP::isValid($hostname)) {
            return new $resultClass(null, $hostname, null);
        }

        $parts = $this->extractParts($hostname);

        return new $resultClass($parts[0], $parts[1], $parts[2]);
    }
150
151
    /**
     * Method that extracts the hostname or IP address from a URL.
     *
     * The URL is lower-cased and trimmed, then stripped of its scheme, of everything from the first "/", "?" or
     * "#" onwards, of the user information before the last "@" and of the port. For a bracketed IPv6 literal
     * the text between "[" and the last "]" is returned.
     *
     * @param string $url URL for extraction
     *
     * @return null|string Hostname or IP address, null when nothing is left after stripping
     */
    private function extractHostname($url)
    {
        $url = trim(Str::lower($url));

        // Removes scheme i.e. http://github.com to github.com.

        $url = preg_replace(static::SCHEMA_PATTERN, '', $url);

        // Removes path, query and fragment. The authority component ends at the first "/", "?" or "#"
        // (RFC 3986, section 3.2), so "github.com?tab=stars" and "github.com#readme" both give "github.com".

        $parts = preg_split('~[/?#]~', $url, 2);
        $hostname = Arr::first($parts);

        // Removes username from URL i.e. user@github.com to github.com.

        $hostname = Arr::last(explode('@', $hostname));

        // Remove ports from hosts, also check for IPv6 literals like "[3ffe:2a00:100:7031::1]".
        //
        // @see http://www.ietf.org/rfc/rfc2732.txt

        $lastBracketPosition = Str::strrpos($hostname, ']');

        if ($lastBracketPosition !== false && Str::startsWith($hostname, '[')) {
            return Str::substr($hostname, 1, $lastBracketPosition - 1);
        }

        // This is either a normal hostname or an IPv4 address, just remove the port.

        $hostname = Arr::first(explode(':', $hostname));

        // If string is empty, null will be returned.

        return '' === $hostname ? null : $hostname;
    }
189
190
    /**
     * Breaks a hostname into subdomain, host and suffix, following the algorithm described in
     * https://publicsuffix.org/list/.
     *
     * When the whole hostname is itself the detected suffix, it is returned as the host and the suffix is null.
     * Otherwise the suffix (if any) is cut off the end, the last remaining label becomes the host and whatever
     * precedes it becomes the subdomain.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return array|string[] An array that contains subdomain, host and suffix, in that order; missing parts
     *                        are null.
     */
    public function extractParts($hostname)
    {
        $suffix = $this->extractSuffix($hostname);

        if ($hostname === $suffix) {
            return [null, $hostname, null];
        }

        // Cuts ".<suffix>" off the end; the extra character is the separating dot.

        $remainder = $hostname;

        if (null !== $suffix) {
            $remainder = Str::substr($hostname, 0, -Str::length($suffix) - 1);
        }

        $dotPosition = Str::strrpos($remainder, '.');

        // Without a dot there is no subdomain, the remainder is the host.

        if (false === $dotPosition) {
            return [null, $remainder, $suffix];
        }

        return [
            Str::substr($remainder, 0, $dotPosition),
            Str::substr($remainder, $dotPosition + 1),
            $suffix
        ];
    }
225
226
    /**
     * Extracts suffix from hostname using Public Suffix List database.
     *
     * Hostnames containing punycode ("xn--") are decoded before the lookup and the suffix that was found is
     * encoded back to punycode, so the returned value has the same form as the input. If no suffix is found
     * and MODE_ALLOW_NOT_EXISTING_SUFFIXES is enabled, the last label of the hostname is used as suffix.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Suffix, or null when the hostname is invalid, no suffix applies or the suffix cannot
     *                     be converted back to punycode
     */
    private function extractSuffix($hostname)
    {
        // If hostname has leading dot, it's invalid.
        // If hostname is a single label domain, it's invalid.

        if (Str::startsWith($hostname, '.') || Str::strpos($hostname, '.') === false) {
            return null;
        }

        // If domain is in punycode, it will be converted to IDN.
        //
        // The IDNA variant is passed explicitly: relying on the default selects INTL_IDNA_VARIANT_2003 on
        // PHP < 7.4, which is deprecated since PHP 7.2.

        $isPunycoded = Str::strpos($hostname, 'xn--') !== false;

        if ($isPunycoded) {
            $decoded = idn_to_utf8($hostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

            if (false === $decoded) {
                // Hostname is not valid punycode: it is looked up as-is instead of passing "false" further.

                $isPunycoded = false;
            } else {
                $hostname = $decoded;
            }
        }

        $suffix = $this->parseSuffix($hostname);

        if (null === $suffix) {
            if (!($this->extractionMode & static::MODE_ALLOW_NOT_EXISTING_SUFFIXES)) {
                return null;
            }

            $suffix = Str::substr($hostname, Str::strrpos($hostname, '.') + 1);
        }

        if (!$isPunycoded) {
            return $suffix;
        }

        // If domain is punycoded, suffix will be converted to punycode. A failed conversion is reported as
        // "no suffix" to honour the documented return type.

        $encoded = idn_to_ascii($suffix, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

        return false === $encoded ? null : $encoded;
    }
264
265
    /**
     * Finds the suffix of a hostname by probing the suffix store with progressively shorter candidates.
     *
     * Starting with the full hostname and dropping the leftmost label on every step, three lookups are made for
     * each candidate, in this order:
     *  - exception rule "!<candidate>": the suffix is the candidate without its leftmost label;
     *  - plain rule "<candidate>": the suffix is the candidate itself;
     *  - wildcard rule "*.<candidate without leftmost label>": the suffix is the candidate itself.
     * The first lookup that succeeds ends the search.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Matched suffix or null when no rule matched
     */
    private function parseSuffix($hostname)
    {
        $labels = explode('.', $hostname);

        while ([] !== $labels) {
            $candidate = implode('.', $labels);

            // After the shift $labels holds the candidate without its leftmost label.

            array_shift($labels);
            $parent = implode('.', $labels);

            if ($this->suffixExists('!' . $candidate)) {
                return $parent;
            }

            if ($this->suffixExists($candidate)) {
                return $candidate;
            }

            if ($this->suffixExists('*.' . $parent)) {
                return $candidate;
            }
        }

        return null;
    }
304
305
    /**
     * Tells whether an entry is present in the suffix store and is of a type enabled by the extraction mode.
     *
     * An entry counts only if the store contains it and either its type is Store::TYPE_ICCAN while
     * MODE_ALLOW_ICCAN is set, or its type is Store::TYPE_PRIVATE while MODE_ALLOW_PRIVATE is set.
     *
     * @param string $entry Entry for check in Public Suffix List database
     *
     * @return bool
     */
    private function suffixExists($entry)
    {
        if (!$this->suffixStore->isExists($entry)) {
            return false;
        }

        $mode = $this->extractionMode;
        $type = $this->suffixStore->getType($entry);

        if (Store::TYPE_ICCAN === $type && ($mode & static::MODE_ALLOW_ICCAN)) {
            return true;
        }

        return ($mode & static::MODE_ALLOW_PRIVATE) && Store::TYPE_PRIVATE === $type;
    }
326
}
327