Completed
Branch 2.0-dev (4f313a)
by Jan-Petter
02:57
created

UserAgentClient::getRules()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 13
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 13
rs 9.4285
cc 1
eloc 10
nc 1
nop 0
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client\Directives;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\Directives\SubDirectiveHandler;
6
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
7
use vipnytt\RobotsTxtParser\Parser\UrlParser;
8
use vipnytt\RobotsTxtParser\RobotsTxtInterface;
9
10
/**
11
 * Class UserAgentClient
12
 *
13
 * @package vipnytt\RobotsTxtParser\Client\Directives
14
 */
15
class UserAgentClient implements RobotsTxtInterface
{
    use UrlParser;

    /**
     * Handler giving access to the individual directive handlers
     * (allow, disallow, comment, crawl-delay, ...).
     *
     * @var SubDirectiveHandler
     */
    private $handler;

    /**
     * Base URI; URLs passed to the checks are resolved against it.
     *
     * @var string
     */
    private $base;

    /**
     * Status code handed to the StatusCodeParser when checking an URL.
     *
     * @var int|null
     */
    private $statusCode;

    /**
     * UserAgentClient constructor.
     *
     * @param SubDirectiveHandler $handler
     * @param string $baseUri
     * @param int|null $statusCode
     */
    public function __construct(SubDirectiveHandler $handler, $baseUri, $statusCode)
    {
        $this->handler = $handler;
        $this->base = $baseUri;
        $this->statusCode = $statusCode;
    }

    /**
     * UserAgentClient destructor.
     *
     * Builds the comment client one last time and discards it.
     * NOTE(review): presumably relied upon for a side effect inside the
     * comment client; the reason is not visible in this class - confirm
     * before removing.
     */
    public function __destruct()
    {
        $this->comment();
    }

    /**
     * Comment
     *
     * @return CommentClient
     */
    public function comment()
    {
        return $this->handler->comment()->client();
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not belong to this robots.txt
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check whether the given directive is the effective verdict for an URL
     *
     * The URL is first resolved against the base URI and must share its
     * scheme, host and port. A non-null verdict from the status code parser
     * takes precedence over the parsed rules.
     *
     * @param string $directive - directive to compare the verdict against
     * @param string $url - URL to check
     * @return bool
     * @throws ClientException if the URL belongs to a different robots.txt
     */
    private function check($directive, $url)
    {
        $url = $this->urlConvertToFull($url, $this->base);
        if (!$this->isUrlApplicable([$url, $this->base])) {
            throw new ClientException('URL belongs to a different robots.txt');
        }
        $statusCodeParser = new StatusCodeParser($this->statusCode, parse_url($this->base, PHP_URL_SCHEME));
        $statusCodeParser->replaceUnofficial();
        if (($result = $statusCodeParser->check()) !== null) {
            return $directive === $result;
        }
        // Allowed unless a rule says otherwise. Disallow is evaluated first
        // and allow last, so when both match the URL, allow wins.
        $result = self::DIRECTIVE_ALLOW;
        foreach (
            [
                self::DIRECTIVE_DISALLOW => $this->handler->disallow(),
                self::DIRECTIVE_ALLOW => $this->handler->allow()
            ] as $currentDirective => $currentRules
        ) {
            if ($currentRules->check($url)) {
                $result = $currentDirective;
            }
        }
        return $directive === $result;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * All URLs must share the same scheme, host and port. Scheme and host
     * are compared case-insensitively. When an URL carries no explicit port,
     * the port is looked up from the scheme via getservbyname().
     *
     * @param string[] $urls
     * @return bool - false if the origins differ, or if any URL cannot be
     *                parsed or lacks a scheme or host
     */
    private function isUrlApplicable($urls)
    {
        $reference = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            // parse_url() returns false on seriously malformed input and
            // omits keys for missing components; such an URL has no origin
            // to compare, so it cannot belong to this robots.txt.
            if (!is_array($parsed) || !isset($parsed['scheme'], $parsed['host'])) {
                return false;
            }
            $scheme = strtolower($parsed['scheme']);
            $host = strtolower($parsed['host']);
            // getservbyname() returns false for an unknown scheme, which
            // concatenates as an empty port below.
            $port = isset($parsed['port']) ? $parsed['port'] : getservbyname($scheme, 'tcp');
            $origin = $scheme . '://' . $host . ':' . $port;
            if ($reference === null) {
                $reference = $origin;
            } elseif ($reference !== $origin) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not belong to this robots.txt
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Cache-delay
     *
     * The current Crawl-delay value is passed on to the Cache-delay client.
     * NOTE(review): presumably used as a fallback value - confirm against
     * the handler's client() implementation.
     *
     * @return DelayClient
     */
    public function cacheDelay()
    {
        return $this->handler->cacheDelay()->client($this->crawlDelay()->get());
    }

    /**
     * Crawl-delay
     *
     * The current Request-rate value is passed on to the Crawl-delay client.
     * NOTE(review): presumably used as a fallback value - confirm against
     * the handler's client() implementation.
     *
     * @return DelayClient
     */
    public function crawlDelay()
    {
        return $this->handler->crawlDelay()->client($this->requestRate()->get());
    }

    /**
     * Request-rate
     *
     * @return RequestRateClient
     */
    public function requestRate()
    {
        return $this->handler->requestRate()->client();
    }

    /**
     * Robot-version
     *
     * @return RobotVersionClient
     */
    public function robotVersion()
    {
        return $this->handler->robotVersion()->client();
    }

    /**
     * Rule export
     *
     * Merges the exported rules of every directive handler into one array.
     *
     * @return array
     */
    public function getRules()
    {
        return array_merge(
            $this->handler->allow()->getRules(),
            $this->handler->comment()->getRules(),
            $this->handler->cacheDelay()->getRules(),
            $this->handler->crawlDelay()->getRules(),
            $this->handler->disallow()->getRules(),
            $this->handler->requestRate()->getRules(),
            $this->handler->robotVersion()->getRules(),
            $this->handler->visitTime()->getRules()
        );
    }

    /**
     * Visit-time
     *
     * @return VisitTimeClient
     */
    public function visitTime()
    {
        return $this->handler->visitTime()->client();
    }
}
214