Completed
Push — master ( cf0606...512c88 )
by Jan-Petter
04:57
created

UserAgentClient::getRequestRate()   D

Complexity

Conditions 9
Paths 24

Size

Total Lines 29
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 29
rs 4.909
cc 9
eloc 20
nc 24
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\RobotsTxtInterface;
6
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
7
use vipnytt\RobotsTxtParser\Parser\UrlParser;
8
9
/**
 * Class UserAgentClient
 *
 * Answers questions about a single user-agent's robots.txt rule set:
 * whether a URL may be crawled, which delays apply, and which comments
 * and visit times were declared.
 *
 * @package vipnytt\RobotsTxtParser\Client
 */
class UserAgentClient implements RobotsTxtInterface
{
    use UrlParser;

    /**
     * Rules, indexed by directive name. Every entry is an object exposing
     * `export()`; the allow/disallow entries additionally expose `check($url)`.
     * @var array
     */
    protected $rules;

    /**
     * User-agent
     * @var string
     */
    protected $userAgent;

    /**
     * Robots.txt base URL
     * @var string
     */
    protected $base;

    /**
     * Status code parser
     * @var StatusCodeParser
     */
    protected $statusCodeParser;

    /**
     * Comment export status, set to true once getComments() has been called
     * @var bool
     */
    protected $commentsExported = false;

    /**
     * UserAgentClient constructor.
     *
     * @param array $rules - rule objects indexed by directive name
     * @param string $userAgent
     * @param string $baseUrl
     * @param int|null $statusCode
     */
    public function __construct(array $rules, $userAgent, $baseUrl, $statusCode)
    {
        $this->statusCodeParser = new StatusCodeParser($statusCode, parse_url($baseUrl, PHP_URL_SCHEME));
        $this->userAgent = $userAgent;
        $this->rules = $rules;
        $this->base = $baseUrl;
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check whether the outcome for a URL equals the given directive
     *
     * The status code parser is consulted first; only when it yields no
     * verdict (null) are the allow/disallow rules evaluated.
     *
     * @param string $directive - directive the outcome is compared against
     * @param string $url - URL to check
     * @return bool
     * @throws ClientException if the URL does not share scheme, host and port with the base URL
     */
    protected function check($directive, $url)
    {
        $url = $this->urlConvertToFull($url, $this->base);
        if (!$this->isUrlApplicable([$url, $this->base])) {
            throw new ClientException('URL belongs to a different robots.txt, please check it against that one instead');
        }
        $this->statusCodeParser->replaceUnofficial();
        $verdict = $this->statusCodeParser->check();
        if ($verdict !== null) {
            return $directive === $verdict;
        }
        // Allow is the default. It is also evaluated last, so a matching
        // allow rule overrides a matching disallow rule.
        $verdict = self::DIRECTIVE_ALLOW;
        foreach ([self::DIRECTIVE_DISALLOW, self::DIRECTIVE_ALLOW] as $currentDirective) {
            if ($this->rules[$currentDirective]->check($url)) {
                $verdict = $currentDirective;
            }
        }
        return $directive === $verdict;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * All given URLs must share the same scheme, host and port. A missing
     * port is replaced by the scheme's default port before comparing.
     *
     * @param string[] $urls
     * @return bool - false if any URL differs, or cannot be parsed into scheme and host
     */
    protected function isUrlApplicable($urls)
    {
        $origin = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            if ($parsed === false || !isset($parsed['scheme'], $parsed['host'])) {
                // Without scheme and host there is nothing to compare against.
                return false;
            }
            // Scheme and host are case-insensitive, normalize before comparing.
            $scheme = strtolower($parsed['scheme']);
            $port = isset($parsed['port']) ? $parsed['port'] : getservbyname($scheme, 'tcp');
            $assembled = $scheme . '://' . strtolower($parsed['host']) . ':' . $port;
            if ($origin === null) {
                $origin = $assembled;
            } elseif ($origin !== $assembled) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Get Cache-delay
     *
     * Falls back to the Crawl-delay when no Cache-delay is exported.
     *
     * @return float|int
     */
    public function getCacheDelay()
    {
        $delay = $this->rules[self::DIRECTIVE_CACHE_DELAY]->export();
        if (isset($delay[self::DIRECTIVE_CACHE_DELAY])) {
            return $delay[self::DIRECTIVE_CACHE_DELAY];
        }
        return $this->getCrawlDelay();
    }

    /**
     * Get Crawl-delay
     *
     * Falls back to the Request-rate when no Crawl-delay is exported.
     *
     * @return float|int
     */
    public function getCrawlDelay()
    {
        $delay = $this->rules[self::DIRECTIVE_CRAWL_DELAY]->export();
        if (isset($delay[self::DIRECTIVE_CRAWL_DELAY])) {
            return $delay[self::DIRECTIVE_CRAWL_DELAY];
        }
        return $this->getRequestRate();
    }

    /**
     * Get Request-rate
     *
     * Collects the rate of every entry that is either unrestricted (no
     * `from`/`to`) or whose time window contains the time of day of
     * $timestamp, and returns the smallest one.
     *
     * @param int|null $timestamp - Unix timestamp, defaults to the current time
     * @return float|int - 0 if no rate applies or the smallest rate is not positive
     */
    protected function getRequestRate($timestamp = null)
    {
        if ($timestamp === null) {
            $timestamp = time();
        }
        // Seconds elapsed since midnight, in PHP's default timezone.
        $now = (int)date('G', $timestamp) * 3600 + (int)date('i', $timestamp) * 60 + (int)date('s', $timestamp);
        $values = [];
        foreach ($this->getRequestRates() as $array) {
            if (!isset($array['from'], $array['to'])) {
                // No time restriction, the rate always applies.
                $values[] = $array['rate'];
                continue;
            }
            // The window runs from the first second of `from` to the last second of `to`.
            $from = $this->timeOfDayToSeconds($array['from'], 0);
            $to = $this->timeOfDayToSeconds($array['to'], 59);
            if ($from <= $to) {
                $applies = $now >= $from && $now <= $to;
            } else {
                // Window passes midnight, e.g. 2300-0600.
                $applies = $now >= $from || $now <= $to;
            }
            if ($applies) {
                $values[] = $array['rate'];
            }
        }
        if (empty($values)) {
            // min() must not be called with an empty array.
            return 0;
        }
        $rate = min($values);
        return $rate > 0 ? $rate : 0;
    }

    /**
     * Convert a time of day written as hours followed by two minute digits
     * (e.g. `0630` or `630`) to seconds since midnight
     *
     * @param string|int $time
     * @param int $seconds - seconds to add within the minute
     * @return int
     */
    private function timeOfDayToSeconds($time, $seconds)
    {
        $time = (string)$time;
        // The last two characters are the minutes, anything before them the hours.
        $hours = (int)mb_substr($time, 0, -2);
        $minutes = (int)mb_substr($time, -2);
        return ($hours * 60 + $minutes) * 60 + $seconds;
    }

    /**
     * Get Request-rates
     *
     * @return array - empty if no Request-rate is exported
     */
    public function getRequestRates()
    {
        $array = $this->rules[self::DIRECTIVE_REQUEST_RATE]->export();
        if (isset($array[self::DIRECTIVE_REQUEST_RATE])) {
            return $array[self::DIRECTIVE_REQUEST_RATE];
        }
        return [];
    }

    /**
     * Rule export
     *
     * @return array - exported values indexed by directive, directives with an empty export are left out
     */
    public function export()
    {
        $result = [];
        foreach ($this->rules as $directive => $object) {
            $export = $object->export();
            if (!empty($export)) {
                $result[$directive] = $export[$directive];
            }
        }
        return $result;
    }

    /**
     * UserAgentClient destructor.
     *
     * Raises an E_USER_NOTICE for every comment, unless getComments() has
     * already been called on this instance.
     */
    public function __destruct()
    {
        if (!$this->commentsExported) {
            // Comment from the `Comments` directive exists, but has not been exported.
            foreach ($this->getComments() as $message) {
                trigger_error('Comment for `' . $this->userAgent . '` at `' . $this->base . '/robots.txt`: ' . $message, E_USER_NOTICE);
            }
        }
    }

    /**
     * Get Comments
     *
     * Marks the comments as exported, which silences the destructor notices.
     *
     * @return array - empty if no comment is exported
     */
    public function getComments()
    {
        $this->commentsExported = true;
        $comments = $this->rules[self::DIRECTIVE_COMMENT]->export();
        if (isset($comments[self::DIRECTIVE_COMMENT])) {
            return $comments[self::DIRECTIVE_COMMENT];
        }
        return [];
    }

    /**
     * Get Visit-time
     *
     * @return array - empty if no Visit-time is exported
     */
    public function getVisitTime()
    {
        $times = $this->rules[self::DIRECTIVE_VISIT_TIME]->export();
        if (isset($times[self::DIRECTIVE_VISIT_TIME])) {
            return $times[self::DIRECTIVE_VISIT_TIME];
        }
        return [];
    }
}
255