Completed
Push — master ( e5e536...a6d8dc )
by Jan-Petter
02:09
created

UserAgentClient::determineRequestRates()   C

Complexity

Conditions 7
Paths 6

Size

Total Lines 26
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 26
rs 6.7272
cc 7
eloc 18
nc 6
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\RobotsTxtInterface;
6
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
7
use vipnytt\RobotsTxtParser\Parser\UrlParser;
8
9
/**
 * Class UserAgentClient
 *
 * Answers questions about the robots.txt rules that were selected for one
 * user-agent: whether a URL may be crawled, which delays apply, and which
 * comments / visit times were declared.
 *
 * The directive names used as keys into the rule set are the
 * self::DIRECTIVE_* constants.
 *
 * @package vipnytt\RobotsTxtParser\Client
 */
class UserAgentClient implements RobotsTxtInterface
{
    use UrlParser;

    /**
     * Rules, indexed by directive name. Every element is an object that
     * offers export(); the allow/disallow elements also offer check($url).
     * @var array
     */
    protected $rules;

    /**
     * User-agent
     * @var string
     */
    protected $userAgent;

    /**
     * Robots.txt base URL
     * @var string
     */
    protected $base;

    /**
     * Status code parser
     * @var StatusCodeParser
     */
    protected $statusCodeParser;

    /**
     * Comment export status. Set to true once getComments() has been called,
     * which keeps the destructor from reporting the comments again.
     * @var bool
     */
    protected $commentsExported = false;

    /**
     * UserAgentClient constructor.
     *
     * @param array $rules - rule objects indexed by directive name
     * @param string $userAgent
     * @param string $baseUrl
     * @param int|null $statusCode
     */
    public function __construct(array $rules, $userAgent, $baseUrl, $statusCode)
    {
        $this->statusCodeParser = new StatusCodeParser($statusCode, parse_url($baseUrl, PHP_URL_SCHEME));
        $this->userAgent = $userAgent;
        $this->rules = $rules;
        $this->base = $baseUrl;
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not belong to this robots.txt
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check whether the given directive is the one that applies to the URL
     *
     * A non-null verdict from the status code parser takes precedence over
     * the rules. Otherwise the default is "allow"; the disallow rules are
     * evaluated first and the allow rules last, so a matching allow rule
     * overrides a matching disallow rule.
     *
     * @param string $directive - self::DIRECTIVE_ALLOW or self::DIRECTIVE_DISALLOW
     * @param string $url - URL to check
     * @return bool
     * @throws ClientException if the URL does not belong to this robots.txt
     */
    protected function check($directive, $url)
    {
        $url = $this->urlConvertToFull($url, $this->base);
        if (!$this->isUrlApplicable([$url, $this->base])) {
            throw new ClientException('URL belongs to a different robots.txt, please check it against that one instead');
        }
        $this->statusCodeParser->replaceUnofficial();
        if (($result = $this->statusCodeParser->check()) !== null) {
            return $directive === $result;
        }
        $result = self::DIRECTIVE_ALLOW;
        foreach ([self::DIRECTIVE_DISALLOW, self::DIRECTIVE_ALLOW] as $currentDirective) {
            if ($this->rules[$currentDirective]->check($url)) {
                $result = $currentDirective;
            }
        }
        return $directive === $result;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * All URLs must share the same scheme, host and port. A missing port is
     * replaced by the scheme's registered TCP port. Scheme and host are
     * compared case-insensitively. A URL that cannot be parsed, or that has
     * no scheme or host, is never applicable.
     *
     * @param string[] $urls
     * @return bool
     */
    protected function isUrlApplicable($urls)
    {
        $origin = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            if (
                $parsed === false ||
                !isset($parsed['scheme']) ||
                !isset($parsed['host'])
            ) {
                // Without both a scheme and a host there is nothing to compare.
                return false;
            }
            $scheme = strtolower($parsed['scheme']);
            $port = isset($parsed['port']) ? $parsed['port'] : getservbyname($scheme, 'tcp');
            $assembled = $scheme . '://' . strtolower($parsed['host']) . ':' . $port;
            if ($origin === null) {
                $origin = $assembled;
            } elseif ($origin !== $assembled) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not belong to this robots.txt
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Get Cache-delay
     *
     * Falls back to the Crawl-delay when no Cache-delay is exported.
     *
     * @return float|int
     */
    public function getCacheDelay()
    {
        $delay = $this->rules[self::DIRECTIVE_CACHE_DELAY]->export();
        return isset($delay[self::DIRECTIVE_CACHE_DELAY]) ? $delay[self::DIRECTIVE_CACHE_DELAY] : $this->getCrawlDelay();
    }

    /**
     * Get Crawl-delay
     *
     * Falls back to the currently applicable Request-rate when no
     * Crawl-delay is exported.
     *
     * @return float|int
     */
    public function getCrawlDelay()
    {
        $delay = $this->rules[self::DIRECTIVE_CRAWL_DELAY]->export();
        return isset($delay[self::DIRECTIVE_CRAWL_DELAY]) ? $delay[self::DIRECTIVE_CRAWL_DELAY] : $this->getRequestRate();
    }

    /**
     * Get Request-rate for current timestamp
     *
     * Returns the smallest applicable rate, or 0 when no rate applies or the
     * smallest one is not positive.
     *
     * @param int|null $timestamp - defaults to the current time
     * @return float|int
     */
    protected function getRequestRate($timestamp = null)
    {
        if (!is_int($timestamp)) {
            $timestamp = time();
        }
        $values = $this->determineRequestRates($timestamp);
        if (
            count($values) > 0 &&
            ($rate = min($values)) > 0
        ) {
            return $rate;
        }
        return 0;
    }

    /**
     * Determine Request rates
     *
     * Collects the rate of every Request-rate entry that applies at the given
     * point in time. Entries without both a `from` and a `to` apply always.
     * Windows are evaluated in UTC against the time of day of $timestamp; a
     * window whose start lies after its end (e.g. 2300-0600) spans midnight.
     *
     * @param int $timestamp
     * @return array
     */
    protected function determineRequestRates($timestamp)
    {
        // Unix time has no leap seconds, so the remainder is the UTC second of
        // the day. The double modulo keeps it non-negative for pre-1970 input.
        $secondOfDay = (($timestamp % 86400) + 86400) % 86400;
        $values = [];
        foreach ($this->getRequestRates() as $array) {
            if (
                !isset($array['from']) ||
                !isset($array['to'])
            ) {
                $values[] = $array['rate'];
                continue;
            }
            $from = $this->timeOfDayToSeconds($array['from']);
            // The closing minute is included in full.
            $to = $this->timeOfDayToSeconds($array['to']) + 59;
            if ($from <= $to) {
                $applies = $secondOfDay >= $from && $secondOfDay <= $to;
            } else {
                // Window spans midnight: late evening or early morning.
                $applies = $secondOfDay >= $from || $secondOfDay <= $to;
            }
            if ($applies) {
                $values[] = $array['rate'];
            }
        }
        return $values;
    }

    /**
     * Convert a clock value to seconds since midnight
     *
     * The last two characters are the minutes, everything before is the hour.
     *
     * @param string|int $time - e.g. `0630` or `1800`
     * @return int
     */
    private function timeOfDayToSeconds($time)
    {
        $time = (string)$time;
        $hours = (int)mb_substr($time, 0, mb_strlen($time) - 2);
        $minutes = (int)mb_substr($time, -2, 2);
        return $hours * 3600 + $minutes * 60;
    }

    /**
     * Get Request-rates
     *
     * @return array - every exported Request-rate entry, empty if none
     */
    public function getRequestRates()
    {
        $array = $this->rules[self::DIRECTIVE_REQUEST_RATE]->export();
        return isset($array[self::DIRECTIVE_REQUEST_RATE]) ? $array[self::DIRECTIVE_REQUEST_RATE] : [];
    }

    /**
     * Rule export
     *
     * @return array - exported values indexed by directive; directives with
     *                 an empty export are left out
     */
    public function export()
    {
        $result = [];
        foreach ($this->rules as $directive => $object) {
            if (!empty($export = $object->export())) {
                $result[$directive] = $export[$directive];
            }
        }
        return $result;
    }

    /**
     * UserAgentClient destructor.
     *
     * Raises an E_USER_NOTICE for every comment that was never fetched
     * through getComments().
     */
    public function __destruct()
    {
        if (!$this->commentsExported) {
            // Comment from the `Comments` directive exists, but has not been exported.
            foreach ($this->getComments() as $message) {
                trigger_error('Comment for `' . $this->userAgent . '` at `' . $this->base . '/robots.txt`: ' . $message, E_USER_NOTICE);
            }
        }
    }

    /**
     * Get Comments
     *
     * Also marks the comments as exported, which silences the destructor.
     *
     * @return array
     */
    public function getComments()
    {
        $this->commentsExported = true;
        $comments = $this->rules[self::DIRECTIVE_COMMENT]->export();
        return isset($comments[self::DIRECTIVE_COMMENT]) ? $comments[self::DIRECTIVE_COMMENT] : [];
    }

    /**
     * Get Visit-time
     *
     * @return array - empty if no Visit-time is exported
     */
    public function getVisitTime()
    {
        $times = $this->rules[self::DIRECTIVE_VISIT_TIME]->export();
        return isset($times[self::DIRECTIVE_VISIT_TIME]) ? $times[self::DIRECTIVE_VISIT_TIME] : [];
    }
}
273