Completed
Branch 2.0-dev (d250b8)
by Jan-Petter
03:02
created

UserAgentClient::check()   B

Complexity

Conditions 5
Paths 5

Size

Total Lines 23
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 23
rs 8.5906
cc 5
eloc 15
nc 5
nop 2
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client\Directives;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\Directives\UserAgentParser;
6
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
7
use vipnytt\RobotsTxtParser\Parser\UrlParser;
8
use vipnytt\RobotsTxtParser\RobotsTxtInterface;
9
use vipnytt\UserAgentParser as UAStringParser;
10
11
/**
12
 * Class UserAgentClient
13
 *
14
 * @package vipnytt\RobotsTxtParser\Client\Directives
15
 */
16
class UserAgentClient implements RobotsTxtInterface
17
{
18
    use UrlParser;

    /**
     * Parsed rules; directive handlers are read from it per user-agent
     * (e.g. `$this->rules->allow[$this->userAgent]`)
     * @var UserAgentParser
     */
    private $rules;

    /**
     * User-agent group the rules are read for; set to self::USER_AGENT
     * by the constructor when no group matches
     * @var string
     */
    private $userAgent;

    /**
     * Origin user-agent, lower-cased by the constructor
     * @var string
     */
    private $userAgentOrigin;

    /**
     * Robots.txt base URL
     * @var string
     */
    private $base;

    /**
     * Status code parser
     * @var StatusCodeParser
     */
    private $statusCodeParser;

    /**
     * Comment export status; set to true once comment() has been called
     * @var bool
     */
    private $commentsExported = false;
55
56
    /**
57
     * UserAgentClient constructor.
58
     *
59
     * @param string $userAgent
60
     * @param UserAgentParser $rules
61
     * @param string $baseUri
62
     * @param int|null $statusCode
63
     */
64
    public function __construct($userAgent, UserAgentParser $rules, $baseUri, $statusCode)
65
    {
66
        $this->statusCodeParser = new StatusCodeParser($statusCode, parse_url($baseUri, PHP_URL_SCHEME));
67
        $this->rules = $rules;
0 ignored issues
show
Documentation Bug introduced by
It seems like $rules of type object<vipnytt\RobotsTxt...ctives\UserAgentParser> is incompatible with the declared type array of property $rules.

Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.

Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.

Loading history...
68
        $this->base = $baseUri;
69
        $this->userAgentOrigin = mb_strtolower($userAgent);
70
        $userAgentParser = new UAStringParser($this->userAgentOrigin);
71
        if (($this->userAgent = $userAgentParser->match($rules->userAgents)) === false) {
0 ignored issues
show
Documentation Bug introduced by
It seems like $userAgentParser->match($rules->userAgents) can also be of type false. However, the property $userAgent is declared as type string. Maybe add an additional type check?

Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
72
            $this->userAgent = self::USER_AGENT;
73
        }
74
    }
75
76
    /**
     * Tell whether the given URL may be crawled
     *
     * Delegates to check() with the allow directive.
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isAllowed($url)
    {
        $allowed = $this->check(self::DIRECTIVE_ALLOW, $url);
        return $allowed;
    }
86
87
    /**
     * Resolve the effective directive for a URL and compare it with the requested one
     *
     * @param string $directive - directive the caller asks about
     * @param string $url - URL to check
     * @return bool - true when the effective directive equals $directive
     * @throws ClientException - when the URL does not share the origin of the base URL
     */
    private function check($directive, $url)
    {
        $fullUrl = $this->urlConvertToFull($url, $this->base);
        $sameOrigin = $this->isUrlApplicable([$fullUrl, $this->base]);
        if (!$sameOrigin) {
            throw new ClientException('URL belongs to a different robots.txt');
        }

        // A verdict from the status code parser takes precedence over the rules
        $this->statusCodeParser->replaceUnofficial();
        $statusVerdict = $this->statusCodeParser->check();
        if ($statusVerdict !== null) {
            return $directive === $statusVerdict;
        }

        $disallowRules = $this->rules->disallow[$this->userAgent];
        $allowRules = $this->rules->allow[$this->userAgent];

        // Allowed by default; the allow rules are consulted last, so a matching
        // allow rule overrides a matching disallow rule
        $verdict = self::DIRECTIVE_ALLOW;
        if ($disallowRules->check($fullUrl)) {
            $verdict = self::DIRECTIVE_DISALLOW;
        }
        if ($allowRules->check($fullUrl)) {
            $verdict = self::DIRECTIVE_ALLOW;
        }
        return $directive === $verdict;
    }
118
119
    /**
     * Check if all the URLs belong to the same robots.txt
     *
     * Each URL is reduced to `scheme://host:port` and compared with the first
     * one. Scheme and host are lower-cased before comparison, since both are
     * case-insensitive in a URL. When a URL has no explicit port, the default
     * port of its scheme is looked up with getservbyname().
     *
     * @param string[] $urls
     * @return bool - false if any URL lacks a scheme or host, cannot be parsed,
     *                or differs in origin from the first URL
     */
    private function isUrlApplicable($urls)
    {
        $reference = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            if ($parsed === false || !isset($parsed['scheme'], $parsed['host'])) {
                // No origin can be derived, so the URL cannot be matched to any robots.txt
                return false;
            }
            $scheme = strtolower($parsed['scheme']);
            // getservbyname() returns false for unknown schemes; that concatenates as an empty port
            $port = isset($parsed['port']) ? $parsed['port'] : getservbyname($scheme, 'tcp');
            $origin = $scheme . '://' . strtolower($parsed['host']) . ':' . $port;
            if ($reference === null) {
                $reference = $origin;
            } elseif ($reference !== $origin) {
                return false;
            }
        }
        return true;
    }
139
140
    /**
     * Tell whether the given URL must not be crawled
     *
     * Delegates to check() with the disallow directive.
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isDisallowed($url)
    {
        $disallowed = $this->check(self::DIRECTIVE_DISALLOW, $url);
        return $disallowed;
    }
150
151
    /**
     * Cache-delay
     *
     * Builds a DelayClient from the exported cache-delay value (0 when none is
     * exported) together with the value of crawlDelay()->get().
     *
     * @return DelayClient
     */
    public function cacheDelay()
    {
        $exported = $this->rules->cacheDelay[$this->userAgent]->export();
        $delay = 0;
        if (isset($exported[self::DIRECTIVE_CACHE_DELAY])) {
            $delay = $exported[self::DIRECTIVE_CACHE_DELAY];
        }
        // NOTE(review): presumably used by DelayClient as a fallback value - confirm
        $crawlDelay = $this->crawlDelay()->get();
        return new DelayClient($this->base, $this->userAgent, $delay, $crawlDelay);
    }
162
163
    /**
     * Crawl-delay
     *
     * Builds a DelayClient from the exported crawl-delay value (0 when none is
     * exported) together with the value of requestRate()->get().
     *
     * @return DelayClient
     */
    public function crawlDelay()
    {
        $exported = $this->rules->crawlDelay[$this->userAgent]->export();
        $delay = 0;
        if (isset($exported[self::DIRECTIVE_CRAWL_DELAY])) {
            $delay = $exported[self::DIRECTIVE_CRAWL_DELAY];
        }
        // NOTE(review): presumably used by DelayClient as a fallback value - confirm
        $requestRate = $this->requestRate()->get();
        return new DelayClient($this->base, $this->userAgent, $delay, $requestRate);
    }
174
175
    /**
     * Request-rate
     *
     * Builds a RequestRateClient from the exported request-rate entries
     * (an empty array when none are exported).
     *
     * @return RequestRateClient
     */
    public function requestRate()
    {
        $exported = $this->rules->requestRate[$this->userAgent]->export();
        $rates = [];
        if (isset($exported[self::DIRECTIVE_REQUEST_RATE])) {
            $rates = $exported[self::DIRECTIVE_REQUEST_RATE];
        }
        return new RequestRateClient($this->base, $this->userAgent, $rates);
    }
186
187
    /**
     * Robot-version
     *
     * Builds a RobotVersionClient from the exported robot-version value
     * (null when none is exported).
     *
     * @return RobotVersionClient
     */
    public function robotVersion()
    {
        $exported = $this->rules->robotVersion[$this->userAgent]->export();
        $version = null;
        if (isset($exported[self::DIRECTIVE_ROBOT_VERSION])) {
            $version = $exported[self::DIRECTIVE_ROBOT_VERSION];
        }
        return new RobotVersionClient($version);
    }
197
198
    /**
     * Rule export
     *
     * Merges the exports of every directive handler of the current user-agent
     * into a single array.
     *
     * @return array
     */
    public function export()
    {
        // The order is significant: array_merge() lets later exports win on equal string keys
        $directives = [
            'allow',
            'comment',
            'cacheDelay',
            'crawlDelay',
            'disallow',
            'requestRate',
            'robotVersion',
            'visitTime',
        ];
        $exports = [];
        foreach ($directives as $directive) {
            $exports[] = $this->rules->{$directive}[$this->userAgent]->export();
        }
        return call_user_func_array('array_merge', $exports);
    }
216
217
    /**
     * Visit-time
     *
     * Builds a VisitTimeClient from the exported visit-time entries
     * (an empty array when none are exported).
     *
     * @return VisitTimeClient
     */
    public function visitTime()
    {
        $exported = $this->rules->visitTime[$this->userAgent]->export();
        $times = [];
        if (isset($exported[self::DIRECTIVE_VISIT_TIME])) {
            $times = $exported[self::DIRECTIVE_VISIT_TIME];
        }
        return new VisitTimeClient($times);
    }
228
229
    /**
     * UserAgentClient destructor.
     *
     * Raises an E_USER_NOTICE for every comment that was never read through
     * comment(), unless the client fell back to the default user-agent.
     */
    public function __destruct()
    {
        if ($this->commentsExported || $this->userAgent == self::USER_AGENT) {
            return;
        }
        // Comment from the `Comments` directive exists, but has not been read.
        $prefix = $this->userAgent . ' @ ' . $this->base . self::PATH . ': ';
        foreach ($this->comment()->export() as $message) {
            trigger_error($prefix . $message, E_USER_NOTICE);
        }
    }
241
242
    /**
     * Comment
     *
     * Marks the comments as exported, then builds a CommentClient from the
     * exported comment entries (an empty array when none are exported).
     *
     * @return CommentClient
     */
    public function comment()
    {
        // Flag first, so the destructor will not report these comments as unread
        $this->commentsExported = true;
        $exported = $this->rules->comment[$this->userAgent]->export();
        $comments = [];
        if (isset($exported[self::DIRECTIVE_COMMENT])) {
            $comments = $exported[self::DIRECTIVE_COMMENT];
        }
        return new CommentClient($comments);
    }
254
}
255