Completed
Push — master ( 705095...78cb27 )
by Jan-Petter
04:57
created

UserAgentClient::isUrlApplicable()   B

Complexity

Conditions 5
Paths 7

Size

Total Lines 14
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 14
rs 8.8571
cc 5
eloc 10
nc 7
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
6
use vipnytt\RobotsTxtParser\Parser\UrlParser;
7
use vipnytt\RobotsTxtParser\RobotsTxtInterface;
8
9
/**
 * Class UserAgentClient
 *
 * Answers access (allow/disallow) and delay questions for one user-agent,
 * based on the supplied rule set, robots.txt base URL and HTTP status code.
 *
 * @package vipnytt\RobotsTxtParser\Client
 */
class UserAgentClient implements RobotsTxtInterface
{
    use UrlParser;

    /**
     * Rules, indexed by directive name. Each element is used as an object
     * exposing export(), and (for allow/disallow) check($url).
     * @var array
     */
    protected $rules;

    /**
     * User-agent
     * @var string
     */
    protected $userAgent;

    /**
     * Robots.txt base URL
     * @var string
     */
    protected $base;

    /**
     * Status code parser
     * @var StatusCodeParser
     */
    protected $statusCodeParser;

    /**
     * UserAgentClient constructor.
     *
     * @param array $rules - rule objects indexed by directive name
     * @param string $userAgent
     * @param string $baseUrl - its scheme is handed to the status code parser
     * @param int $statusCode
     */
    public function __construct($rules, $userAgent, $baseUrl, $statusCode)
    {
        $this->statusCodeParser = new StatusCodeParser($statusCode, parse_url($baseUrl, PHP_URL_SCHEME));
        $this->userAgent = $userAgent;
        $this->rules = $rules;
        $this->base = $baseUrl;
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException - if the URL belongs to a different robots.txt
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check whether the outcome for an URL equals the given directive
     *
     * The URL is first expanded against the base URL and verified to share
     * its origin. A non-null verdict from the status code parser is final;
     * otherwise the disallow and allow rules are consulted.
     *
     * @param string $directive - the directive the outcome is compared to
     * @param string $url - URL to check
     * @return bool
     * @throws ClientException - if the URL belongs to a different robots.txt
     */
    protected function check($directive, $url)
    {
        $url = $this->urlConvertToFull($url, $this->base);
        if (!$this->isUrlApplicable([$url, $this->base])) {
            throw new ClientException('URL belongs to a different robots.txt, please check it against that one instead');
        }
        $this->statusCodeParser->replaceUnofficial();
        if (($result = $this->statusCodeParser->check()) !== null) {
            return $directive === $result;
        }
        // Allowed unless a rule matches. Disallow is evaluated first and
        // allow last, so a matching allow rule overrides a matching disallow.
        $result = self::DIRECTIVE_ALLOW;
        foreach ([self::DIRECTIVE_DISALLOW, self::DIRECTIVE_ALLOW] as $currentDirective) {
            if ($this->rules[$currentDirective]->check($url)) {
                $result = $currentDirective;
            }
        }
        return $directive === $result;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * Every URL is reduced to "scheme://host:port" and all of them must be
     * identical. Scheme and host are compared case-insensitively, and a
     * missing port is replaced by the default port of the scheme. An URL
     * without a scheme or host (or one parse_url() rejects) is never
     * applicable.
     *
     * @param string[] $urls - URLs to compare with each other
     * @return bool - true if all URLs share the same scheme, host and port
     */
    protected function isUrlApplicable($urls)
    {
        // Well-known defaults are consulted before the system services
        // database, so the result does not depend on the host configuration.
        $defaultPorts = [
            'http' => 80,
            'https' => 443,
            'ftp' => 21,
        ];
        $reference = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            if ($parsed === false || !isset($parsed['scheme'], $parsed['host'])) {
                // No origin can be established for this URL
                return false;
            }
            $scheme = strtolower($parsed['scheme']);
            $host = strtolower($parsed['host']);
            if (isset($parsed['port'])) {
                $port = $parsed['port'];
            } elseif (isset($defaultPorts[$scheme])) {
                $port = $defaultPorts[$scheme];
            } else {
                $port = getservbyname($scheme, 'tcp');
                if ($port === false) {
                    // Unknown scheme: compare with an empty port
                    $port = '';
                }
            }
            $assembled = $scheme . '://' . $host . ':' . $port;
            if ($reference === null) {
                $reference = $assembled;
            } elseif ($reference !== $assembled) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException - if the URL belongs to a different robots.txt
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Get Cache-delay
     *
     * Falls back to the Crawl-delay when no Cache-delay is exported.
     *
     * @return float|int
     */
    public function getCacheDelay()
    {
        $exported = $this->rules[self::DIRECTIVE_CACHE_DELAY]->export();
        return isset($exported[self::DIRECTIVE_CACHE_DELAY]) ? $exported[self::DIRECTIVE_CACHE_DELAY] : $this->getCrawlDelay();
    }

    /**
     * Get Crawl-delay
     *
     * @return float|int - 0 when no Crawl-delay is exported
     */
    public function getCrawlDelay()
    {
        $exported = $this->rules[self::DIRECTIVE_CRAWL_DELAY]->export();
        return isset($exported[self::DIRECTIVE_CRAWL_DELAY]) ? $exported[self::DIRECTIVE_CRAWL_DELAY] : 0;
    }

    /**
     * Rule export
     *
     * Collects the export of every rule object, skipping those whose export
     * is empty.
     *
     * @return array - exported values indexed by directive name
     */
    public function export()
    {
        $result = [];
        foreach ($this->rules as $directive => $object) {
            if (!empty($export = $object->export())) {
                $result[$directive] = $export[$directive];
            }
        }
        return $result;
    }
}
166