Completed
Push — 2.0-dev ( 2c252e...97b412 )
by Jan-Petter
02:13
created

UserAgentTools::isAllowed()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser\Client\Directives;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
5
use vipnytt\RobotsTxtParser\Parser\Directives\DirectiveParserCommons;
6
use vipnytt\RobotsTxtParser\Parser\Directives\SubDirectiveHandler;
7
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser;
8
use vipnytt\RobotsTxtParser\Parser\UrlParser;
9
use vipnytt\RobotsTxtParser\RobotsTxtInterface;
10
11
class UserAgentTools implements RobotsTxtInterface
{
    use UrlParser;
    use DirectiveParserCommons;

    /**
     * Rules
     * @var SubDirectiveHandler
     */
    protected $handler;

    /**
     * Base Uri
     * @var string
     */
    private $base;

    /**
     * Status code
     * @var int|null
     */
    private $statusCode;

    /**
     * UserAgentTools constructor.
     *
     * @param SubDirectiveHandler $handler
     * @param string $base
     * @param int|null $statusCode
     */
    public function __construct(SubDirectiveHandler $handler, $base, $statusCode)
    {
        $this->handler = $handler;
        $this->base = $base;
        $this->statusCode = $statusCode;
    }

    /**
     * UserAgentTools destructor.
     *
     * Requests the comment client from the handler when this object is destroyed.
     * NOTE(review): the side effect of building that client is defined elsewhere
     * (presumably it surfaces robots.txt comments to the user) - confirm.
     */
    public function __destruct()
    {
        $this->handler->comment()->client();
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check whether the given directive is the one that applies to the URL
     *
     * Evaluation order:
     *  1. The URL is converted to a full URL relative to the base, and must share
     *     scheme, host and port with the base.
     *  2. A non-null result from the status code override decides the outcome.
     *  3. Outside the permitted visit time, the URL is treated as disallowed.
     *  4. Otherwise the disallow and allow rule lists are consulted.
     *
     * @param string $directive
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    private function check($directive, $url)
    {
        $directive = $this->validateDirective($directive, [self::DIRECTIVE_DISALLOW, self::DIRECTIVE_ALLOW]);
        $url = $this->urlConvertToFull($url, $this->base);
        if (!$this->isUrlApplicable([$url, $this->base])) {
            throw new ClientException('URL belongs to a different robots.txt');
        }
        $statusCodeParser = new StatusCodeParser($this->statusCode, parse_url($this->base, PHP_URL_SCHEME));
        $statusCodeParser->codeOverride();
        if (($result = $statusCodeParser->accessOverrideCheck()) !== null) {
            return $directive === $result;
        }
        if ($this->handler->visitTime()->client()->isVisitTime() === false) {
            // $result is always null at this point, so the outcome has to be derived
            // from the requested directive: outside visit time everything is disallowed.
            return $directive === self::DIRECTIVE_DISALLOW;
        }
        // Default to allow when no rule lists the URL.
        $result = self::DIRECTIVE_ALLOW;
        // Allow is evaluated last, so it takes precedence when both rule sets list the URL.
        foreach (
            [
                self::DIRECTIVE_DISALLOW => $this->handler->disallow(),
                self::DIRECTIVE_ALLOW => $this->handler->allow(),
            ] as $currentDirective => $ruleClient
        ) {
            if ($ruleClient->client()->isListed($url)) {
                $result = $currentDirective;
            }
        }
        return $directive === $result;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * All URLs must share the same scheme, host and port. A missing port is
     * resolved from the scheme via getservbyname(). Scheme and host are compared
     * case-insensitively. A URL that cannot be parsed, or that lacks a scheme or
     * host, is never applicable.
     *
     * @param string[] $urls
     * @return bool
     */
    private function isUrlApplicable($urls)
    {
        $reference = null;
        foreach ($urls as $url) {
            $parsed = parse_url($url);
            if ($parsed === false || !isset($parsed['scheme'], $parsed['host'])) {
                return false;
            }
            $scheme = strtolower($parsed['scheme']);
            $host = strtolower($parsed['host']);
            // getservbyname() returns false for unknown schemes, which concatenates as an empty port.
            $port = isset($parsed['port']) ? $parsed['port'] : getservbyname($scheme, 'tcp');
            $assembled = $scheme . '://' . $host . ':' . $port;
            if ($reference === null) {
                $reference = $assembled;
            } elseif ($reference !== $assembled) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Rule export
     *
     * Collects the export of every sub-directive client, keyed by directive name.
     *
     * @return array
     */
    public function export()
    {
        return [
            self::DIRECTIVE_ROBOT_VERSION => $this->handler->robotVersion()->client()->export(),
            self::DIRECTIVE_VISIT_TIME => $this->handler->visitTime()->client()->export(),
            self::DIRECTIVE_DISALLOW => $this->handler->disallow()->client()->export(),
            self::DIRECTIVE_ALLOW => $this->handler->allow()->client()->export(),
            self::DIRECTIVE_CRAWL_DELAY => $this->handler->crawlDelay()->client()->export(),
            self::DIRECTIVE_CACHE_DELAY => $this->handler->cacheDelay()->client()->export(),
            self::DIRECTIVE_REQUEST_RATE => $this->handler->requestRate()->client()->export(),
            self::DIRECTIVE_COMMENT => $this->handler->comment()->client()->export(),
        ];
    }
}
155