Completed
Push — master ( f3d3e2...49e9a0 )
by Jan-Petter
04:42
created

UserAgentClient::validateRules()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 9
rs 9.6666
cc 3
eloc 5
nc 3
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Directives\DisAllow;
5
use vipnytt\RobotsTxtParser\Exceptions\ClientException;
6
7
/**
 * Class UserAgentClient
 *
 * Answers crawl-permission and delay queries for a single user-agent, based on
 * the allow/disallow rule sets, the robots.txt origin and the HTTP status code
 * handed to the constructor.
 *
 * @package vipnytt\RobotsTxtParser
 */
class UserAgentClient implements RobotsTxtInterface
{
    /**
     * Allow rules
     * @var DisAllow
     */
    protected $allow;

    /**
     * Disallow rules
     * @var DisAllow
     */
    protected $disallow;

    /**
     * User-agent
     * @var string
     */
    protected $userAgent;

    /**
     * Robots.txt origin
     * @var string
     */
    protected $origin;

    /**
     * Status code parser
     * @var StatusCodeParser
     */
    protected $statusCodeParser;

    /**
     * UserAgentClient constructor.
     *
     * @param DisAllow $allow - rule set consulted for the allow directive
     * @param DisAllow $disallow - rule set consulted for the disallow directive
     * @param string $userAgent
     * @param string $origin - URL the robots.txt belongs to; its scheme is passed to the status code parser
     * @param int $statusCode - passed to the status code parser together with the origin scheme
     */
    public function __construct($allow, $disallow, $userAgent, $origin, $statusCode)
    {
        $this->statusCodeParser = new StatusCodeParser($statusCode, parse_url($origin, PHP_URL_SCHEME));
        $this->userAgent = $userAgent;
        $this->origin = $origin;
        $this->allow = $allow;
        $this->disallow = $disallow;
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not share scheme, host and port with the origin
     */
    public function isAllowed($url)
    {
        return $this->check(self::DIRECTIVE_ALLOW, $url);
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param string $url
     * @return bool
     * @throws ClientException if the URL does not share scheme, host and port with the origin
     */
    public function isDisallowed($url)
    {
        return $this->check(self::DIRECTIVE_DISALLOW, $url);
    }

    /**
     * Check whether the effective directive for a URL equals the requested one
     *
     * The status code parser gets the first say; only when it returns null are
     * the rule sets consulted. The default is "allow"; the disallow rules are
     * tested first and the allow rules second, so a matching allow rule
     * overrides a matching disallow rule.
     *
     * @param string $directive - directive the caller is asking about
     * @param string $url - URL to check
     * @return bool
     * @throws ClientException if the URL does not share scheme, host and port with the origin
     */
    protected function check($directive, $url)
    {
        if (!$this->isUrlApplicable([$url, $this->origin])) {
            throw new ClientException('URL belongs to a different robots.txt, please check it against that one instead');
        }
        $this->statusCodeParser->replaceUnofficial();
        $verdict = $this->statusCodeParser->check();
        if ($verdict !== null) {
            return $directive === $verdict;
        }
        $verdict = self::DIRECTIVE_ALLOW;
        // Order matters: the last matching rule set wins.
        foreach ([self::DIRECTIVE_DISALLOW, self::DIRECTIVE_ALLOW] as $currentDirective) {
            if ($this->$currentDirective->check($url)) {
                $verdict = $currentDirective;
            }
        }
        return $directive === $verdict;
    }

    /**
     * Check if the URL belongs to current robots.txt
     *
     * All given URLs must resolve to the same scheme, host and port. A URL
     * that cannot be parsed, or that lacks a scheme or host, never matches.
     *
     * @param string[] $urls
     * @return bool
     */
    protected function isUrlApplicable($urls)
    {
        $reference = null;
        foreach ($urls as $url) {
            $base = $this->urlBase($url);
            if ($base === false) {
                return false;
            }
            if ($reference === null) {
                $reference = $base;
            } elseif ($reference !== $base) {
                return false;
            }
        }
        return true;
    }

    /**
     * Build the normalised "scheme://host:port" part of a URL
     *
     * Scheme and host are lower-cased because both are case-insensitive
     * (RFC 3986). When the URL carries no explicit port, the default port of
     * the scheme is looked up; if that lookup fails the port part stays empty.
     *
     * @param string $url
     * @return string|false - false when the URL has no usable scheme or host
     */
    private function urlBase($url)
    {
        $parsed = parse_url($url);
        if (!is_array($parsed) || !isset($parsed['scheme'], $parsed['host'])) {
            return false;
        }
        $scheme = strtolower($parsed['scheme']);
        if (isset($parsed['port'])) {
            $port = $parsed['port'];
        } else {
            $default = getservbyname($scheme, 'tcp');
            $port = ($default === false) ? '' : $default;
        }
        return $scheme . '://' . strtolower($parsed['host']) . ':' . $port;
    }

    /**
     * Get Cache-delay
     *
     * Falls back to the Crawl-delay when no Cache-delay is exported.
     *
     * @return float|int
     */
    public function getCacheDelay()
    {
        $exported = $this->exportDirective(self::DIRECTIVE_CACHE_DELAY);
        return isset($exported[self::DIRECTIVE_CACHE_DELAY]) ? $exported[self::DIRECTIVE_CACHE_DELAY] : $this->getCrawlDelay();
    }

    /**
     * Get Crawl-delay
     *
     * @return float|int - 0 when no Crawl-delay is exported
     */
    public function getCrawlDelay()
    {
        $exported = $this->exportDirective(self::DIRECTIVE_CRAWL_DELAY);
        return isset($exported[self::DIRECTIVE_CRAWL_DELAY]) ? $exported[self::DIRECTIVE_CRAWL_DELAY] : 0;
    }

    /**
     * Export the handler stored in the property named after a directive
     *
     * NOTE(review): this class neither declares nor assigns properties for the
     * delay directives, so unless a subclass provides them the lookup finds
     * nothing and the getters return their defaults. Confirm where the delay
     * handlers are supposed to be injected.
     *
     * @param string $directive - directive constant, also used as the property name
     * @return array - the handler's export, or an empty array when no handler object is set
     */
    private function exportDirective($directive)
    {
        if (!isset($this->$directive) || !is_object($this->$directive)) {
            return [];
        }
        return $this->$directive->export();
    }
}
153