Completed
Push — master ( dbcf6b...d53ab6 )
by Peter
44:13 queued 37:37
created

AbstractCrawler::crawl()   B

Complexity

Conditions 6
Paths 6

Size

Total Lines 42

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 6.0029

Importance

Changes 0
Metric Value
dl 0
loc 42
ccs 22
cts 23
cp 0.9565
rs 8.6257
c 0
b 0
f 0
cc 6
nc 6
nop 1
crap 6.0029
1
<?php
2
3
namespace TreeHouse\IoBundle\Scrape\Crawler;
4
5
use Faker\Provider\UserAgent;
6
use Psr\Http\Message\ResponseInterface;
7
use TreeHouse\IoBundle\Scrape\Crawler\Client\ClientInterface;
8
use TreeHouse\IoBundle\Scrape\Crawler\Log\RequestLoggerInterface;
9
use TreeHouse\IoBundle\Scrape\Crawler\RateLimit\RateLimitInterface;
10
use TreeHouse\IoBundle\Scrape\Exception\NotFoundException;
11
use TreeHouse\IoBundle\Scrape\Exception\RateLimitException;
12
use TreeHouse\IoBundle\Scrape\Exception\UnexpectedResponseException;
13
14
/**
 * Base crawler: fetches a url through the configured client, logs the
 * request, enforces the configured rate limit and translates unsuccessful
 * responses into exceptions.
 *
 * Subclasses must implement getNextUrls().
 */
abstract class AbstractCrawler implements CrawlerInterface
{
    /**
     * The client executing the http requests.
     *
     * @var ClientInterface
     */
    protected $client;

    /**
     * A logger that remembers crawled requests.
     *
     * @var RequestLoggerInterface
     */
    protected $logger;

    /**
     * The rate limit to apply when crawling.
     *
     * @var RateLimitInterface
     */
    protected $rateLimit;

    /**
     * Whether to randomize user agents on requests.
     *
     * @var bool
     */
    protected $randomizeUserAgent = false;

    /**
     * The response of the last crawled page. Reset to null at the start of
     * every crawl() call.
     *
     * @var ResponseInterface|null
     */
    protected $response;

    /**
     * The last crawled url. When following redirects, the url is updated with the effective url.
     *
     * @var string|null
     */
    protected $url;

    /**
     * @param ClientInterface        $client
     * @param RequestLoggerInterface $logger
     * @param RateLimitInterface     $ratelimit
     * @param bool                   $randomizeUserAgent
     */
    public function __construct(ClientInterface $client, RequestLoggerInterface $logger, RateLimitInterface $ratelimit, $randomizeUserAgent = false)
    {
        $this->client = $client;
        $this->logger = $logger;
        $this->rateLimit = $ratelimit;
        $this->randomizeUserAgent = $randomizeUserAgent;
    }

    /**
     * @inheritdoc
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * @inheritdoc
     */
    public function getLogger()
    {
        return $this->logger;
    }

    /**
     * @inheritdoc
     */
    public function getRateLimit()
    {
        return $this->rateLimit;
    }

    /**
     * @inheritdoc
     *
     * @throws \RuntimeException When no response is available
     */
    public function getLastResponse()
    {
        if (!$this->response) {
            throw new \RuntimeException('Crawler has yet to make a request');
        }

        return $this->response;
    }

    /**
     * @inheritdoc
     *
     * @throws \RuntimeException When no url is available
     */
    public function getLastUrl()
    {
        if (!$this->url) {
            throw new \RuntimeException('Crawler has yet to make a request');
        }

        return $this->url;
    }

    /**
     * @inheritdoc
     *
     * @throws RateLimitException          When the configured rate limit is reached, or the server replies with a 429
     * @throws NotFoundException           When islastResponseNotFound() reports the page as not found
     * @throws UnexpectedResponseException When islastResponseOk() rejects the response
     */
    public function crawl($url)
    {
        // forget the previous response, so a failed crawl never exposes a stale one
        $this->response = null;

        if ($this->rateLimit->limitReached()) {
            throw new RateLimitException(
                $url,
                sprintf('Reached the rate limit of %s', $this->rateLimit->getLimit()),
                $this->rateLimit->getRetryDate()
            );
        }

        // the request is logged before it is executed
        $this->logger->logRequest($url, new \DateTime());

        list($this->url, $this->response) = $this->client->fetch($url, $this->getUserAgent($url));

        if ($this->response->getStatusCode() === 429) {
            throw new RateLimitException(
                $url,
                sprintf('Server replied with response %d (Too Many Requests)', 429),
                $this->getRetryAfterDate()
            );
        }

        if ($this->islastResponseNotFound()) {
            throw new NotFoundException($url, $this->response);
        }

        if (!$this->islastResponseOk()) {
            throw new UnexpectedResponseException($url, $this->response);
        }

        $body = $this->response->getBody();
        $contents = $body->getContents();

        // rewind stream, in case we need to use the last response
        if ($body->isSeekable()) {
            $body->rewind();
        }

        return $contents;
    }

    /**
     * @inheritdoc
     */
    abstract public function getNextUrls();

    /**
     * Returns the user agent to send with the request for the given url, or
     * null when user agents are not randomized.
     *
     * Note: $url is not used by this implementation; it is kept in the
     * signature because crawl() passes it and overriding classes may rely on it.
     *
     * @param string $url
     *
     * @return string|null
     */
    protected function getUserAgent($url)
    {
        if (!$this->randomizeUserAgent) {
            return null;
        }

        return UserAgent::userAgent();
    }

    /**
     * Derives the retry date from the Retry-After header of the last response.
     *
     * The header may hold either a number of seconds or a date; anything
     * else (including a missing header) yields null.
     *
     * @return \DateTime|null
     */
    protected function getRetryAfterDate()
    {
        // getHeaderLine() yields an empty string (not null) for a missing
        // header; the cast keeps a null return from a non-conforming
        // implementation working as well
        $retryAfter = trim((string) $this->response->getHeaderLine('Retry-After'));

        if ('' === $retryAfter) {
            return null;
        }

        if (is_numeric($retryAfter)) {
            return new \DateTime(sprintf('+%d seconds', $retryAfter));
        }

        $date = \DateTime::createFromFormat(DATE_RFC2822, $retryAfter);

        return false !== $date ? $date : null;
    }

    /**
     * Returns whether the last response is a 200 OK.
     *
     * @return bool
     */
    protected function islastResponseOk()
    {
        return $this->getLastResponse()->getStatusCode() === 200;
    }

    /**
     * Returns whether the last response is not found. This includes checks for
     * soft 404's, redirects from what should be 404/410 responses to 200 OK
     * pages, and other tricks like that.
     *
     * In other words: returns true if the last response is not the actual page
     * that was requested.
     *
     * This base implementation only looks at the status code (404 or 410).
     *
     * @return bool
     */
    protected function islastResponseNotFound()
    {
        // strict comparison, consistent with the other status code checks
        return in_array($this->getLastResponse()->getStatusCode(), [404, 410], true);
    }
}
230