1 | <?php |
||
14 | abstract class AbstractCrawler implements CrawlerInterface |
||
15 | { |
||
/**
 * The client executing the http requests.
 *
 * crawl() calls fetch() on it with the url and a user agent, and expects
 * the url and the response back as a two-element list.
 *
 * @var ClientInterface
 */
protected $client;

/**
 * A logger that remembers crawled requests.
 *
 * crawl() logs the url together with the current date right before the
 * client fetches it.
 *
 * @var RequestLoggerInterface
 */
protected $logger;

/**
 * The rate limit to apply when crawling.
 *
 * Checked at the start of every crawl(); when the limit is reached the
 * crawl is aborted with a RateLimitException before any request is made.
 *
 * @var RateLimitInterface
 */
protected $rateLimit;

/**
 * Whether to randomize user agents on requests.
 *
 * @var bool
 */
protected $randomizeUserAgent = false;

/**
 * The response of the last crawled page.
 *
 * Null until the first fetch, and reset to null at the start of every
 * crawl().
 *
 * @var ResponseInterface|null
 */
protected $response;

/**
 * The last crawled url. When following redirects, the url is updated with the effective url.
 *
 * Null until the client has returned from its first fetch.
 *
 * @var string|null
 */
protected $url;
||
57 | |||
/**
 * Stores the collaborators the crawler uses for every request.
 *
 * @param ClientInterface        $client             Client that performs the http requests.
 * @param RequestLoggerInterface $logger             Logger that records each crawled request.
 * @param RateLimitInterface     $ratelimit          Rate limit that is checked before crawling.
 * @param bool                   $randomizeUserAgent Whether to randomize user agents on requests.
 */
public function __construct(
    ClientInterface $client,
    RequestLoggerInterface $logger,
    RateLimitInterface $ratelimit,
    $randomizeUserAgent = false
) {
    $this->randomizeUserAgent = $randomizeUserAgent;
    $this->rateLimit = $ratelimit;
    $this->logger = $logger;
    $this->client = $client;
}
|
71 | |||
/**
 * Returns the client that was handed to the constructor.
 *
 * @inheritdoc
 *
 * @return ClientInterface
 */
public function getClient()
{
    return $this->client;
}
||
79 | |||
/**
 * Returns the request logger that was handed to the constructor.
 *
 * @inheritdoc
 *
 * @return RequestLoggerInterface
 */
public function getLogger()
{
    return $this->logger;
}
||
87 | |||
/**
 * Returns the rate limit that was handed to the constructor.
 *
 * @inheritdoc
 *
 * @return RateLimitInterface
 */
public function getRateLimit()
{
    return $this->rateLimit;
}
||
95 | |||
/**
 * Returns the response stored by the most recent crawl.
 *
 * The stored response is cleared at the start of every crawl, so this also
 * throws after a crawl that was aborted before the client fetched anything.
 *
 * @inheritdoc
 *
 * @return ResponseInterface
 *
 * @throws \RuntimeException When there is no response to return.
 */
public function getLastResponse()
{
    if ($this->response) {
        return $this->response;
    }

    throw new \RuntimeException('Crawler has yet to make a request');
}
||
107 | |||
/**
 * Returns the url stored by the most recent fetch.
 *
 * @inheritdoc
 *
 * @return string
 *
 * @throws \RuntimeException When no url has been stored yet (or the stored url is empty).
 */
public function getLastUrl()
{
    if ($this->url) {
        return $this->url;
    }

    throw new \RuntimeException('Crawler has yet to make a request');
}
||
119 | |||
/**
 * Crawls the given url and returns the contents of the response body.
 *
 * The stored response is cleared up front, so it is unavailable when the
 * crawl is aborted before the client has fetched anything. The stored url
 * is only replaced once the client returns.
 *
 * @inheritdoc
 *
 * @param string $url The url to crawl.
 *
 * @return string The body contents (assuming a PSR-7 style response body).
 *
 * @throws RateLimitException          When the configured rate limit is reached, or
 *                                     the server replies with a 429.
 * @throws NotFoundException           When the response is considered not found.
 * @throws UnexpectedResponseException When the response is not OK.
 */
public function crawl($url)
{
    $this->response = null;

    // Refuse to send anything while our own rate limit is exhausted.
    if ($this->rateLimit->limitReached()) {
        throw new RateLimitException(
            $url,
            sprintf('Reached the rate limit of %s', $this->rateLimit->getLimit()),
            $this->rateLimit->getRetryDate()
        );
    }

    // The request is logged before it is sent, so it is recorded even if
    // the fetch itself fails.
    $this->logger->logRequest($url, new \DateTime());

    list($this->url, $this->response) = $this->client->fetch($url, $this->getUserAgent($url));

    // The server is telling us to back off.
    if ($this->response->getStatusCode() === 429) {
        throw new RateLimitException(
            $url,
            sprintf('Server replied with response %d (Too Many Requests)', 429),
            $this->getRetryAfterDate()
        );
    }

    if ($this->islastResponseNotFound()) {
        throw new NotFoundException($url, $this->response);
    }

    if (!$this->islastResponseOk()) {
        throw new UnexpectedResponseException($url, $this->response);
    }

    $body = $this->response->getBody();

    // getContents() only yields what is left after the current position of
    // the stream. The client or the checks above may already have read
    // (part of) the body, so start from the beginning whenever possible.
    if ($body->isSeekable()) {
        $body->rewind();
    }

    $contents = $body->getContents();

    // rewind stream, in case we need to use the last response
    if ($body->isSeekable()) {
        $body->rewind();
    }

    return $contents;
}
||
165 | |||
/**
 * Left abstract: every concrete crawler has to supply its own implementation.
 *
 * NOTE(review): the contract (return type, when it may be called) is
 * inherited from CrawlerInterface, which is not visible here; presumably it
 * yields the urls to crawl next, based on the last response. Confirm against
 * the interface.
 *
 * @inheritdoc
 */
abstract public function getNextUrls();
||
170 | |||
171 | /** |
||
172 | * @param string $url |
||
173 | * |
||
174 | * @return string|null |
||
175 | */ |
||
176 | 10 | protected function getUserAgent($url) |
|
184 | |||
185 | /** |
||
186 | * @return \DateTime |
||
187 | */ |
||
188 | 4 | protected function getRetryAfterDate() |
|
204 | |||
205 | /** |
||
206 | * Returns whether the last response is a 200 OK. |
||
207 | * |
||
208 | * @return bool |
||
209 | */ |
||
210 | 6 | protected function islastResponseOk() |
|
214 | |||
215 | /** |
||
216 | * Returns whether the last response is not found. This includes checks for |
||
217 | * soft 404's, redirects from what should be 404/410 responses to 200 OK |
||
218 | * pages, and other tricks like that. |
||
219 | * |
||
220 | * In other words: returns true if the last response is not the actual page |
||
221 | * that was requested. |
||
222 | * |
||
223 | * @return bool |
||
224 | */ |
||
225 | 6 | protected function islastResponseNotFound() |
|
229 | } |
||
230 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.