<?php

namespace TreeHouse\IoBundle\Scrape;

use GuzzleHttp\Psr7\Uri;
use Symfony\Component\EventDispatcher\EventDispatcher;
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
use TreeHouse\Feeder\Exception\FilterException;
use TreeHouse\Feeder\Exception\ModificationException;
use TreeHouse\Feeder\Exception\ValidationException;
use TreeHouse\IoBundle\Entity\Scraper as ScraperEntity;
use TreeHouse\IoBundle\Import\Exception\FailedItemException;
use TreeHouse\IoBundle\Scrape\Crawler\CrawlerInterface;
use TreeHouse\IoBundle\Scrape\Event\FailedItemEvent;
use TreeHouse\IoBundle\Scrape\Event\RateLimitEvent;
use TreeHouse\IoBundle\Scrape\Event\ScrapeResponseEvent;
use TreeHouse\IoBundle\Scrape\Event\ScrapeUrlEvent;
use TreeHouse\IoBundle\Scrape\Event\SkippedItemEvent;
use TreeHouse\IoBundle\Scrape\Event\SuccessItemEvent;
use TreeHouse\IoBundle\Scrape\Exception\CrawlException;
use TreeHouse\IoBundle\Scrape\Exception\RateLimitException;
use TreeHouse\IoBundle\Scrape\Exception\UnexpectedResponseException;
use TreeHouse\IoBundle\Scrape\Handler\HandlerInterface;
use TreeHouse\IoBundle\Scrape\Parser\ParserInterface;

class Scraper implements ScraperInterface
{
    /**
     * @var CrawlerInterface
     */
    protected $crawler;

    /**
     * @var ParserInterface
     */
    protected $parser;

    /**
     * @var HandlerInterface
     */
    protected $handler;

    /**
     * @var EventDispatcherInterface
     */
    protected $eventDispatcher;

    /**
     * @var bool
     */
    protected $async = false;

    /**
     * @param CrawlerInterface         $crawler
     * @param ParserInterface          $parser
     * @param HandlerInterface         $handler
     * @param EventDispatcherInterface $dispatcher
     */
    public function __construct(CrawlerInterface $crawler, ParserInterface $parser, HandlerInterface $handler, EventDispatcherInterface $dispatcher = null)
    {
        $this->crawler = $crawler;
        $this->parser = $parser;
        $this->handler = $handler;
        $this->eventDispatcher = $dispatcher ?: new EventDispatcher();
    }

    /**
     * @inheritdoc
     */
    public function getCrawler()
    {
        return $this->crawler;
    }

    /**
     * @inheritdoc
     */
    public function getEventDispatcher()
    {
        return $this->eventDispatcher;
    }

    /**
     * @inheritdoc
     */
    public function setAsync($async)
    {
        $this->async = $async;
    }

    /**
     * @inheritdoc
     */
    public function isAsync()
    {
        return $this->async;
    }

    /**
     * @inheritdoc
     */
    public function scrape(ScraperEntity $scraper, $url, $continue = true)
    {
        $url = $this->normalizeUrl($url);

        try {
            $html = $this->crawler->crawl($url);

            // put it in a bag
            $item = new ScrapedItemBag($scraper, $url, $html);

            // scrape the item and the next urls
            $this->scrapeItem($item);

            if ($continue) {
                $this->scrapeNext($scraper);
            }
        } catch (RateLimitException $e) {
            $this->handleRateLimitException($scraper, $url, $e);

            throw $e;
        } catch (UnexpectedResponseException $e) {
            // we didn't get a 200 OK response, let the application know
            $this->eventDispatcher->dispatch(
                ScraperEvents::SCRAPE_URL_NOT_OK,
                new ScrapeResponseEvent($scraper, $url, $e->getResponse())
            );

            throw $e;
        } catch (CrawlException $e) {
            // something bad happened, let the calling command handle this
            throw $e;
        }
    }

    /**
     * @param ScraperEntity $scraper
     */
    public function scrapeNext(ScraperEntity $scraper)
    {
        foreach ($this->crawler->getNextUrls() as $url) {
            if ($this->async) {
                $this->scrapeAfter($scraper, $url, new \DateTime());
            } else {
                $this->scrape($scraper, $url);
            }
        }
    }

    /**
     * @inheritdoc
     */
    public function scrapeAfter(ScraperEntity $scraper, $url, \DateTime $date)
    {
        // EventDispatcherInterface::dispatch() declares two parameters here
        // (the event name and the event object); the extra $date argument
        // below is the kind of call the argument-count check described at
        // the end of this report flags.
        $this->eventDispatcher->dispatch(
            ScraperEvents::SCRAPE_NEXT_URL,
            new ScrapeUrlEvent($scraper, $url),
            $date
        );
    }

    /**
     * @param ScrapedItemBag $item
     */
    protected function scrapeItem(ScrapedItemBag $item)
    {
        try {
            $this->parser->parse($item);
            $source = $this->handler->handle($item);

            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_SUCCESS, new SuccessItemEvent($this, $item, $source));
        } catch (FilterException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_SKIPPED, new SkippedItemEvent($this, $item, $e->getMessage()));
        } catch (ValidationException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        } catch (FailedItemException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        } catch (ModificationException $e) {
            if ($e->getPrevious()) {
                $e = $e->getPrevious();
            }

            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        }
    }

    /**
     * @param ScraperEntity      $scraper
     * @param string             $url
     * @param RateLimitException $e
     */
    protected function handleRateLimitException(ScraperEntity $scraper, $url, RateLimitException $e)
    {
        $date = $e->getRetryDate();

        // dispatch event about rate limit
        if ($this->async) {
            $this->eventDispatcher->dispatch(
                ScraperEvents::RATE_LIMIT_REACHED,
                new RateLimitEvent($scraper, $url, $date)
            );
        } else {
            // if no retry-date is given, sleep for a minute
            $sleepTime = (null !== $date) ? $date->getTimestamp() - time() : 60;

            sleep($sleepTime);
        }
    }

    /**
     * @param string $url
     *
     * @return string
     */
    protected function normalizeUrl($url)
    {
        return (string) new Uri($url);
    }
}
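For context, a minimal usage sketch of the class above. This is an illustration under stated assumptions, not part of the bundle: $crawler, $parser and $handler stand for concrete implementations of CrawlerInterface, ParserInterface and HandlerInterface, and $scraperEntity for a persisted Scraper entity.

// Hypothetical wiring: the three collaborators are assumed implementations.
$scraper = new Scraper($crawler, $parser, $handler); // no dispatcher given, so a new EventDispatcher is created
$scraper->setAsync(false);

// Crawl the URL synchronously; the third argument keeps following next URLs.
$scraper->scrape($scraperEntity, 'http://example.org/listings', true);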
This check compares calls to functions or methods against their respective definitions. If a call passes more arguments than the definition declares, an issue is raised.
If a function is defined several times with different numbers of parameters, the check may pick up the wrong definition and report false positives. One codebase where this is known to happen is WordPress.
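A reduced sketch of the pattern the check flags (the names are hypothetical). The dispatch() call in scrapeAfter() above is a real instance of the same shape:

// One parameter is defined...
function greet($name)
{
    return 'Hello ' . $name;
}

// ...but two arguments are passed, so the check raises an issue here.
// (PHP silently ignores the extra argument at runtime; the check catches it statically.)
greet('Alice', 'Bob');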
In this case you can add the @ignore PhpDoc annotation to the duplicate definition and it will be ignored.
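A sketch of that workaround, assuming a codebase where render() is defined twice with different parameter counts, in files that are never loaded together; the file paths and function are hypothetical:

// legacy/render.php: the duplicate definition the check should skip
/**
 * @ignore
 */
function render($template)
{
    echo $template;
}

// src/render.php: the definition that calls should be checked against
function render($template, array $context)
{
    echo strtr($template, $context);
}

With the annotation in place, calls like render('Hi %name%', ['%name%' => 'Alice']) are matched against the two-parameter definition instead of the legacy one.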