Completed
Push — master ( dbcf6b...d53ab6 )
by Peter
44:13 queued 37:37
created

Scraper   A

Complexity

Total Complexity 25

Size/Duplication

Total Lines 194
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 20

Test Coverage

Coverage 94.37%

Importance

Changes 0
Metric Value
wmc 25
lcom 1
cbo 20
dl 0
loc 194
ccs 67
cts 71
cp 0.9437
rs 10
c 0
b 0
f 0

11 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 2
A getCrawler() 0 4 1
A getEventDispatcher() 0 4 1
A setAsync() 0 4 1
A isAsync() 0 4 1
A scrapeAfter() 0 8 1
A normalizeUrl() 0 4 1
A scrape() 0 33 5
A scrapeNext() 0 10 3
B scrapeItem() 0 21 6
A handleRateLimitException() 0 17 3
1
<?php
2
3
namespace TreeHouse\IoBundle\Scrape;
4
5
use GuzzleHttp\Psr7\Uri;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use TreeHouse\Feeder\Exception\FilterException;
9
use TreeHouse\Feeder\Exception\ModificationException;
10
use TreeHouse\Feeder\Exception\ValidationException;
11
use TreeHouse\IoBundle\Entity\Scraper as ScraperEntity;
12
use TreeHouse\IoBundle\Import\Exception\FailedItemException;
13
use TreeHouse\IoBundle\Scrape\Crawler\CrawlerInterface;
14
use TreeHouse\IoBundle\Scrape\Event\FailedItemEvent;
15
use TreeHouse\IoBundle\Scrape\Event\RateLimitEvent;
16
use TreeHouse\IoBundle\Scrape\Event\ScrapeResponseEvent;
17
use TreeHouse\IoBundle\Scrape\Event\ScrapeUrlEvent;
18
use TreeHouse\IoBundle\Scrape\Event\SkippedItemEvent;
19
use TreeHouse\IoBundle\Scrape\Event\SuccessItemEvent;
20
use TreeHouse\IoBundle\Scrape\Exception\CrawlException;
21
use TreeHouse\IoBundle\Scrape\Exception\RateLimitException;
22
use TreeHouse\IoBundle\Scrape\Exception\UnexpectedResponseException;
23
use TreeHouse\IoBundle\Scrape\Handler\HandlerInterface;
24
use TreeHouse\IoBundle\Scrape\Parser\ParserInterface;
25
26
class Scraper implements ScraperInterface
{
    /**
     * Fetches raw HTML for the urls being scraped.
     *
     * @var CrawlerInterface
     */
    protected $crawler;

    /**
     * Parses a crawled response into a normalized item.
     *
     * @var ParserInterface
     */
    protected $parser;

    /**
     * Persists/handles a successfully parsed item.
     *
     * @var HandlerInterface
     */
    protected $handler;

    /**
     * @var EventDispatcherInterface
     */
    protected $eventDispatcher;

    /**
     * When true, follow-up urls and rate-limit retries are deferred via
     * events instead of being processed synchronously in this process.
     *
     * @var bool
     */
    protected $async = false;

    /**
     * @param CrawlerInterface         $crawler
     * @param ParserInterface          $parser
     * @param HandlerInterface         $handler
     * @param EventDispatcherInterface $dispatcher Optional; a local dispatcher is created when omitted
     */
    public function __construct(CrawlerInterface $crawler, ParserInterface $parser, HandlerInterface $handler, EventDispatcherInterface $dispatcher = null)
    {
        $this->crawler = $crawler;
        $this->parser = $parser;
        $this->handler = $handler;
        $this->eventDispatcher = $dispatcher ?: new EventDispatcher();
    }

    /**
     * @inheritdoc
     */
    public function getCrawler()
    {
        return $this->crawler;
    }

    /**
     * @inheritdoc
     */
    public function getEventDispatcher()
    {
        return $this->eventDispatcher;
    }

    /**
     * @inheritdoc
     */
    public function setAsync($async)
    {
        $this->async = $async;
    }

    /**
     * @inheritdoc
     */
    public function isAsync()
    {
        return $this->async;
    }

    /**
     * @inheritdoc
     *
     * Crawls the given url, scrapes the resulting page into an item and,
     * when $continue is true, recurses into the crawler's next urls.
     *
     * Exceptions are rethrown after dispatching the appropriate event, so
     * the calling command can decide how to proceed.
     */
    public function scrape(ScraperEntity $scraper, $url, $continue = true)
    {
        $url = $this->normalizeUrl($url);

        try {
            $html = $this->crawler->crawl($url);

            // put it in a bag
            $item = new ScrapedItemBag($scraper, $url, $html);

            // scrape the item and the next urls
            $this->scrapeItem($item);

            if ($continue) {
                $this->scrapeNext($scraper);
            }
        } catch (RateLimitException $e) {
            $this->handleRateLimitException($scraper, $url, $e);

            throw $e;
        } catch (UnexpectedResponseException $e) {
            // we didn't get a 200 OK response, let the application know
            $this->eventDispatcher->dispatch(
                ScraperEvents::SCRAPE_URL_NOT_OK,
                new ScrapeResponseEvent($scraper, $url, $e->getResponse())
            );

            throw $e;
        } catch (CrawlException $e) {
            // something bad happened, let the calling command handle this
            throw $e;
        }
    }

    /**
     * Scrapes the urls the crawler discovered on the last crawled page.
     *
     * In async mode each url is dispatched as a deferred scrape event;
     * otherwise it is scraped immediately (which may recurse further).
     *
     * @param ScraperEntity $scraper
     */
    public function scrapeNext(ScraperEntity $scraper)
    {
        foreach ($this->crawler->getNextUrls() as $url) {
            if ($this->async) {
                $this->scrapeAfter($scraper, $url, new \DateTime());
            } else {
                $this->scrape($scraper, $url);
            }
        }
    }

    /**
     * @inheritdoc
     */
    public function scrapeAfter(ScraperEntity $scraper, $url, \DateTime $date)
    {
        // NOTE(review): $date was previously passed as a third argument to
        // dispatch(), which EventDispatcherInterface::dispatch() ignores, so
        // the requested date never reached listeners. The stray argument is
        // removed; the date should probably travel inside ScrapeUrlEvent —
        // TODO confirm against the event class and its listeners.
        $this->eventDispatcher->dispatch(
            ScraperEvents::SCRAPE_NEXT_URL,
            new ScrapeUrlEvent($scraper, $url)
        );
    }

    /**
     * Parses and handles a single scraped item, translating feeder/import
     * exceptions into the corresponding item lifecycle events. No exception
     * escapes this method: a filtered item is reported as skipped, all other
     * known failures as failed.
     *
     * @param ScrapedItemBag $item
     */
    protected function scrapeItem(ScrapedItemBag $item)
    {
        try {
            $this->parser->parse($item);
            $source = $this->handler->handle($item);

            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_SUCCESS, new SuccessItemEvent($this, $item, $source));
        } catch (FilterException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_SKIPPED, new SkippedItemEvent($this, $item, $e->getMessage()));
        } catch (ValidationException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        } catch (FailedItemException $e) {
            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        } catch (ModificationException $e) {
            // the wrapped exception usually carries the more useful message
            if ($e->getPrevious()) {
                $e = $e->getPrevious();
            }

            $this->eventDispatcher->dispatch(ScraperEvents::ITEM_FAILED, new FailedItemEvent($this, $item, $e->getMessage()));
        }
    }

    /**
     * Reacts to a rate limit: in async mode an event is dispatched so the
     * scrape can be rescheduled; in sync mode this process simply sleeps
     * until the advertised retry date (or one minute when none is given).
     *
     * @param ScraperEntity      $scraper
     * @param string             $url
     * @param RateLimitException $e
     */
    protected function handleRateLimitException(ScraperEntity $scraper, $url, RateLimitException $e)
    {
        $date = $e->getRetryDate();

        // dispatch event about rate limit
        if ($this->async) {
            $this->eventDispatcher->dispatch(
                ScraperEvents::RATE_LIMIT_REACHED,
                new RateLimitEvent($scraper, $url, $date)
            );
        } else {
            // if no retry-date is given, sleep for a minute; clamp to zero so
            // a retry date in the past doesn't pass a negative value to sleep()
            $sleepTime = (null !== $date) ? max(0, $date->getTimestamp() - time()) : 60;

            sleep($sleepTime);
        }
    }

    /**
     * Normalizes a url into its canonical string form via PSR-7 Uri.
     *
     * @param string $url
     *
     * @return string
     */
    protected function normalizeUrl($url)
    {
        return (string) new Uri($url);
    }
}
220