Completed
Pull Request — master (#20)
by Matthijs
06:05
created

Downloader::setDownloadLimit()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 5
rs 9.4286
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
3
namespace VDB\Spider\Downloader;
4
5
use VDB\Spider\Downloader\DownloaderInterface;
6
use Symfony\Component\EventDispatcher\Event;
7
use Symfony\Component\EventDispatcher\EventDispatcher;
8
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
9
use Symfony\Component\EventDispatcher\GenericEvent;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
12
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface;
13
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
14
use VDB\Spider\RequestHandler\RequestHandlerInterface;
15
use VDB\Spider\Filter\PostFetchFilterInterface;
16
use VDB\Spider\Resource;
17
use VDB\Spider\Uri\DiscoveredUri;
18
19
/**
 * Downloads discovered URIs and persists the resulting resources.
 *
 * A download runs through three collaborators: a request handler that fetches
 * the URI, a list of post-fetch filters that may reject the fetched resource,
 * and a persistence handler that stores every accepted resource. Progress is
 * announced through SpiderEvents on an event dispatcher.
 *
 * Collaborators that were never injected are created lazily on first access:
 * EventDispatcher, MemoryPersistenceHandler and GuzzleRequestHandler.
 */
class Downloader implements DownloaderInterface
{
    /** @var EventDispatcherInterface|null Lazily created by getDispatcher() */
    private $dispatcher;

    /** @var PersistenceHandlerInterface|null Lazily created by getPersistenceHandler() */
    private $persistenceHandler;

    /** @var RequestHandlerInterface|null Lazily created by getRequestHandler() */
    private $requestHandler;

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    private $downloadLimit = 0;

    /** @var PostFetchFilterInterface[] Filters applied to every fetched resource, in insertion order */
    private $postFetchFilters = array();

    /**
     * Sets the maximum number of resources to download.
     *
     * @param int $downloadLimit Maximum number of resources to download; 0 means no limit
     * @return $this
     */
    public function setDownloadLimit($downloadLimit)
    {
        // isDownLoadLimitExceeded() compares against int 0 with a strict operator,
        // so a non-int zero ("0", null, 0.0) must be normalised here; otherwise it
        // would be treated as a real limit that is exceeded straight away.
        $this->downloadLimit = (int) $downloadLimit;

        return $this;
    }

    /**
     * Returns the configured download limit.
     *
     * PHP resolves method names case-insensitively, so callers that still use
     * the historical spelling `getdownloadLimit()` keep working.
     *
     * @return int Maximum number of resources to download; 0 means no limit
     */
    public function getDownloadLimit()
    {
        return $this->downloadLimit;
    }

    /**
     * Registers a filter that is evaluated against every fetched resource.
     *
     * @param PostFetchFilterInterface $filter
     */
    public function addPostFetchFilter(PostFetchFilterInterface $filter)
    {
        $this->postFetchFilters[] = $filter;
    }

    /**
     * Fetches a URI and hands the resulting resource to the persistence handler.
     *
     * @param DiscoveredUri $uri The URI to download
     * @return false|Resource The persisted resource, or false when fetchResource()
     *                        yielded no resource (request failure or filter match)
     */
    public function download(DiscoveredUri $uri)
    {
        $resource = $this->fetchResource($uri);

        // Nothing to persist when the fetch failed or the resource was filtered out.
        if (!$resource) {
            return false;
        }

        $this->getPersistenceHandler()->persist($resource);

        return $resource;
    }

    /**
     * Tells whether the number of persisted resources has reached the download limit.
     *
     * @return bool false when no limit is set (0); otherwise true once the
     *              persistence handler's count() is at least the limit
     */
    public function isDownLoadLimitExceeded()
    {
        if ($this->downloadLimit === 0) {
            return false;
        }

        return $this->getPersistenceHandler()->count() >= $this->downloadLimit;
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string $eventName
     * @param null|Event $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * Injects the event dispatcher used for all spider events.
     *
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, creating a default EventDispatcher on first use.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Requests a URI and applies the post-fetch filters to the result.
     *
     * Dispatches SPIDER_CRAWL_PRE_REQUEST before the request and
     * SPIDER_CRAWL_POST_REQUEST once afterwards, whether the request succeeded
     * or not. Any \Exception raised inside the guarded section is reported via
     * SPIDER_CRAWL_ERROR_REQUEST and turned into a false return value.
     *
     * @param DiscoveredUri $uri
     * @return Resource|false The fetched resource, or false when an exception
     *                        occurred or a post-fetch filter matched
     */
    protected function fetchResource(DiscoveredUri $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        // POST_REQUEST is dispatched by hand on both paths because this code base
        // does not rely on 'finally'. The flag keeps the event from firing a second
        // time when something after the request (a listener or a post-fetch filter)
        // throws and control lands in the catch block.
        $postRequestDispatched = false;

        try {
            $resource = $this->getRequestHandler()->request($uri);

            $postRequestDispatched = true;
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri)));

            if ($this->matchesPostfetchFilter($resource)) {
                return false;
            }

            return $resource;
        } catch (\Exception $e) {
            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
            );

            if (!$postRequestDispatched) {
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
                    new GenericEvent($this, array('uri' => $uri))
                );
            }

            return false;
        }
    }

    /**
     * Checks the resource against the registered post-fetch filters.
     *
     * Stops at the first matching filter and dispatches
     * SPIDER_CRAWL_FILTER_POSTFETCH with the resource's URI.
     *
     * @param Resource $resource
     * @return bool true when any filter matched, false otherwise
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $filter) {
            if ($filter->match($resource)) {
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
                    new GenericEvent($this, array('uri' => $resource->getUri()))
                );

                return true;
            }
        }

        return false;
    }

    /**
     * Injects the handler that stores downloaded resources.
     *
     * @param PersistenceHandlerInterface $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }

    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler on first use.
     *
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler()
    {
        if (!$this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }

    /**
     * Injects the handler that performs the actual requests.
     *
     * @param RequestHandlerInterface $requestHandler
     */
    public function setRequestHandler(RequestHandlerInterface $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }

    /**
     * Returns the request handler, creating a GuzzleRequestHandler on first use.
     *
     * @return RequestHandlerInterface
     */
    public function getRequestHandler()
    {
        if (!$this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }
}
205