Completed
Push — master ( 88bf51...34820f )
by Matthijs
15:18 queued 08:19
created

Downloader::getPersistenceHandler()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 8
rs 9.4286
cc 2
eloc 4
nc 2
nop 0
1
<?php
2
3
namespace VDB\Spider\Downloader;
4
5
use VDB\Spider\Downloader\DownloaderInterface;
6
use Symfony\Component\EventDispatcher\Event;
7
use Symfony\Component\EventDispatcher\EventDispatcher;
8
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
9
use Symfony\Component\EventDispatcher\GenericEvent;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
12
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface;
13
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
14
use VDB\Spider\RequestHandler\RequestHandlerInterface;
15
use VDB\Spider\Filter\PostFetchFilterInterface;
16
use VDB\Spider\Resource;
17
use VDB\Spider\Uri\DiscoveredUri;
18
19
/**
 * Downloads a discovered URI through a request handler, runs the registered
 * post-fetch filters against the result and persists whatever survives.
 *
 * Every collaborator (event dispatcher, persistence handler, request handler)
 * is created lazily with a default implementation the first time it is needed,
 * unless one was injected through the matching setter beforehand.
 */
class Downloader implements DownloaderInterface
{
    /** @var EventDispatcherInterface */
    private $dispatcher;

    /** @var PersistenceHandlerInterface */
    private $persistenceHandler;

    /** @var RequestHandlerInterface */
    private $requestHandler;

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    private $downloadLimit = 0;

    /** @var PostFetchFilterInterface[] */
    private $postFetchFilters = array();

    /**
     * @param int $downloadLimit Maximum number of resources to download. 0 means no limit
     * @return $this
     */
    public function setDownloadLimit($downloadLimit)
    {
        // isDownLoadLimitExceeded() compares with a strict "!== 0"; normalising
        // here keeps a numeric string such as '0' (e.g. read from config or the
        // command line) meaning "no limit" instead of "limit already reached".
        $this->downloadLimit = (int) $downloadLimit;

        return $this;
    }

    /**
     * @return int Maximum number of resources to download
     */
    public function getdownloadLimit()
    {
        return $this->downloadLimit;
    }

    /**
     * Registers a filter that is evaluated against every fetched resource.
     *
     * @param PostFetchFilterInterface $filter
     */
    public function addPostFetchFilter(PostFetchFilterInterface $filter)
    {
        $this->postFetchFilters[] = $filter;
    }

    /**
     * Fetches the resource behind $uri and hands it to the persistence handler.
     *
     * @param DiscoveredUri $uri
     * @return false|Resource false when the fetch failed or a post-fetch filter matched
     */
    public function download(DiscoveredUri $uri)
    {
        $resource = $this->fetchResource($uri);

        // Nothing to persist when the request failed or the resource was filtered out
        if (!$resource) {
            return false;
        }

        $this->getPersistenceHandler()->persist($resource);

        return $resource;
    }

    /**
     * Tells whether the persistence handler already holds at least as many
     * resources as the configured download limit.
     *
     * @return bool always false when the limit is 0 (no limit)
     */
    public function isDownLoadLimitExceeded()
    {
        if ($this->downloadLimit === 0) {
            return false;
        }

        return $this->getPersistenceHandler()->count() >= $this->downloadLimit;
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string $eventName
     * @param null|Event $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, creating a plain EventDispatcher on first
     * use when none was injected.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Performs the request for $uri and dispatches the crawl events around it.
     *
     * SPIDER_CRAWL_PRE_REQUEST is dispatched before the request and
     * SPIDER_CRAWL_POST_REQUEST exactly once after it, whether it succeeded or
     * not. Any exception raised along the way is reported through
     * SPIDER_CRAWL_ERROR_REQUEST (with the exception message) and turned into
     * a false return value.
     *
     * @param DiscoveredUri $uri
     * @return Resource|false false on error or when a post-fetch filter matched
     */
    protected function fetchResource(DiscoveredUri $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        // Remembers that POST_REQUEST has gone out, so that a failure *after*
        // the request (a listener or a filter throwing) does not make the
        // catch block announce the end of the same request a second time.
        $postRequestDispatched = false;

        try {
            $resource = $this->getRequestHandler()->request($uri);

            $postRequestDispatched = true;
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri)));

            if ($this->matchesPostfetchFilter($resource)) {
                return false;
            }

            return $resource;
        } catch (\Exception $e) {
            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
            );

            if (!$postRequestDispatched) {
                // necessary until we have 'finally'
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
                    new GenericEvent($this, array('uri' => $uri))
                );
            }

            return false;
        }
    }

    /**
     * Checks the resource against the registered post-fetch filters, stopping
     * at the first one that matches and dispatching
     * SPIDER_CRAWL_FILTER_POSTFETCH for it.
     *
     * @param Resource $resource
     * @return bool true when a filter matched
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $filter) {
            if ($filter->match($resource)) {
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
                    new GenericEvent($this, array('uri' => $resource->getUri()))
                );

                return true;
            }
        }

        return false;
    }

    /**
     * @param PersistenceHandlerInterface $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }

    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler on
     * first use when none was injected.
     *
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler()
    {
        if (!$this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }

    /**
     * @param RequestHandlerInterface $requestHandler
     */
    public function setRequestHandler(RequestHandlerInterface $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }

    /**
     * Returns the request handler, creating a GuzzleRequestHandler on first
     * use when none was injected.
     *
     * @return RequestHandlerInterface
     */
    public function getRequestHandler()
    {
        if (!$this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }
}
206