1 | <?php |
||
19 | class Downloader implements DownloaderInterface |
||
20 | { |
||
21 | /** @var EventDispatcherInterface */ |
||
22 | private $dispatcher; |
||
23 | |||
24 | /** @var PersistenceHandlerInterface */ |
||
25 | private $persistenceHandler; |
||
26 | |||
27 | /** @var RequestHandlerInterface */ |
||
28 | private $requestHandler; |
||
29 | |||
30 | /** @var int the maximum number of downloaded resources. 0 means no limit */ |
||
31 | private $downloadLimit = 0; |
||
32 | |||
33 | /** @var PostFetchFilterInterface[] */ |
||
34 | private $postFetchFilters = array(); |
||
35 | |||
/**
 * Sets the maximum number of resources that may be downloaded.
 *
 * The value is normalised to an integer: isDownLoadLimitExceeded() compares
 * the limit strictly against the integer 0 to detect "no limit", so a value
 * such as '0' or null would otherwise be treated as an already-exceeded limit.
 *
 * @param int $downloadLimit Maximum number of resources to download. 0 means no limit
 * @return $this
 */
public function setDownloadLimit($downloadLimit)
{
    $this->downloadLimit = (int) $downloadLimit;

    return $this;
}
||
45 | |||
/**
 * Returns the currently configured download cap.
 *
 * @return int Maximum number of resources to download. 0 means no limit
 */
public function getDownloadLimit()
{
    return $this->downloadLimit;
}
||
53 | |||
54 | /** |
||
55 | * @param PostFetchFilterInterface $filter |
||
56 | */ |
||
57 | public function addPostFetchFilter(PostFetchFilterInterface $filter) |
||
61 | |||
/**
 * Fetches the resource behind the given URI, runs it through the post-fetch
 * filters and hands it to the persistence handler.
 *
 * @param DiscoveredUri $uri
 * @return false|Resource The persisted resource, or false when the fetch
 *                        produced nothing or a post-fetch filter matched
 */
public function download(DiscoveredUri $uri)
{
    $resource = $this->fetchResource($uri);

    // Short-circuit keeps the filters from ever seeing a failed fetch.
    if (!$resource || $this->matchesPostfetchFilter($resource)) {
        return false;
    }

    $this->getPersistenceHandler()->persist($resource);

    return $resource;
}
||
81 | |||
/**
 * Reports whether the persistence handler's count() has reached the
 * configured download limit. A limit of 0 disables the check.
 *
 * @return bool
 */
public function isDownLoadLimitExceeded()
{
    $limit = $this->getDownloadLimit();

    // 0 means "no limit", so the limit can never be exceeded.
    if ($limit === 0) {
        return false;
    }

    return $this->getPersistenceHandler()->count() >= $limit;
}
||
86 | |||
87 | /** |
||
88 | * A shortcut for EventDispatcher::dispatch() |
||
89 | * |
||
90 | * @param string $eventName |
||
91 | * @param null|Event $event |
||
92 | */ |
||
93 | private function dispatch($eventName, Event $event = null) |
||
97 | |||
/**
 * Injects the event dispatcher used for the events this downloader emits.
 *
 * @param EventDispatcherInterface $eventDispatcher
 * @return $this
 */
public function setDispatcher(EventDispatcherInterface $eventDispatcher)
{
    $this->dispatcher = $eventDispatcher;

    return $this;
}
||
108 | |||
/**
 * Returns the event dispatcher, creating a default EventDispatcher on first
 * use when none has been set.
 *
 * @return EventDispatcherInterface
 */
public function getDispatcher()
{
    if ($this->dispatcher) {
        return $this->dispatcher;
    }

    return $this->dispatcher = new EventDispatcher();
}
||
119 | |||
120 | |||
/**
 * Requests the given URI through the request handler, dispatching the
 * pre-request, error (on exception) and post-request spider events.
 *
 * @param DiscoveredUri $uri
 * @return Resource|false The request handler's result, or false when the
 *                        request threw an exception
 */
protected function fetchResource(DiscoveredUri $uri)
{
    $context = array('uri' => $uri);

    $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, $context));

    try {
        return $this->getRequestHandler()->request($uri);
    } catch (\Exception $e) {
        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
            new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
        );

        return false;
    } finally {
        // Runs before either return above completes, so the post-request
        // event is dispatched on success and on failure alike.
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, $context));
    }
}
||
144 | |||
/**
 * Checks the resource against the registered post-fetch filters, stopping at
 * the first one that matches and dispatching a filter event for it.
 *
 * @param Resource $resource
 * @return bool True when any post-fetch filter matched the resource
 */
private function matchesPostfetchFilter(Resource $resource)
{
    foreach ($this->postFetchFilters as $postFetchFilter) {
        if (!$postFetchFilter->match($resource)) {
            continue;
        }

        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
            new GenericEvent($this, array('uri' => $resource->getUri()))
        );

        return true;
    }

    return false;
}
||
162 | |||
163 | /** |
||
164 | * @param PersistenceHandlerInterface $persistenceHandler |
||
165 | */ |
||
166 | public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler) |
||
170 | |||
/**
 * Returns the persistence handler, falling back to a new
 * MemoryPersistenceHandler on first use when none has been set.
 *
 * @return PersistenceHandlerInterface
 */
public function getPersistenceHandler()
{
    if ($this->persistenceHandler) {
        return $this->persistenceHandler;
    }

    return $this->persistenceHandler = new MemoryPersistenceHandler();
}
||
182 | |||
183 | /** |
||
184 | * @param RequestHandlerInterface $requestHandler |
||
185 | */ |
||
186 | public function setRequestHandler(RequestHandlerInterface $requestHandler) |
||
190 | |||
191 | /** |
||
192 | * @return RequestHandlerInterface |
||
193 | */ |
||
194 | public function getRequestHandler() |
||
195 | { |
||
196 | if (!$this->requestHandler) { |
||
197 | $this->requestHandler = new GuzzleRequestHandler(); |
||
202 | } |
||
203 |