Checks if the types of the passed arguments in a function/method call are compatible.
1 | <?php |
||
2 | namespace VDB\Spider; |
||
3 | |||
4 | use Symfony\Component\EventDispatcher\Event; |
||
5 | use Symfony\Component\EventDispatcher\EventDispatcher; |
||
6 | use Symfony\Component\EventDispatcher\EventDispatcherInterface; |
||
7 | use Symfony\Component\EventDispatcher\GenericEvent; |
||
8 | use VDB\Spider\Discoverer\DiscovererSet; |
||
9 | use VDB\Spider\Event\SpiderEvents; |
||
10 | use VDB\Spider\Exception\QueueException; |
||
11 | use VDB\Spider\QueueManager\QueueManagerInterface; |
||
12 | use VDB\Spider\QueueManager\InMemoryQueueManager; |
||
13 | use VDB\Spider\Uri\DiscoveredUri; |
||
14 | use VDB\Spider\Downloader\DownloaderInterface; |
||
15 | use VDB\Spider\Downloader\Downloader; |
||
16 | use VDB\Uri\UriInterface; |
||
17 | use VDB\Uri\Uri; |
||
18 | |||
/**
 * Crawls a website starting from a single seed URI.
 *
 * The spider repeatedly takes a URI from its queue manager, fetches it through
 * its downloader, and hands the downloaded resource to its discoverer set; every
 * URI the discoverers return is pushed back onto the queue. Collaborators that
 * were not injected through a setter are created lazily with a default
 * implementation the first time they are requested.
 */
class Spider
{
    /** @var DownloaderInterface|null lazily created by getDownloader() */
    private $downloader;

    /** @var QueueManagerInterface|null lazily created by getQueueManager() */
    private $queueManager;

    /** @var EventDispatcherInterface|null lazily created by getDispatcher() */
    private $dispatcher;

    /** @var DiscovererSet|null lazily created by getDiscovererSet() */
    private $discovererSet;

    /** @var DiscoveredUri The URI of the site to spider; always set by the constructor */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /**
     * @param string      $seed     the URI to start crawling
     * @param string|null $spiderId identifier for this crawl; when null, one is
     *                              derived from the seed and the current time
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);

        if (null !== $spiderId) {
            $this->spiderId = $spiderId;
        } else {
            $this->spiderId = md5($seed . microtime(true));
        }

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() === 'cli') {
            // Ticks are what give PHP a chance to invoke the pcntl signal handlers below.
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                pcntl_signal(SIGTERM, array($this, 'handleSignal'));
                pcntl_signal(SIGINT, array($this, 'handleSignal'));
                pcntl_signal(SIGHUP, array($this, 'handleSignal'));
                pcntl_signal(SIGQUIT, array($this, 'handleSignal'));
            }
        }
    }

    /**
     * Starts crawling the URI provided on instantiation.
     *
     * Queues the seed URI, passes this spider's id to the downloader's
     * persistence handler, and then runs the crawl loop.
     *
     * @return void
     */
    public function crawl()
    {
        $this->getQueueManager()->addUri($this->seed);
        $this->getDownloader()->getPersistenceHandler()->setSpiderId($this->spiderId);

        $this->doCrawl();
    }

    /**
     * @param DiscovererSet $discovererSet
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;
    }

    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first use
     * when none was injected.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (!$this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }

    /**
     * @param QueueManagerInterface $queueManager
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;
    }

    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first use
     * when none was injected.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if (!$this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }

    /**
     * @param DownloaderInterface $downloader
     * @return $this
     */
    public function setDownloader(DownloaderInterface $downloader)
    {
        $this->downloader = $downloader;

        return $this;
    }

    /**
     * Returns the downloader, creating a default Downloader on first use when
     * none was injected.
     *
     * @return DownloaderInterface
     */
    public function getDownloader()
    {
        if (!$this->downloader) {
            $this->downloader = new Downloader();
        }

        return $this->downloader;
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, creating a default EventDispatcher on first
     * use when none was injected.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Signal handler registered by the constructor when running on the CLI with
     * the pcntl extension available.
     *
     * Dispatches SpiderEvents::SPIDER_CRAWL_USER_STOPPED for termination-style
     * signals so listeners get a chance to clean up.
     *
     * @param int $signal the signal number that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        // SIGKILL is deliberately absent: it can never be caught by a handler and
        // the constructor does not register one for it.
        // SIGHUP is registered in the constructor but has no case here, so it is
        // received and ignored. NOTE(review): confirm that ignoring SIGHUP is intended.
        switch ($signal) {
            case SIGTERM:
            case SIGINT:
            case SIGQUIT:
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
                break;
        }
    }

    /**
     * Function that crawls each provided URI
     * It applies all processors and listeners set on the Spider
     *
     * URIs are processed in whatever order the queue manager's next() returns
     * them, so this is either a depth first algorithm as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we don't do it recursively, but iteratively,
     * results will be in a different order from the example, because
     * we always take the right-most child first, whereas a recursive
     * variant would always take the left-most child first
     *
     * or
     *
     * a breadth first algorithm
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            // Stop the whole crawl as soon as the downloader reports its limit is hit.
            if ($this->getDownloader()->isDownLoadLimitExceeded()) {
                break;
            }

            // A falsy download result means there is nothing to process for this URI.
            if (!$resource = $this->getDownloader()->download($currentUri)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            // Once the document is downloaded, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                try {
                    $this->getQueueManager()->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string     $eventName
     * @param null|Event $event
     * @return void
     */
    private function dispatch($eventName, Event $event = null)
    {
        // NOTE(review): static analysis flagged the argument types of this call as
        // possibly incompatible with the dispatcher's dispatch() signature. The
        // (name, event) order matches older Symfony EventDispatcher releases -
        // confirm against the installed version before changing it.
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * Wraps the seed URI string in a DiscoveredUri found at depth 0.
     *
     * @param string $uri
     * @return void
     */
    private function setSeed($uri)
    {
        $this->seed = new DiscoveredUri(new Uri($uri));
        $this->seed->setDepthFound(0);
    }
}
||
242 |