1 | <?php |
||||
2 | namespace VDB\Spider; |
||||
3 | |||||
4 | use Symfony\Component\EventDispatcher\Event; |
||||
5 | use Symfony\Component\EventDispatcher\EventDispatcher; |
||||
6 | use Symfony\Component\EventDispatcher\EventDispatcherInterface; |
||||
7 | use Symfony\Component\EventDispatcher\GenericEvent; |
||||
8 | use VDB\Spider\Discoverer\DiscovererSet; |
||||
9 | use VDB\Spider\Event\SpiderEvents; |
||||
10 | use VDB\Spider\Exception\QueueException; |
||||
11 | use VDB\Spider\QueueManager\QueueManagerInterface; |
||||
12 | use VDB\Spider\QueueManager\InMemoryQueueManager; |
||||
13 | use VDB\Spider\Uri\DiscoveredUri; |
||||
14 | use VDB\Spider\Downloader\DownloaderInterface; |
||||
15 | use VDB\Spider\Downloader\Downloader; |
||||
16 | use VDB\Uri\UriInterface; |
||||
17 | use VDB\Uri\Uri; |
||||
18 | |||||
/**
 * Crawls a site starting from a single seed URI.
 *
 * The spider pulls URIs from a queue manager, downloads each one through a
 * downloader, announces every successfully downloaded resource through an
 * event dispatcher and feeds the URIs found by the discoverer set back into
 * the queue. Every collaborator is created lazily with a default
 * implementation unless one was injected through the matching setter.
 */
class Spider
{
    /** @var DownloaderInterface|null Created on demand by getDownloader() */
    private $downloader;

    /** @var QueueManagerInterface|null Created on demand by getQueueManager() */
    private $queueManager;

    /** @var EventDispatcherInterface|null Created on demand by getDispatcher() */
    private $dispatcher;

    /** @var DiscovererSet|null Created on demand by getDiscovererSet() */
    private $discovererSet;

    /** @var DiscoveredUri The URI of the site to spider, always set by the constructor */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /**
     * @param string $seed the URI to start crawling
     * @param string|null $spiderId id for this instance; when null an id is
     *                              derived from the seed and the current microtime
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);
        if (null !== $spiderId) {
            $this->spiderId = $spiderId;
        } else {
            $this->spiderId = md5($seed . microtime(true));
        }

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() === 'cli') {
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                pcntl_signal(SIGTERM, array($this, 'handleSignal'));
                pcntl_signal(SIGINT, array($this, 'handleSignal'));
                pcntl_signal(SIGHUP, array($this, 'handleSignal'));
                pcntl_signal(SIGQUIT, array($this, 'handleSignal'));
            }
        }
    }

    /**
     * Starts crawling the URI provided on instantiation
     *
     * The seed is queued first, then the spider id is handed to the
     * downloader's persistence handler before the crawl loop runs.
     *
     * @return void
     */
    public function crawl()
    {
        $this->getQueueManager()->addUri($this->seed);
        $this->getDownloader()->getPersistenceHandler()->setSpiderId($this->spiderId);

        $this->doCrawl();
    }

    /**
     * @param DiscovererSet $discovererSet
     * @return $this
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;

        return $this;
    }

    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first use.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (!$this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }

    /**
     * @param QueueManagerInterface $queueManager
     * @return $this
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;

        return $this;
    }

    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first use.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if (!$this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }

    /**
     * @param DownloaderInterface $downloader
     * @return $this
     */
    public function setDownloader(DownloaderInterface $downloader)
    {
        $this->downloader = $downloader;

        return $this;
    }

    /**
     * Returns the downloader, creating a default Downloader on first use.
     *
     * @return DownloaderInterface
     */
    public function getDownloader()
    {
        if (!$this->downloader) {
            $this->downloader = new Downloader();
        }

        return $this->downloader;
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, creating an EventDispatcher on first use.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Signal handler registered by the constructor when running on the CLI
     * with the pcntl extension available.
     *
     * Dispatches SpiderEvents::SPIDER_CRAWL_USER_STOPPED for the listed
     * signals; this method does not stop the crawl loop itself.
     *
     * @param int $signal the signal number that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            // No handler is registered for SIGKILL in the constructor; the case
            // is kept only for callers that invoke this method directly.
            case SIGKILL:
            case SIGINT:
            // The constructor registers this handler for SIGHUP as well, so it
            // has to be handled here or a hangup would be silently swallowed.
            case SIGHUP:
            case SIGQUIT:
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
                break;
        }
    }

    /**
     * Function that crawls each provided URI
     * It applies all processors and listeners set on the Spider
     *
     * URIs are taken from the queue manager one at a time, so the traversal
     * order is whatever the queue manager's next() yields. This is either a
     * depth-first algorithm as explained here:
     * https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we do it iteratively rather than recursively,
     * results will be in a different order from the example, because
     * we always take the right-most child first, whereas a recursive
     * variant would always take the left-most child first
     *
     * or
     *
     * a breadth-first algorithm.
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            // Stop the whole crawl as soon as the downloader reports its limit is hit
            if ($this->getDownloader()->isDownLoadLimitExceeded()) {
                break;
            }

            // A falsy download result means there is nothing to process for this URI
            if (!$resource = $this->getDownloader()->download($currentUri)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            // Once the document is downloaded, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                try {
                    $this->getQueueManager()->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * The argument order of dispatch() differs between dispatcher generations:
     * dispatchers implementing the Symfony contracts interface expect the event
     * object first and the event name second, older ones expect the name first.
     * The order is chosen from the dispatcher actually in use.
     *
     * The $event parameter is intentionally not type-hinted, so that any event
     * object accepted by the installed dispatcher can be passed through.
     *
     * @param string $eventName
     * @param object|null $event event to dispatch; when null and the dispatcher
     *                           requires an object, a GenericEvent with this
     *                           spider as subject is used
     * @return void
     */
    private function dispatch($eventName, $event = null)
    {
        $dispatcher = $this->getDispatcher();

        // instanceof never triggers autoloading, so this is safe even when the
        // contracts package is not installed (the check is then simply false).
        if ($dispatcher instanceof \Symfony\Contracts\EventDispatcher\EventDispatcherInterface) {
            if (null === $event) {
                $event = new GenericEvent($this);
            }
            $dispatcher->dispatch($event, $eventName);

            return;
        }

        // Legacy signature: dispatch($eventName, Event $event = null)
        $dispatcher->dispatch($eventName, $event);
    }

    /**
     * Wraps the given URI string in a DiscoveredUri found at depth 0 and
     * stores it as the seed.
     *
     * @param string $uri
     * @return void
     */
    private function setSeed($uri)
    {
        $this->seed = new DiscoveredUri(new Uri($uri));
        $this->seed->setDepthFound(0);
    }
}
||||
242 |
This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress. Please note the @ignore annotation hint above.