1 | <?php |
||
23 | class Spider |
||
24 | { |
||
25 | /** @var DownloaderInterface */ |
||
26 | private $downloader; |
||
27 | |||
28 | /** @var QueueManagerInterface */ |
||
29 | private $queueManager; |
||
30 | |||
31 | /** @var EventDispatcherInterface */ |
||
32 | private $dispatcher; |
||
33 | |||
34 | /** @var DiscovererSet */ |
||
35 | private $discovererSet; |
||
36 | |||
37 | /** @var DiscoveredUri The URI of the site to spider */ |
||
38 | private $seed = array(); |
||
39 | |||
40 | /** @var string the unique id of this spider instance */ |
||
41 | private $spiderId; |
||
42 | |||
43 | /** |
||
44 | * @param string $seed the URI to start crawling |
||
45 | * @param string|null $spiderId |
||
46 | */ |
||
47 | public function __construct($seed, $spiderId = null) |
||
67 | |||
68 | /** |
||
69 | * Starts crawling the URI provided on instantiation |
||
70 | * |
||
71 | * @return void |
||
72 | */ |
||
73 | public function crawl() |
||
80 | |||
81 | /** |
||
82 | * @param DiscovererSet $discovererSet |
||
83 | */ |
||
84 | public function setDiscovererSet(DiscovererSet $discovererSet) |
||
88 | |||
89 | /** |
||
90 | * @return DiscovererSet |
||
91 | */ |
||
92 | public function getDiscovererSet() |
||
100 | |||
101 | /** |
||
102 | * @param QueueManagerInterface $queueManager |
||
103 | */ |
||
104 | public function setQueueManager(QueueManagerInterface $queueManager) |
||
108 | |||
109 | /** |
||
110 | * @return QueueManagerInterface |
||
111 | */ |
||
112 | public function getQueueManager() |
||
120 | |||
121 | /** |
||
122 | * @param DownloaderInterface $downloader |
||
123 | * @return $this |
||
124 | */ |
||
125 | public function setDownloader(DownloaderInterface $downloader) |
||
131 | |||
132 | /** |
||
133 | * @return DownloaderInterface |
||
134 | */ |
||
135 | public function getDownloader() |
||
142 | |||
143 | /** |
||
144 | * @param EventDispatcherInterface $eventDispatcher |
||
145 | * @return $this |
||
146 | */ |
||
147 | public function setDispatcher(EventDispatcherInterface $eventDispatcher) |
||
153 | |||
154 | /** |
||
155 | * @return EventDispatcherInterface |
||
156 | */ |
||
157 | public function getDispatcher() |
||
164 | |||
165 | public function handleSignal($signal) |
||
175 | |||
176 | /** |
||
177 | * Function that crawls each provided URI |
||
178 | * It applies all processors and listeners set on the Spider |
||
179 | * |
||
180 | * This is either a depth-first algorithm, as explained here: |
||
181 | * https://en.wikipedia.org/wiki/Depth-first_search#Example |
||
182 | * Note that because we don't do it recursively, but iteratively, |
||
183 | * results will be in a different order from the example, because |
||
184 | * we always take the right-most child first, whereas a recursive |
||
185 | * variant would always take the left-most child first |
||
186 | * |
||
187 | * or |
||
188 | * |
||
189 | * a breadth-first algorithm |
||
190 | * |
||
191 | * @return void |
||
192 | */ |
||
193 | private function doCrawl() |
||
222 | |||
223 | /** |
||
224 | * A shortcut for EventDispatcher::dispatch() |
||
225 | * |
||
226 | * @param string $eventName |
||
227 | * @param null|Event $event |
||
228 | */ |
||
229 | private function dispatch($eventName, Event $event = null) |
||
233 | |||
234 | /** |
||
235 | * @param string $uri |
||
236 | */ |
||
237 | private function setSeed($uri) |
||
242 | } |
||
243 |