Completed
Pull Request — master (#20)
by Matthijs
05:25 queued 30s
created

Spider::addPostFetchFilter()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 5
Bugs 0 Features 1
Metric Value
c 5
b 0
f 1
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererSet;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\Exception\QueueException;
12
use VDB\Spider\QueueManager\QueueManagerInterface;
13
use VDB\Spider\QueueManager\InMemoryQueueManager;
14
use VDB\Spider\Uri\DiscoveredUri;
15
use VDB\Spider\Downloader\DownloaderInterface;
16
use VDB\Spider\Downloader\Downloader;
17
use VDB\Uri\UriInterface;
18
use VDB\Uri\Uri;
19
20
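/**
 * Crawls a site starting from a seed URI: each queued URI is downloaded, the
 * discoverers are run on the result and the URIs they return are queued in turn.
 */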
23
class Spider
24
{
25
    /** @var DownloaderInterface|null Created lazily by getDownloader() when none was injected */
    private $downloader;

    /** @var QueueManagerInterface|null Created lazily by getQueueManager() when none was injected */
    private $queueManager;

    /** @var EventDispatcherInterface|null Created lazily by getDispatcher() when none was injected */
    private $dispatcher;

    /** @var DiscovererSet|null Created lazily by getDiscovererSet() when none was injected */
    private $discovererSet;

    /** @var DiscoveredUri The URI of the site to spider; assigned by the constructor through setSeed() */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;
42
43
    /**
     * Stores the seed and the spider id and, when running under the CLI SAPI
     * with pcntl_signal() available, registers handleSignal() for SIGTERM,
     * SIGINT, SIGHUP and SIGQUIT.
     *
     * @param string $seed the URI to start crawling
     * @param string|null $spiderId id for this instance; when null, an md5 hash
     *                              of the seed and the current microtime is used
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);
        $this->spiderId = (null === $spiderId)
            ? md5($seed . microtime(true))
            : $spiderId;

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if ('cli' == php_sapi_name()) {
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                $handler = array($this, 'handleSignal');
                foreach (array(SIGTERM, SIGINT, SIGHUP, SIGQUIT) as $signal) {
                    pcntl_signal($signal, $handler);
                }
            }
        }
    }
67
68
    /**
     * Queues the seed URI, passes this spider's id to the downloader's
     * persistence handler and then runs the crawl loop.
     *
     * @return void
     */
    public function crawl()
    {
        $queueManager = $this->getQueueManager();
        $queueManager->addUri($this->seed);

        $persistenceHandler = $this->getDownloader()->getPersistenceHandler();
        $persistenceHandler->setSpiderId($this->spiderId);

        $this->doCrawl();
    }
80
81
    /**
     * Replaces the discoverer set that is run on every downloaded resource.
     *
     * @param DiscovererSet $discovererSet
     * @return $this
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;

        return $this;
    }
88
89
    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first use
     * when none has been set.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if ($this->discovererSet) {
            return $this->discovererSet;
        }

        return $this->discovererSet = new DiscovererSet();
    }
100
101
    /**
     * Replaces the queue manager that supplies the URIs to crawl.
     *
     * @param QueueManagerInterface $queueManager
     * @return $this
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;

        return $this;
    }
108
109
    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first use
     * when none has been set.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if ($this->queueManager) {
            return $this->queueManager;
        }

        return $this->queueManager = new InMemoryQueueManager();
    }
120
121
    /**
     * Replaces the downloader used to fetch the queued URIs.
     *
     * @param DownloaderInterface $downloader
     * @return $this
     */
    public function setDownloader(DownloaderInterface $downloader)
    {
        $this->downloader = $downloader;
        return $this;
    }
131
132
    /**
     * Returns the downloader, creating a default Downloader on first use when
     * none has been set.
     *
     * @return DownloaderInterface
     */
    public function getDownloader()
    {
        if ($this->downloader) {
            return $this->downloader;
        }

        return $this->downloader = new Downloader();
    }
142
143
    /**
     * Replaces the event dispatcher this spider sends its events to.
     *
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;
        return $this;
    }
153
154
    /**
     * Returns the event dispatcher, creating an EventDispatcher on first use
     * when none has been set.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if ($this->dispatcher) {
            return $this->dispatcher;
        }

        return $this->dispatcher = new EventDispatcher();
    }
164
165
    /**
     * Signal handler that the constructor registers through pcntl_signal().
     *
     * Dispatches SpiderEvents::SPIDER_CRAWL_USER_STOPPED for each of the
     * signals listed below; any other signal number is ignored.
     *
     * @param int $signal the number of the signal that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            case SIGINT:
            case SIGHUP: // the constructor registers this handler for SIGHUP, so it has to be acted on too
            case SIGQUIT:
            case SIGKILL: // never delivered to a handler; kept only for code calling this method directly
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
                break;
        }
    }
175
176
    /**
     * Crawls every URI the queue manager hands out.
     *
     * For each URI the resource is downloaded, a
     * SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED event is dispatched and the
     * discoverers are run on the resource; the URIs they return are queued.
     * The loop ends when the queue manager has no next URI or when the
     * downloader reports that its download limit is exceeded.
     *
     * The traversal is either a depth first algorithm, as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we do it iteratively rather than recursively,
     * results will be in a different order from the example, because
     * we always take the right-most child first, whereas a recursive
     * variant would always take the left-most child first;
     *
     * or a breadth first algorithm (which of the two applies is presumably
     * decided by the queue manager — confirm against its implementation).
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            // Go through the lazy getter instead of the raw property, like the rest of the class
            $downloader = $this->getDownloader();

            if ($downloader->isDownLoadLimitExceeded()) {
                break;
            }

            // NOTE(review): static analysis reports that download() is not declared on
            // DownloaderInterface — confirm the interface exposes it
            if (!$resource = $downloader->download($currentUri)) {
                // Nothing was downloaded for this URI, move on to the next one
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            // Once the document is enqueued, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                try {
                    $this->getQueueManager()->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }
222
223
    /**
     * Forwards an event to the dispatcher returned by getDispatcher().
     *
     * @param string $eventName
     * @param null|Event $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $dispatcher = $this->getDispatcher();
        $dispatcher->dispatch($eventName, $event);
    }
233
234
    /**
     * Wraps the given URI string in a DiscoveredUri found at depth 0 and
     * stores it as the seed.
     *
     * @param string $uri
     */
    private function setSeed($uri)
    {
        $seed = new DiscoveredUri(new Uri($uri));
        $seed->setDepthFound(0);

        $this->seed = $seed;
    }
242
}
243