Completed
Push — master ( 88bf51...34820f )
by Matthijs
15:18 queued 08:19
created

Spider::setPersistenceHandler()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
c 2
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererSet;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\Exception\QueueException;
12
use VDB\Spider\QueueManager\QueueManagerInterface;
13
use VDB\Spider\QueueManager\InMemoryQueueManager;
14
use VDB\Spider\Uri\DiscoveredUri;
15
use VDB\Spider\Downloader\DownloaderInterface;
16
use VDB\Spider\Downloader\Downloader;
17
use VDB\Uri\UriInterface;
18
use VDB\Uri\Uri;
19
20
/**
 * Crawls a site starting from a single seed URI.
 *
 * The spider pulls URIs from a queue manager, hands each one to a downloader,
 * dispatches an event for every resource that was downloaded and feeds the
 * URIs found by the discoverer set back into the queue.
 *
 * Every collaborator (queue manager, downloader, event dispatcher, discoverer
 * set) is created lazily with a default implementation unless one was injected
 * through the matching setter.
 */
class Spider
{
    /** @var DownloaderInterface */
    private $downloader;

    /** @var QueueManagerInterface */
    private $queueManager;

    /** @var EventDispatcherInterface */
    private $dispatcher;

    /** @var DiscovererSet */
    private $discovererSet;

    /** @var DiscoveredUri The URI of the site to spider; always set by the constructor */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /**
     * @param string $seed the URI to start crawling
     * @param string|null $spiderId optional id for this instance; when omitted
     *                              an id is derived from the seed and the current time
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);

        if (null !== $spiderId) {
            $this->spiderId = $spiderId;
        } else {
            $this->spiderId = md5($seed . microtime(true));
        }

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() === 'cli') {
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                // Every signal registered here must have a matching case in
                // handleSignal(): installing a handler replaces the default
                // action, so an unhandled signal would be silently ignored.
                pcntl_signal(SIGTERM, array($this, 'handleSignal'));
                pcntl_signal(SIGINT, array($this, 'handleSignal'));
                pcntl_signal(SIGHUP, array($this, 'handleSignal'));
                pcntl_signal(SIGQUIT, array($this, 'handleSignal'));
            }
        }
    }

    /**
     * Starts crawling the URI provided on instantiation
     *
     * Queues the seed, passes this spider's id to the downloader's persistence
     * handler and then runs the crawl loop until the queue is exhausted or the
     * download limit is reached.
     *
     * @return void
     */
    public function crawl()
    {
        $this->getQueueManager()->addUri($this->seed);
        $this->getDownloader()->getPersistenceHandler()->setSpiderId($this->spiderId);

        $this->doCrawl();
    }

    /**
     * @param DiscovererSet $discovererSet
     * @return $this
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;

        return $this;
    }

    /**
     * Returns the discoverer set, creating an empty default one on first use.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (!$this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }

    /**
     * @param QueueManagerInterface $queueManager
     * @return $this
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;

        return $this;
    }

    /**
     * Returns the queue manager, falling back to an InMemoryQueueManager.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if (!$this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }

    /**
     * @param DownloaderInterface $downloader
     * @return $this
     */
    public function setDownloader(DownloaderInterface $downloader)
    {
        $this->downloader = $downloader;

        return $this;
    }

    /**
     * Returns the downloader, falling back to the default Downloader.
     *
     * @return DownloaderInterface
     */
    public function getDownloader()
    {
        if (!$this->downloader) {
            $this->downloader = new Downloader();
        }

        return $this->downloader;
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, falling back to a new EventDispatcher.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Signal handler installed by the constructor when running on the CLI.
     *
     * Dispatches SpiderEvents::SPIDER_CRAWL_USER_STOPPED for every termination
     * signal so that listeners can clean up. Any other signal is ignored.
     *
     * @param int $signal the signal number that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            // SIGKILL can never be delivered to a handler; the case is kept
            // only for callers that invoke this method directly.
            case SIGKILL:
            case SIGINT:
            // SIGHUP is registered in the constructor, so it has to be handled
            // here as well; otherwise it would be swallowed without any effect.
            case SIGHUP:
            case SIGQUIT:
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
        }
    }

    /**
     * Function that crawls each provided URI
     * It applies all processors and listeners set on the Spider
     *
     * This is either a depth-first algorithm as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we don't do it recursively, but iteratively,
     * results will be in a different order from the example, because
     * we always take the right-most child first, whereas a recursive
     * variant would always take the left-most child first
     *
     * or
     *
     * a breadth-first algorithm
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            if ($this->getDownloader()->isDownLoadLimitExceeded()) {
                break;
            }

            // A falsy result means there is nothing to process for this URI
            if (!$resource = $this->getDownloader()->download($currentUri)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            // Once the resource has been downloaded, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                try {
                    $this->getQueueManager()->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string $eventName
     * @param null|Event $event
     * @return void
     */
    private function dispatch($eventName, Event $event = null)
    {
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * Wraps the given URI string in a DiscoveredUri found at depth 0 and
     * stores it as the seed of the crawl.
     *
     * @param string $uri
     * @return void
     */
    private function setSeed($uri)
    {
        $this->seed = new DiscoveredUri(new Uri($uri));
        $this->seed->setDepthFound(0);
    }
}
243