Completed
Pull Request — master (#16)
by Matthijs
08:37
created

Spider::getDiscovererSet()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 8
rs 9.4286
cc 2
eloc 4
nc 2
nop 0
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererSet;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\Exception\QueueException;
12
use VDB\Spider\Filter\PostFetchFilterInterface;
13
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
14
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface;
15
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
16
use VDB\Spider\RequestHandler\RequestHandlerInterface;
17
use VDB\Spider\QueueManager\QueueManagerInterface;
18
use VDB\Spider\QueueManager\InMemoryQueueManager;
19
use VDB\Spider\Uri\FilterableUri;
20
use VDB\Uri\UriInterface;
21
use VDB\Uri\Uri;
22
23
/**
24
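 * Crawls from a seed URI: each queued URI is fetched, the resulting resource is persisted,
 * and the URIs discovered in that resource are added to the queue.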
25
 */
26
class Spider
27
{
28
    /** @var RequestHandlerInterface|null handler used to request URIs; created lazily by getRequestHandler() */
    private $requestHandler;

    /** @var PersistenceHandlerInterface|null receives fetched resources; created lazily by getPersistenceHandler() */
    private $persistenceHandler;

    /** @var QueueManagerInterface|null supplies the next URI to crawl; created lazily by getQueueManager() */
    private $queueManager;

    /** @var EventDispatcherInterface|null created lazily by getDispatcher() */
    private $dispatcher;

    /** @var DiscovererSet|null created lazily by getDiscovererSet() */
    private $discovererSet;

    /** @var PostFetchFilterInterface[] filters applied to every fetched resource */
    private $postFetchFilters = array();

    /**
     * @var FilterableUri The URI of the site to spider.
     *
     * Always assigned by setSeed() from the constructor, so it carries no default value
     * (an array default would contradict the documented type).
     */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    public $downloadLimit = 0;
54
55
    /**
56
     * @param string $seed the URI to start crawling
57
     * @param string|null $spiderId unique id for this spider instance; one is generated when null
0 ignored issues
show
Documentation introduced by
Should the type for parameter $spiderId not be string|null?

This check looks for @param annotations where the type inferred by our type inference engine differs from the declared type.

It makes a suggestion as to what type it considers more descriptive.

Most often this is a case of a parameter that can be null in addition to its declared types.

Loading history...
58
     */
59
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);

        // Without an explicit id, derive one from the seed and the current time.
        $this->spiderId = (null === $spiderId) ? md5($seed . microtime(true)) : $spiderId;

        // On the command line, route termination signals to handleSignal() so the
        // spider can react to them; this is skipped when pcntl is not available.
        if (php_sapi_name() == 'cli') {
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                foreach (array(SIGTERM, SIGINT, SIGHUP, SIGQUIT) as $signalNumber) {
                    pcntl_signal($signalNumber, array($this, 'handleSignal'));
                }
            }
        }
    }
79
80
    /**
81
     * Starts crawling the URI provided on instantiation
82
     *
83
     * @return void
0 ignored issues
show
Documentation introduced by
Should the return type not be array|null?

This check compares the return type specified in the @return annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.

Loading history...
84
     */
85
    public function crawl()
    {
        // Put the seed URI on the queue before anything else happens.
        $queue = $this->getQueueManager();
        $queue->addUri($this->seed);

        // Tell the persistence handler which spider instance it is storing for.
        $storage = $this->getPersistenceHandler();
        $storage->setSpiderId($this->spiderId);

        // Nothing is returned from here: doCrawl() works through the queue.
        $this->doCrawl();
    }
92
93
    /**
     * Appends a filter to the list that is checked against every fetched resource.
     *
     * @param PostFetchFilterInterface $filter
     */
    public function addPostFetchFilter(PostFetchFilterInterface $filter)
    {
        $this->postFetchFilters[] = $filter;
    }
100
101
    /**
     * Replaces the handler that is used to request URIs.
     *
     * @param RequestHandlerInterface $requestHandler
     */
    public function setRequestHandler(RequestHandlerInterface $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }
108
109
    /**
     * Returns the request handler, creating a GuzzleRequestHandler on first use
     * when none has been set.
     *
     * @return RequestHandlerInterface
     */
    public function getRequestHandler()
    {
        if ($this->requestHandler) {
            return $this->requestHandler;
        }

        $this->requestHandler = new GuzzleRequestHandler();

        return $this->requestHandler;
    }
120
121
    /**
     * Replaces the set of discoverers applied to fetched resources.
     *
     * @param DiscovererSet $discovererSet
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;
    }
128
129
    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first use
     * when none has been set.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if ($this->discovererSet) {
            return $this->discovererSet;
        }

        $this->discovererSet = new DiscovererSet();

        return $this->discovererSet;
    }
140
141
    /**
     * Replaces the queue manager that supplies the URIs to crawl.
     *
     * @param QueueManagerInterface $queueManager
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;
    }
148
149
    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first use
     * when none has been set.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if ($this->queueManager) {
            return $this->queueManager;
        }

        $this->queueManager = new InMemoryQueueManager();

        return $this->queueManager;
    }
160
161
    /**
     * Replaces the handler that receives fetched resources.
     *
     * @param PersistenceHandlerInterface $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }
168
169
    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler on first use
     * when none has been set.
     *
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler()
    {
        if ($this->persistenceHandler) {
            return $this->persistenceHandler;
        }

        $this->persistenceHandler = new MemoryPersistenceHandler();

        return $this->persistenceHandler;
    }
180
181
    /**
     * Replaces the event dispatcher through which the spider's events are sent.
     *
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this to allow call chaining
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }
191
192
    /**
     * Returns the event dispatcher, creating an EventDispatcher on first use
     * when none has been set.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if ($this->dispatcher) {
            return $this->dispatcher;
        }

        $this->dispatcher = new EventDispatcher();

        return $this->dispatcher;
    }
202
203
    /**
     * Callback that the constructor registers with pcntl_signal() (CLI with pcntl only).
     *
     * Dispatches SPIDER_CRAWL_USER_STOPPED for SIGTERM, SIGKILL, SIGINT and SIGQUIT.
     * Any other value does nothing here; that includes SIGHUP, which the constructor
     * also registers.
     *
     * NOTE(review): SIGKILL is never registered by the constructor; confirm whether
     * it needs to be listed.
     *
     * @param int $signal signal number (presumably as passed in by pcntl; TODO confirm)
     */
    public function handleSignal($signal)
    {
        // Loose comparisons, evaluated left to right.
        $isStopSignal = $signal == SIGTERM
            || $signal == SIGKILL
            || $signal == SIGINT
            || $signal == SIGQUIT;

        if ($isStopSignal) {
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
        }
    }
213
214
    /**
     * Checks a fetched resource against the registered post-fetch filters, in order.
     *
     * The first filter that matches causes SPIDER_CRAWL_FILTER_POSTFETCH to be dispatched
     * with the resource's URI, and no further filters are consulted.
     *
     * @param Resource $resource
     * @return bool true when a filter matched, false when none did
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $postFetchFilter) {
            if (!$postFetchFilter->match($resource)) {
                continue;
            }

            $filteredEvent = new GenericEvent($this, array('uri' => $resource->getUri()));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH, $filteredEvent);

            return true;
        }

        return false;
    }
231
232
    /**
     * Tells whether the configured download limit has been reached.
     *
     * @return bool true when $downloadLimit is not 0 and the persistence handler's
     *              count() is at least $downloadLimit
     */
    private function isDownLoadLimitExceeded()
    {
        // A limit of exactly 0 (strict comparison) switches the check off.
        if ($this->downloadLimit === 0) {
            return false;
        }

        return $this->getPersistenceHandler()->count() >= $this->downloadLimit;
    }
239
    /**
     * Crawls the URIs handed out by the queue manager, one per iteration.
     *
     * Each URI is fetched; a successfully fetched resource is persisted, a
     * SPIDER_CRAWL_RESOURCE_PERSISTED event is dispatched, and the URIs discovered
     * in the resource are added to the queue.
     *
     * The traversal order is whatever the queue manager's next() yields. This is either
     * a depth-first algorithm as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we do it iteratively rather than recursively, results will be
     * in a different order from the example: we always take the right-most child first,
     * whereas a recursive variant would always take the left-most child first.
     *
     * Or it is a breadth-first algorithm.
     *
     * @return void
     */
    private function doCrawl()
    {
        while (true) {
            $currentUri = $this->getQueueManager()->next();

            // Stop when the queue has nothing left or the download limit is reached.
            if (!$currentUri || $this->isDownLoadLimitExceeded()) {
                break;
            }

            // Fetch the document; a falsy result means this URI is skipped.
            $resource = $this->fetchResource($currentUri);
            if (!$resource) {
                continue;
            }

            $this->getPersistenceHandler()->persist($resource);

            $persistedEvent = new GenericEvent($this, array('uri' => $currentUri));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED, $persistedEvent);

            // With the document persisted, run the discoverers to find more links to follow.
            foreach ($this->getDiscovererSet()->discover($resource) as $discoveredUri) {
                try {
                    $this->getQueueManager()->addUri($discoveredUri);
                } catch (QueueException $e) {
                    // Per the original author: thrown when the queue size is exceeded,
                    // at which point the remaining discovered URIs are dropped.
                    break;
                }
            }
        }
    }
288
289
    /**
290
     * @param UriInterface $uri
291
     * @return Resource|false the fetched resource, or false when a post-fetch filter matched or the request failed
0 ignored issues
show
Documentation introduced by
Should the return type not be false|resource?

This check compares the return type specified in the @return annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.

Loading history...
292
     */
293
    protected function fetchResource(UriInterface $uri)
294
    {
        // Requests $uri through the request handler and reports progress via events:
        // SPIDER_CRAWL_PRE_REQUEST before the request, SPIDER_CRAWL_POST_REQUEST after it
        // (on the success path and in the catch block), and SPIDER_CRAWL_ERROR_REQUEST when
        // an exception is caught. Returns the resource, or false when a post-fetch filter
        // matched or an exception was caught.
295
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));
296
297
        try {
298
            $resource = $this->getRequestHandler()->request($uri);
299
300
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'
301
302
            // A resource rejected by a post-fetch filter is reported to the caller as false.
            if ($this->matchesPostfetchFilter($resource)) {
0 ignored issues
show
Documentation introduced by
$resource is of type resource, but the function expects a object<VDB\Spider\Resource>.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
303
                return false;
304
            }
305
306
            return $resource;
307
        } catch (\Exception $e) {
            // Catches any exception thrown inside the try block, not only a failing request,
            // and reports it together with the exception message.
308
            $this->dispatch(
309
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
310
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
311
            );
312
313
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'
314
315
            return false;
316
        }
317
    }
318
319
    /**
320
     * A shortcut for EventDispatcher::dispatch()
321
     *
322
     * @param string $eventName
0 ignored issues
show
Documentation introduced by
Should the type for parameter $event not be null|Event?

This check looks for @param annotations where the type inferred by our type inference engine differs from the declared type.

It makes a suggestion as to what type it considers more descriptive.

Most often this is a case of a parameter that can be null in addition to its declared types.

Loading history...
323
     * @param Event|null $event optional event object passed on to the dispatcher
324
     */
325
    private function dispatch($eventName, Event $event = null)
    {
        // Going through getDispatcher() creates a default EventDispatcher on first use.
        $eventDispatcher = $this->getDispatcher();
        $eventDispatcher->dispatch($eventName, $event);
    }
329
330
    /**
     * Wraps the given URI string in a FilterableUri, stores it as the seed,
     * and sets its found-depth to 0.
     *
     * @param string $uri
     */
    private function setSeed($uri)
    {
        $parsedUri = new Uri($uri);

        $this->seed = new FilterableUri($parsedUri);
        $this->seed->setDepthFound(0);
    }
338
}
339