Completed
Pull Request — master (#16)
by Matthijs
05:12
created

Spider::setDispatcher()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 0 Features 1
Metric Value
c 4
b 0
f 1
dl 0
loc 6
rs 9.4286
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererSet;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\Exception\QueueException;
12
use VDB\Spider\Filter\PostFetchFilterInterface;
13
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
14
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface;
15
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
16
use VDB\Spider\RequestHandler\RequestHandlerInterface;
17
use VDB\Spider\QueueManager\QueueManagerInterface;
18
use VDB\Spider\QueueManager\InMemoryQueueManager;
19
use VDB\Spider\Uri\FilterableUri;
20
use VDB\Uri\UriInterface;
21
22
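/**
 * Crawls a site starting from a seed URI.
 *
 * URIs are taken from a queue manager, fetched through a request handler,
 * stored through a persistence handler and scanned by a discoverer set for
 * further URIs to enqueue. Progress is reported through an event dispatcher.
 * Each collaborator can be injected through its setter; a default
 * implementation is created on first use otherwise.
 */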
25
class Spider
26
{
27
    /** @var RequestHandlerInterface|null Fetches resources; created on first use by getRequestHandler() */
    private $requestHandler;

    /** @var PersistenceHandlerInterface|null Stores fetched resources; created on first use by getPersistenceHandler() */
    private $persistenceHandler;

    /** @var QueueManagerInterface|null Holds the URIs still to visit; created on first use by getQueueManager() */
    private $queueManager;

    /** @var EventDispatcherInterface|null Receives the spider events; created on first use by getDispatcher() */
    private $dispatcher;

    /** @var DiscovererSet|null Finds new URIs in fetched resources; created on first use by getDiscovererSet() */
    private $discovererSet;

    /** @var PostFetchFilterInterface[] Filters checked against every fetched resource */
    private $postFetchFilters = array();

    /** @var FilterableUri The URI of the site to spider; assigned by setSeed() during construction */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    public $downloadLimit = 0;
53
54
    /**
     * Creates a spider for the given seed URI.
     *
     * When no spider id is supplied, one is derived from the seed and the
     * current microtime. When running under the CLI SAPI with the pcntl
     * functions available, handleSignal() is registered for SIGTERM, SIGINT,
     * SIGHUP and SIGQUIT.
     *
     * @param string $seed the URI to start crawling
     * @param string|null $spiderId unique id of this spider instance; generated when null
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);
        $this->spiderId = (null !== $spiderId) ? $spiderId : md5($seed . microtime(true));

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() == 'cli') {
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                $signalHandler = array($this, 'handleSignal');
                foreach (array(SIGTERM, SIGINT, SIGHUP, SIGQUIT) as $signalNumber) {
                    pcntl_signal($signalNumber, $signalHandler);
                }
            }
        }
    }
78
79
    /**
     * Starts crawling the URI provided on instantiation.
     *
     * The seed is added to the queue manager and the spider id is handed to
     * the persistence handler before the crawl loop runs.
     *
     * @return void
     */
    public function crawl()
    {
        $queueManager = $this->getQueueManager();
        $queueManager->addUri($this->seed);

        $persistenceHandler = $this->getPersistenceHandler();
        $persistenceHandler->setSpiderId($this->spiderId);

        $this->doCrawl();
    }
91
92
    /**
     * Appends a filter to the list checked against every fetched resource.
     *
     * @param PostFetchFilterInterface $filter
     * @return void
     */
    public function addPostFetchFilter(PostFetchFilterInterface $filter)
    {
        $this->postFetchFilters[] = $filter;
    }
99
100
    /**
     * Sets the handler used to fetch resources, replacing any handler held so far.
     *
     * Returns the spider itself so that setter calls can be chained, in line
     * with setDispatcher().
     *
     * @param RequestHandlerInterface $requestHandler
     * @return $this
     */
    public function setRequestHandler(RequestHandlerInterface $requestHandler)
    {
        $this->requestHandler = $requestHandler;

        return $this;
    }
107
108
    /**
     * Returns the request handler, creating a GuzzleRequestHandler on first
     * use when none has been set.
     *
     * @return RequestHandlerInterface
     */
    public function getRequestHandler()
    {
        if (null === $this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }
119
120
    /**
     * Sets the discoverer set used to find new URIs, replacing any set held so far.
     *
     * Returns the spider itself so that setter calls can be chained, in line
     * with setDispatcher().
     *
     * @param DiscovererSet $discovererSet
     * @return $this
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;

        return $this;
    }
127
128
    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first
     * use when none has been set.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (null === $this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }
139
140
    /**
     * Sets the queue manager that holds the URIs to visit, replacing any
     * manager held so far.
     *
     * Returns the spider itself so that setter calls can be chained, in line
     * with setDispatcher().
     *
     * @param QueueManagerInterface $queueManager
     * @return $this
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;

        return $this;
    }
147
148
    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first
     * use when none has been set.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if (null === $this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }
159
160
    /**
     * Sets the handler used to persist fetched resources, replacing any
     * handler held so far.
     *
     * Returns the spider itself so that setter calls can be chained, in line
     * with setDispatcher().
     *
     * @param PersistenceHandlerInterface $persistenceHandler
     * @return $this
     */
    public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;

        return $this;
    }
167
168
    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler on
     * first use when none has been set.
     *
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler()
    {
        if (null === $this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }
179
180
    /**
     * Sets the event dispatcher that receives the spider events, replacing
     * any dispatcher held so far.
     *
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this the spider itself, so calls can be chained
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }
190
191
    /**
     * Returns the event dispatcher, creating an EventDispatcher on first use
     * when none has been set.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (null === $this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }
201
202
    /**
     * Signal callback: dispatches SPIDER_CRAWL_USER_STOPPED when one of the
     * stop signals arrives. Any other signal is ignored.
     *
     * NOTE(review): the constructor registers this handler for SIGHUP, which
     * is not in the list below, while SIGKILL is listed but is not registered
     * there. Confirm whether SIGHUP was meant instead of SIGKILL.
     *
     * @param int $signal the signal number passed in by pcntl
     * @return void
     */
    public function handleSignal($signal)
    {
        // Loose comparison on purpose: it mirrors switch/case semantics.
        $stopSignals = array(SIGTERM, SIGKILL, SIGINT, SIGQUIT);

        if (in_array($signal, $stopSignals)) {
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
        }
    }
212
213
    /**
     * Checks the resource against the post-fetch filters in the order they
     * were added. On the first match a SPIDER_CRAWL_FILTER_POSTFETCH event
     * carrying the resource URI is dispatched and the remaining filters are
     * skipped.
     *
     * @param \VDB\Spider\Resource $resource
     * @return bool true when a filter matched, false when none did
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $postFetchFilter) {
            if (!$postFetchFilter->match($resource)) {
                continue;
            }

            $filteredEvent = new GenericEvent($this, array('uri' => $resource->getUri()));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH, $filteredEvent);

            return true;
        }

        return false;
    }
230
231
    /**
     * Tells whether the configured download limit has been reached.
     *
     * @return bool true when a limit is set and the persistence handler's
     *              count() is at least that limit; false otherwise
     */
    private function isDownLoadLimitExceeded()
    {
        // $downloadLimit is a public, untyped property. Normalise it so that
        // "no limit" values such as '0', 0.0 or null are not mistaken for a
        // real limit, which would end the crawl on the first URI.
        $limit = (int) $this->downloadLimit;

        // The persistence handler is only consulted when a limit is set.
        return 0 !== $limit && $this->getPersistenceHandler()->count() >= $limit;
    }
238
    /**
     * Crawls every URI handed out by the queue manager, applying the
     * discoverers and listeners set on the Spider.
     *
     * The visiting order follows whatever the queue manager's next() returns.
     * This is either a depth first algorithm, as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we do it iteratively rather than recursively,
     * results will be in a different order from the example: we always take
     * the right-most child first, whereas a recursive variant would always
     * take the left-most child first.
     *
     * Or it is a breadth first algorithm.
     *
     * The loop ends when the queue is empty or the download limit is reached.
     *
     * @return void
     */
    private function doCrawl()
    {
        while (true) {
            $currentUri = $this->getQueueManager()->next();

            // Stop on an exhausted queue, or once enough resources were stored
            if (!$currentUri || $this->isDownLoadLimitExceeded()) {
                break;
            }

            // Fetch the document; skip URIs that failed or were filtered out
            $resource = $this->fetchResource($currentUri);
            if (!$resource) {
                continue;
            }

            $this->getPersistenceHandler()->persist($resource);

            $persistedEvent = new GenericEvent($this, array('uri' => $currentUri));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED, $persistedEvent);

            // Once the document is persisted, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $discoveredUri) {
                try {
                    $this->getQueueManager()->addUri($discoveredUri);
                } catch (QueueException $queueException) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }
287
288
    /**
     * Requests a single URI through the request handler.
     *
     * SPIDER_CRAWL_PRE_REQUEST is dispatched before the request and
     * SPIDER_CRAWL_POST_REQUEST after it, whether it succeeded or failed.
     * A failure additionally dispatches SPIDER_CRAWL_ERROR_REQUEST with the
     * exception message.
     *
     * NOTE(review): the catch also covers exceptions thrown by the
     * POST_REQUEST listeners and by the post-fetch filters, in which case
     * POST_REQUEST is dispatched a second time. Confirm this is intended.
     *
     * @param UriInterface $uri
     * @return \VDB\Spider\Resource|false the fetched resource, or false when
     *         the request failed or a post-fetch filter matched
     */
    protected function fetchResource(UriInterface $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        try {
            $fetched = $this->getRequestHandler()->request($uri);

            // necessary until we have 'finally'
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri)));

            return $this->matchesPostfetchFilter($fetched) ? false : $fetched;
        } catch (\Exception $exception) {
            $errorDetails = array('uri' => $uri, 'message' => $exception->getMessage());
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST, new GenericEvent($this, $errorDetails));

            // necessary until we have 'finally'
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri)));

            return false;
        }
    }
317
318
    /**
     * A shortcut for EventDispatcher::dispatch() on the spider's dispatcher.
     *
     * @param string $eventName
     * @param Event|null $event passed through unchanged; may be omitted
     * @return void
     */
    private function dispatch($eventName, Event $event = null)
    {
        $eventDispatcher = $this->getDispatcher();
        $eventDispatcher->dispatch($eventName, $event);
    }
328
329
    /**
     * Wraps the seed URI in a FilterableUri found at depth 0 and stores it
     * as the crawl starting point.
     *
     * @param string $uri
     * @return void
     */
    private function setSeed($uri)
    {
        $seedUri = new FilterableUri($uri);
        $seedUri->setDepthFound(0);

        $this->seed = $seedUri;
    }
337
}
338