Passed
Branch feature/cleanup (2bd333)
by Matthijs
06:03
created

Spider::handleSignal()   B

Complexity

Conditions 5
Paths 5

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 1 Features 1
Metric Value
c 4
b 1
f 1
dl 0
loc 10
rs 8.8571
cc 5
eloc 7
nc 5
nop 1
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererSet;
10
use VDB\Spider\Event\SpiderEvents;
11
use VDB\Spider\Exception\QueueException;
12
use VDB\Spider\Filter\PostFetchFilter;
13
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
14
use VDB\Spider\PersistenceHandler\PersistenceHandler;
15
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
16
use VDB\Spider\RequestHandler\RequestHandler;
17
use VDB\Spider\QueueManager\QueueManager;
18
use VDB\Spider\QueueManager\InMemoryQueueManager;
19
use VDB\Spider\Uri\FilterableUri;
20
use VDB\Uri\UriInterface;
21
22
/**
23
 *
24
 */
25
class Spider
26
{
27
    /** @var RequestHandler|null handler used to request URIs; created lazily when none was injected */
    private $requestHandler;

    /** @var PersistenceHandler|null handler that stores fetched resources; created lazily when none was injected */
    private $persistenceHandler;

    /** @var QueueManager|null queue of URIs that still have to be crawled; created lazily when none was injected */
    private $queueManager;

    /** @var EventDispatcherInterface|null dispatcher for the spider events; created lazily when none was injected */
    private $dispatcher;

    /** @var DiscovererSet|null discoverers applied to each fetched resource; created lazily when none was injected */
    private $discovererSet;

    /** @var PostFetchFilter[] filters that are matched against each fetched resource */
    private $postFetchFilters = array();

    /**
     * Deliberately has no array default: the value is always a FilterableUri,
     * assigned by setSeed() from the constructor.
     *
     * @var FilterableUri|null The URI of the site to spider
     */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /** @var array the list of already visited URIs with the depth they were discovered on as value */
    private $alreadySeenUris = array();

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    public $downloadLimit = 0;
56
57
    /**
58
     * @param string $seed the URI to start crawling
59
     * @param string|null $spiderId the unique id of this spider instance; generated when null
0 ignored issues
show
Documentation introduced by
Should the type for parameter $spiderId not be string|null?

This check looks for @param annotations where the type inferred by our type inference engine differs from the declared type.

It makes a suggestion as to what type it considers more descriptive.

Most often this is a case of a parameter that can be null in addition to its declared types.

Loading history...
60
     */
61
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);

        // Without an explicit id, derive one from the seed and the current time
        $this->spiderId = (null === $spiderId) ? md5($seed . microtime(true)) : $spiderId;

        // Signal handling lets the spider stop gracefully and do cleanup;
        // it is only set up when running on the command line.
        if ('cli' !== php_sapi_name()) {
            return;
        }

        declare(ticks = 1);

        // The SIG* constants below are only touched once pcntl is known to be available
        if (!function_exists('pcntl_signal')) {
            return;
        }

        $signalHandler = array($this, 'handleSignal');
        foreach (array(SIGTERM, SIGINT, SIGHUP, SIGQUIT) as $signalNumber) {
            pcntl_signal($signalNumber, $signalHandler);
        }
    }
81
82
    /**
83
     * Starts crawling the URI provided on instantiation
84
     *
85
     * @return void
0 ignored issues
show
Documentation introduced by
Should the return type not be array|null?

This check compares the return type specified in the @return annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.

Loading history...
86
     */
87
    public function crawl()
    {
        // Put the seed URI on the queue so the crawl loop has a starting point
        $queueManager = $this->getQueueManager();
        $queueManager->addUri($this->seed);

        // Tell the persistence handler which spider instance it is storing for
        $persistenceHandler = $this->getPersistenceHandler();
        $persistenceHandler->setSpiderId($this->spiderId);

        // Nothing is returned; results are available through the persistence handler
        $this->doCrawl();
    }
94
95
    /**
     * Registers a filter that is matched against every resource after it has been fetched.
     *
     * Filters are kept in the order in which they were added.
     *
     * @param PostFetchFilter $filter
     */
    public function addPostFetchFilter(PostFetchFilter $filter)
    {
        $this->postFetchFilters[] = $filter;
    }
102
103
    /**
     * Injects the handler that is used to request URIs.
     *
     * @param RequestHandler $requestHandler
     */
    public function setRequestHandler(RequestHandler $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }
110
111
    /**
     * Returns the request handler, creating a GuzzleRequestHandler on first
     * use when none was injected.
     *
     * @return RequestHandler
     */
    public function getRequestHandler()
    {
        if (null === $this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }
122
123
    /**
     * Injects the set of discoverers that is applied to fetched resources.
     *
     * @param DiscovererSet $discovererSet
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;
    }
130
131
    /**
     * Returns the discoverer set, creating an empty DiscovererSet on first
     * use when none was injected.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (null === $this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }
142
143
    /**
     * Injects the queue manager that holds the URIs still to be crawled.
     *
     * @param QueueManager $queueManager
     */
    public function setQueueManager(QueueManager $queueManager)
    {
        $this->queueManager = $queueManager;
    }
150
151
    /**
     * Returns the queue manager, creating an InMemoryQueueManager on first
     * use when none was injected.
     *
     * @return QueueManager
     */
    public function getQueueManager()
    {
        if (null === $this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }
162
163
    /**
     * Injects the handler that stores the fetched resources.
     *
     * @param PersistenceHandler $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandler $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }
170
171
    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler on
     * first use when none was injected.
     *
     * @return PersistenceHandler
     */
    public function getPersistenceHandler()
    {
        if (null === $this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }
182
183
    /**
     * Injects the event dispatcher through which the spider events are sent.
     *
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this the spider itself, so calls can be chained
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }
193
194
    /**
     * Returns the event dispatcher, creating an EventDispatcher on first use
     * when none was injected.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (null === $this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }
204
205
    /**
     * Signal handler that announces a user-initiated stop of the crawl.
     *
     * The constructor registers this method for SIGTERM, SIGINT, SIGHUP and
     * SIGQUIT. Every one of those signals is answered with a
     * SPIDER_CRAWL_USER_STOPPED event so listeners can do their cleanup.
     * SIGHUP is included here because a registered handler that ignores it
     * would swallow the signal without notifying anyone. SIGKILL is not
     * listed: it cannot be caught by a handler.
     *
     * @param int $signal the number of the signal that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            case SIGINT:
            case SIGHUP:
            case SIGQUIT:
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
                break;
        }
    }
215
216
    /**
     * Checks the resource against the registered post fetch filters.
     *
     * The filters are tried in registration order. The first one that matches
     * triggers a SPIDER_CRAWL_FILTER_POSTFETCH event carrying the resource's
     * URI, and no further filters are consulted.
     *
     * @param Resource $resource
     * @return bool true when a filter matched, false when none did
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $postFetchFilter) {
            if (!$postFetchFilter->match($resource)) {
                continue;
            }

            $filteredEvent = new GenericEvent($this, array('uri' => $resource->getUri()));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH, $filteredEvent);

            return true;
        }

        return false;
    }
233
234
    /**
     * Tells whether the number of persisted resources has reached the download limit.
     *
     * A limit of exactly 0 means "no limit", in which case the persistence
     * handler is not consulted at all.
     *
     * @return bool
     */
    private function isDownLoadLimitExceeded()
    {
        return $this->downloadLimit !== 0
            && $this->getPersistenceHandler()->count() >= $this->downloadLimit;
    }
241
    /**
     * Crawls every URI handed out by the queue manager until the queue is
     * exhausted or the download limit is reached.
     *
     * Each fetched resource is persisted, announced through
     * SPIDER_CRAWL_RESOURCE_PERSISTED, and then handed to the discoverers to
     * find more URIs to enqueue.
     *
     * The traversal is either a depth first algorithm, as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * or a breadth first algorithm. Because the depth first variant is done
     * iteratively rather than recursively, results come out in a different
     * order from that example: the right-most child is always taken first,
     * whereas a recursive variant would always take the left-most child first.
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            if ($this->isDownLoadLimitExceeded()) {
                break;
            }

            // A falsy result means the request failed or a post fetch filter matched
            $resource = $this->fetchResource($currentUri);
            if (!$resource) {
                continue;
            }

            $this->getPersistenceHandler()->persist($resource);

            $persistedEvent = new GenericEvent($this, array('uri' => $currentUri));
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED, $persistedEvent);

            // Resources whose children would exceed the maximum depth are stored but not searched for links
            $depthOfChildren = $resource->depthFound + 1;
            if ($depthOfChildren > $this->getQueueManager()->maxDepth) {
                continue;
            }

            // The resource is persisted; now look for more links to follow
            foreach ($this->getDiscovererSet()->discover($resource) as $discoveredUri) {
                // Never enqueue a URI twice
                if (array_key_exists($discoveredUri->toString(), $this->alreadySeenUris)) {
                    continue;
                }

                try {
                    $this->getQueueManager()->addUri($discoveredUri);
                } catch (QueueException $queueIsFull) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }

                // filtered or queued: mark as seen.
                // toString() is deliberately called again after addUri(), as in the original flow.
                $this->alreadySeenUris[$discoveredUri->toString()] = $resource->depthFound + 1;
            }
        }
    }
304
305
    /**
     * Requests a single URI and reports the outcome through spider events.
     *
     * SPIDER_CRAWL_PRE_REQUEST is dispatched before the request and
     * SPIDER_CRAWL_POST_REQUEST after it, on success as well as on failure.
     * Any exception raised inside the try block (by the request itself, but
     * also by a listener or a post fetch filter) is reported through
     * SPIDER_CRAWL_ERROR_REQUEST with the exception message.
     *
     * @param UriInterface $uri
     * @return bool|Resource the fetched resource, or false when the request
     *                       failed or a post fetch filter matched
     */
    protected function fetchResource(UriInterface $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        try {
            $resource = $this->getRequestHandler()->request($uri);

            // A URI that was never recorded as seen has no known depth. Treat it as
            // depth 0 instead of reading a missing array key, which raises a notice
            // and leaves the depth null.
            $uriKey = $uri->toString();
            $resource->depthFound = isset($this->alreadySeenUris[$uriKey])
                ? $this->alreadySeenUris[$uriKey]
                : 0;

            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'

            if ($this->matchesPostfetchFilter($resource)) {
                return false;
            }

            return $resource;
        } catch (\Exception $e) {
            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
            );

            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'

            return false;
        }
    }
335
336
    /**
337
     * A shortcut for EventDispatcher::dispatch()
338
     *
339
     * @param string $eventName
0 ignored issues
show
Documentation introduced by
Should the type for parameter $event not be null|Event?

This check looks for @param annotations where the type inferred by our type inference engine differs from the declared type.

It makes a suggestion as to what type it considers more descriptive.

Most often this is a case of a parameter that can be null in addition to its declared types.

Loading history...
340
     * @param Event|null $event
341
     */
342
    private function dispatch($eventName, Event $event = null)
    {
        // Goes through getDispatcher() so a default dispatcher is created when none was injected
        $eventDispatcher = $this->getDispatcher();
        $eventDispatcher->dispatch($eventName, $event);
    }
346
347
    /**
     * Stores the seed URI and records it as already seen at depth 0.
     *
     * @param string $uri the URI to start crawling from
     */
    private function setSeed($uri)
    {
        $seedUri = new FilterableUri($uri);
        $this->seed = $seedUri;

        // The seed is keyed by its normalized string form
        $seedKey = $seedUri->normalize()->toString();
        $this->alreadySeenUris[$seedKey] = 0;
    }
355
}
356