Completed
Pull Request — master (#16)
by Matthijs
02:29
created

Spider::matchesPrefetchFilter()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 1
Metric Value
c 2
b 0
f 1
dl 0
loc 9
rs 9.6667
cc 3
eloc 5
nc 3
nop 1
1
<?php
2
namespace VDB\Spider;
3
4
use Exception;
5
use Symfony\Component\EventDispatcher\Event;
6
use Symfony\Component\EventDispatcher\EventDispatcher;
7
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
8
use Symfony\Component\EventDispatcher\GenericEvent;
9
use VDB\Spider\Discoverer\DiscovererInterface;
10
use VDB\Spider\Discoverer\DiscovererSet;
11
use VDB\Spider\Event\SpiderEvents;
12
use VDB\Spider\Exception\QueueException;
13
use VDB\Spider\Filter\PostFetchFilter;
14
use VDB\Spider\Filter\PreFetchFilter;
15
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler;
16
use VDB\Spider\PersistenceHandler\PersistenceHandler;
17
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
18
use VDB\Spider\RequestHandler\RequestHandler;
19
use VDB\Spider\QueueManager\QueueManager;
20
use VDB\Spider\QueueManager\InMemoryQueueManager;
21
use VDB\Spider\Uri\FilterableUri;
22
use VDB\Uri\UriInterface;
23
24
/**
 * Crawls a site starting from a seed URI.
 *
 * The spider takes URIs from a QueueManager, fetches them through a
 * RequestHandler, stores the resulting resources in a PersistenceHandler and
 * feeds the URIs found by the DiscovererSet back into the queue. Progress is
 * reported through events dispatched on an EventDispatcherInterface.
 *
 * Every collaborator is created lazily with an in-memory/default
 * implementation when none has been injected through its setter.
 */
class Spider
{
    /** @var RequestHandler */
    private $requestHandler;

    /** @var PersistenceHandler */
    private $persistenceHandler;

    /** @var QueueManager */
    private $queueManager;

    /** @var EventDispatcherInterface */
    private $dispatcher;

    /** @var DiscovererSet */
    private $discovererSet;

    /** @var PostFetchFilter[] */
    private $postFetchFilters = array();

    /** @var FilterableUri The URI of the site to spider; always set by the constructor */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /** @var array the list of already visited URIs with the depth they were discovered on as value */
    private $alreadySeenUris = array();

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    public $downloadLimit = 0;

    /**
     * @param string $seed the URI to start crawling
     * @param string $spiderId optional id; when null an id is derived from the seed and the current time
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);
        if (null !== $spiderId) {
            $this->spiderId = $spiderId;
        } else {
            $this->spiderId = md5($seed . microtime(true));
        }

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() === 'cli') {
            declare(ticks = 1);
            // pcntl is an optional extension; without it no handlers are installed
            if (function_exists('pcntl_signal')) {
                pcntl_signal(SIGTERM, array($this, 'handleSignal'));
                pcntl_signal(SIGINT, array($this, 'handleSignal'));
                pcntl_signal(SIGHUP, array($this, 'handleSignal'));
                pcntl_signal(SIGQUIT, array($this, 'handleSignal'));
            }
        }
    }

    /**
     * Starts crawling the URI provided on instantiation
     *
     * Queues the seed, hands the spider id to the persistence handler and
     * then runs the crawl loop until it ends.
     *
     * @return void
     */
    public function crawl()
    {
        $this->getQueueManager()->addUri($this->seed);
        $this->getPersistenceHandler()->setSpiderId($this->spiderId);

        $this->doCrawl();
    }

    /**
     * Registers a filter that is applied to every resource after it was fetched.
     *
     * @param PostFetchFilter $filter
     */
    public function addPostFetchFilter(PostFetchFilter $filter)
    {
        $this->postFetchFilters[] = $filter;
    }

    /**
     * @param RequestHandler $requestHandler
     */
    public function setRequestHandler(RequestHandler $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }

    /**
     * Returns the request handler, creating a GuzzleRequestHandler when none was set.
     *
     * @return RequestHandler
     */
    public function getRequestHandler()
    {
        if (!$this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }

    /**
     * @param DiscovererSet $discovererSet
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;
    }

    /**
     * Returns the discoverer set, creating an empty DiscovererSet when none was set.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (!$this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }

    /**
     * @param QueueManager $queueManager
     */
    public function setQueueManager(QueueManager $queueManager)
    {
        $this->queueManager = $queueManager;
    }

    /**
     * Returns the queue manager, creating an InMemoryQueueManager when none was set.
     *
     * @return QueueManager
     */
    public function getQueueManager()
    {
        if (!$this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }

    /**
     * @param PersistenceHandler $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandler $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }

    /**
     * Returns the persistence handler, creating a MemoryPersistenceHandler when none was set.
     *
     * @return PersistenceHandler
     */
    public function getPersistenceHandler()
    {
        if (!$this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the event dispatcher, creating an EventDispatcher when none was set.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Signal callback installed by the constructor.
     *
     * Dispatches SPIDER_CRAWL_USER_STOPPED for the termination signals listed
     * below; any other signal is ignored.
     *
     * @param int $signal the signal number that was received
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            case SIGKILL: // cannot be delivered to a handler; kept for backward compatibility
            case SIGINT:
            case SIGQUIT:
            case SIGHUP: // registered in the constructor, so it has to be handled here as well
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
        }
    }

    /**
     * Checks the resource against the registered post-fetch filters.
     *
     * Stops at the first filter that matches and dispatches
     * SPIDER_CRAWL_FILTER_POSTFETCH with the URI of the resource.
     *
     * @param Resource $resource
     * @return bool true when at least one filter matched the resource
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $filter) {
            if ($filter->match($resource)) {
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
                    new GenericEvent($this, array('uri' => $resource->getUri()))
                );
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the number of persisted resources has reached the download limit.
     *
     * @return bool always false when the limit is 0 (no limit)
     */
    private function isDownLoadLimitExceeded()
    {
        // $downloadLimit is public, so it may have been assigned '0' or null;
        // normalise it so that those values also mean "no limit"
        $limit = (int) $this->downloadLimit;

        return $limit !== 0 && $this->getPersistenceHandler()->count() >= $limit;
    }

    /**
     * Function that crawls each provided URI
     * It applies all processors and listeners set on the Spider
     *
     * Depending on the order in which the queue manager hands out URIs this
     * is either a depth first algorithm as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we don't do it recursively, but iteratively,
     * results will be in a different order from the example, because
     * we always take the right-most child first, whereas a recursive
     * variant would always take the left-most child first
     *
     * or a breadth first algorithm.
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            if ($this->isDownLoadLimitExceeded()) {
                break;
            }

            // Fetch the document; failed or filtered fetches are skipped
            if (!$resource = $this->fetchResource($currentUri)) {
                continue;
            }

            $this->getPersistenceHandler()->persist($resource);

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            $queueManager = $this->getQueueManager();
            $nextLevel = $resource->depthFound + 1;

            // maxDepth is not part of the QueueManager interface, so the depth
            // limit is only enforced for implementations that expose it
            if (isset($queueManager->maxDepth) && $nextLevel > $queueManager->maxDepth) {
                continue;
            }

            // Once the document is persisted, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                $uriString = $uri->toString();

                // Always skip nodes we already visited
                if (array_key_exists($uriString, $this->alreadySeenUris)) {
                    continue;
                }

                try {
                    $queueManager->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }

                // filtered or queued: mark as seen
                $this->alreadySeenUris[$uriString] = $nextLevel;
            }
        }
    }

    /**
     * Requests the URI and returns the resulting resource.
     *
     * Dispatches SPIDER_CRAWL_PRE_REQUEST before and SPIDER_CRAWL_POST_REQUEST
     * after the request. Any exception thrown inside the try block is
     * reported through SPIDER_CRAWL_ERROR_REQUEST instead of being rethrown.
     *
     * @param UriInterface $uri
     * @return bool|Resource false when the request failed or a post-fetch filter matched
     */
    protected function fetchResource(UriInterface $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        try {
            $resource = $this->getRequestHandler()->request($uri);

            // A URI that was never recorded as seen is treated as depth 0
            // instead of raising an undefined index notice
            $uriString = $uri->toString();
            $resource->depthFound = isset($this->alreadySeenUris[$uriString])
                ? $this->alreadySeenUris[$uriString]
                : 0;

            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'

            if ($this->matchesPostfetchFilter($resource)) {
                return false;
            }

            return $resource;
        } catch (\Exception $e) {
            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
            );

            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'

            return false;
        }
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string $eventName
     * @param Event|null $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * Stores the seed and records its normalized form as seen at depth 0.
     *
     * @param string $uri
     */
    private function setSeed($uri)
    {
        $this->seed = new FilterableUri($uri);
        $this->alreadySeenUris[$this->seed->normalize()->toString()] = 0;
    }
}
358