Spider::__construct()   A
last analyzed

Complexity

Conditions 4
Paths 6

Size

Total Lines 17
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 12
nc 6
nop 2
dl 0
loc 17
rs 9.8666
c 0
b 0
f 0
1
<?php
2
namespace VDB\Spider;
3
4
use Symfony\Component\EventDispatcher\Event;
5
use Symfony\Component\EventDispatcher\EventDispatcher;
6
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
7
use Symfony\Component\EventDispatcher\GenericEvent;
8
use VDB\Spider\Discoverer\DiscovererSet;
9
use VDB\Spider\Event\SpiderEvents;
10
use VDB\Spider\Exception\QueueException;
11
use VDB\Spider\QueueManager\QueueManagerInterface;
12
use VDB\Spider\QueueManager\InMemoryQueueManager;
13
use VDB\Spider\Uri\DiscoveredUri;
14
use VDB\Spider\Downloader\DownloaderInterface;
15
use VDB\Spider\Downloader\Downloader;
16
use VDB\Uri\UriInterface;
17
use VDB\Uri\Uri;
18
19
/**
 * Crawls a site starting from a single seed URI.
 *
 * The collaborators (downloader, queue manager, event dispatcher and
 * discoverer set) can be injected through the setters. Each getter lazily
 * creates a default implementation when nothing was injected.
 */
class Spider
{
    /** @var DownloaderInterface|null */
    private $downloader;

    /** @var QueueManagerInterface|null */
    private $queueManager;

    /** @var EventDispatcherInterface|null */
    private $dispatcher;

    /** @var DiscovererSet|null */
    private $discovererSet;

    /** @var DiscoveredUri The URI of the site to spider */
    private $seed;

    /** @var string the unique id of this spider instance */
    private $spiderId;

    /**
     * Stores the seed, assigns the spider id and, when running on the CLI
     * with the pcntl extension available, installs the signal handlers.
     *
     * @param string $seed the URI to start crawling
     * @param string|null $spiderId explicit id; when null an id is derived
     *                              from the seed and the current microtime
     */
    public function __construct($seed, $spiderId = null)
    {
        $this->setSeed($seed);

        $this->spiderId = null !== $spiderId
            ? $spiderId
            : md5($seed . microtime(true));

        // This makes the spider handle signals gracefully and allows us to do cleanup
        if (php_sapi_name() === 'cli') {
            // Ticks give PHP the opportunity to run handlers registered with pcntl_signal().
            declare(ticks = 1);
            if (function_exists('pcntl_signal')) {
                $handler = array($this, 'handleSignal');

                pcntl_signal(SIGTERM, $handler);
                pcntl_signal(SIGINT, $handler);
                // NOTE(review): handleSignal() has no case for SIGHUP, so a
                // hang-up is caught here but no event is dispatched for it.
                // Confirm that silently ignoring SIGHUP is intended.
                pcntl_signal(SIGHUP, $handler);
                pcntl_signal(SIGQUIT, $handler);
            }
        }
    }

    /**
     * Starts crawling the URI provided on instantiation.
     *
     * Queues the seed, hands the spider id to the downloader's persistence
     * handler and then runs the crawl loop.
     *
     * @return void
     */
    public function crawl()
    {
        $this->getQueueManager()->addUri($this->seed);
        $this->getDownloader()->getPersistenceHandler()->setSpiderId($this->spiderId);

        $this->doCrawl();
    }

    /**
     * @param DiscovererSet $discovererSet
     * @return $this
     */
    public function setDiscovererSet(DiscovererSet $discovererSet)
    {
        $this->discovererSet = $discovererSet;

        return $this;
    }

    /**
     * Returns the injected discoverer set, creating an empty one on first use.
     *
     * @return DiscovererSet
     */
    public function getDiscovererSet()
    {
        if (!$this->discovererSet) {
            $this->discovererSet = new DiscovererSet();
        }

        return $this->discovererSet;
    }

    /**
     * @param QueueManagerInterface $queueManager
     * @return $this
     */
    public function setQueueManager(QueueManagerInterface $queueManager)
    {
        $this->queueManager = $queueManager;

        return $this;
    }

    /**
     * Returns the injected queue manager, defaulting to an in-memory queue.
     *
     * @return QueueManagerInterface
     */
    public function getQueueManager()
    {
        if (!$this->queueManager) {
            $this->queueManager = new InMemoryQueueManager();
        }

        return $this->queueManager;
    }

    /**
     * @param DownloaderInterface $downloader
     * @return $this
     */
    public function setDownloader(DownloaderInterface $downloader)
    {
        $this->downloader = $downloader;

        return $this;
    }

    /**
     * Returns the injected downloader, creating the default one on first use.
     *
     * @return DownloaderInterface
     */
    public function getDownloader()
    {
        if (!$this->downloader) {
            $this->downloader = new Downloader();
        }

        return $this->downloader;
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the injected event dispatcher, creating a default one on first use.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }

        return $this->dispatcher;
    }

    /**
     * Signal handler installed by the constructor.
     *
     * Dispatches SpiderEvents::SPIDER_CRAWL_USER_STOPPED for the termination
     * signals listed below; any other signal number is ignored.
     *
     * @param int $signal the signal number that was received
     * @return void
     */
    public function handleSignal($signal)
    {
        switch ($signal) {
            case SIGTERM:
            // SIGKILL cannot be caught by a process, so this case is only
            // reachable when the method is called directly.
            case SIGKILL:
            case SIGINT:
            case SIGQUIT:
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED);
        }
    }

    /**
     * Crawls every URI handed out by the queue manager and applies all
     * discoverers and listeners set on the Spider.
     *
     * The traversal order is whatever the queue manager's next() yields.
     * This is either a depth-first algorithm, as explained here:
     *  https://en.wikipedia.org/wiki/Depth-first_search#Example
     * Note that because we do it iteratively rather than recursively,
     * results will be in a different order from the example: we always
     * take the right-most child first, whereas a recursive variant would
     * always take the left-most child first.
     *
     * Or it is a breadth-first algorithm.
     *
     * @return void
     */
    private function doCrawl()
    {
        while ($currentUri = $this->getQueueManager()->next()) {
            if ($this->getDownloader()->isDownLoadLimitExceeded()) {
                break;
            }

            // A falsy download result means there is nothing to process for this URI.
            if (!$resource = $this->getDownloader()->download($currentUri)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED,
                new GenericEvent($this, array('uri' => $currentUri))
            );

            // Once the resource is downloaded, apply the discoverers to look for more links to follow
            $discoveredUris = $this->getDiscovererSet()->discover($resource);

            foreach ($discoveredUris as $uri) {
                try {
                    $this->getQueueManager()->addUri($uri);
                } catch (QueueException $e) {
                    // when the queue size is exceeded, we stop discovering
                    break;
                }
            }
        }
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * Dispatchers implementing the Symfony Contracts interface expect
     * dispatch(object $event, string $eventName); older dispatchers expect
     * dispatch($eventName, Event $event = null). The matching argument order
     * is selected at runtime so both generations keep working.
     *
     * @param string $eventName
     * @param null|Event $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $dispatcher = $this->getDispatcher();

        // instanceof against a class name that does not exist simply yields
        // false, so this check is safe when the Contracts package is absent.
        if ($dispatcher instanceof \Symfony\Contracts\EventDispatcher\EventDispatcherInterface) {
            // The contracts signature requires an event object, so supply a
            // plain Event when the caller passed none.
            $dispatcher->dispatch(null === $event ? new Event() : $event, $eventName);

            return;
        }

        $dispatcher->dispatch($eventName, $event);
    }

    /**
     * Wraps the seed URI string in a DiscoveredUri found at depth 0.
     *
     * @param string $uri
     */
    private function setSeed($uri)
    {
        $this->seed = new DiscoveredUri(new Uri($uri));
        $this->seed->setDepthFound(0);
    }
}
242