Pull Request — master (#150)
by Brent, created 02:10 (Completed)

Crawler::startCrawlingQueue() (rated A)

Complexity

Conditions 2
Paths 2

Size

Total Lines 15
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric   Value     Meaning (where clear from the figures above)
c        0
b        0
f        0
dl       0         duplicated lines (matches Duplication: Lines 0)
loc      15        lines of code (matches Total Lines 15)
rs       9.4285
cc       2         cyclomatic complexity (matches Conditions 2)
eloc     9         executable lines of code (matches Code Lines 9)
nc       2         number of paths (matches Paths 2)
nop      0         number of parameters
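
The complexity figures follow from the method's single decision point. As a condensed paraphrase of the control flow (not a verbatim excerpt from the listing below):

    // startCrawlingQueue() has one branching construct, the while loop,
    // giving cyclomatic complexity 1 + 1 = 2 and two execution paths:
    // the queue is empty at entry, or the loop runs until it drains.
    while ($this->crawlQueue->hasPendingUrls()) {
        // build a Guzzle Pool over the pending requests and wait on it
    }

The full Crawler.php under review follows.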
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;

class Crawler
{
    use CrawlerProperties;

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\ObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new ObserverCollection();
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new CrawlRequestFulfilled($this),
                'rejected' => new CrawlRequestFailed($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }
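
    // Editor's note on the loop above: Guzzle's Pool consumes the generator
    // from getCrawlRequests() lazily, so URLs enqueued while requests are in
    // flight can still be yielded within the same pool. The outer while
    // covers the leftover case: when the generator runs dry while responses
    // are still pending, their handlers (CrawlRequestFulfilled /
    // CrawlRequestFailed, not shown in this listing) may add more URLs
    // after the pool settles, and the next pass picks those up.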

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when $needle never occurs, and the old
        // `strrpos(...) + strlen($needle) === strlen($haystack)` arithmetic
        // coerced that false to 0, wrongly returning true for some absent
        // needles; comparing the actual suffix avoids the coercion.
        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
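
    // Editor's note: a URL is marked as processed before its request is
    // yielded, so the hasAlreadyBeenProcessed() check above (and the has()
    // check in addToCrawlQueue() below) keep a URL that resurfaces mid-crawl
    // from being fetched twice.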

    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
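
To put the class in context, a minimal usage sketch follows. It assumes the fluent setters that the CrawlerProperties trait conventionally provides in this package (setCrawlObserver() and setMaximumDepth(), neither of which appears in this listing), and MyObserver stands in for a hypothetical observer whose willCrawl() and finishedCrawling() hooks the code above invokes:

<?php

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;

// Note: passing any options to create() replaces $defaultClientOptions
// wholesale (see create() above), so restate every default you still want.
Crawler::create([
    RequestOptions::TIMEOUT => 30,
    RequestOptions::ALLOW_REDIRECTS => false,
])
    ->setCrawlObserver(new MyObserver()) // assumed trait setter, hypothetical observer
    ->setMaximumDepth(2)                 // assumed trait setter
    ->startCrawling('example.com');      // scheme and path are normalized by startCrawling()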