Completed: Push — master (338372...e58160) by Freek, 01:32 (created)

Crawler::setConcurrency() is rated A.

Complexity: Conditions 1, Paths 1
Size: Total Lines 6, Code Lines 3
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0

Metric values: dl 0, loc 6, rs 9.4285, c 0, b 0, f 0, cc 1, eloc 3, nc 1, nop 1
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();
    }

    public function setConcurrency(int $concurrency): self
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): self
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): self
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    public function ignoreRobots(): self
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): self
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): self
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): self
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): self
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): self
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): self
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): self
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): self
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        // Normalize the base URL: accept strings, default to the http
        // scheme, and make sure there is at least a root path.
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Seed the queue with the base URL unless robots.txt forbids it
        // and robots are being respected.
        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Recursively walk the depth tree to attach $url under its parent.
     * Returns the new node, or null when the parent is not in the tree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function startCrawlingQueue()
    {
        // Each pass drains the pending URLs through a Guzzle pool,
        // honouring the configured concurrency. New URLs discovered by
        // the handlers keep the queue non-empty for the next pass.
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new CrawlRequestFulfilled($this),
                'rejected' => new CrawlRequestFailed($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // Note: strrpos() returns false when $needle does not occur in
        // $haystack, which makes this comparison unreliable in that case.
        return strrpos($haystack, $needle) + strlen($needle) ===
            strlen($haystack);
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Lazily yield a GET request per pending URL. URLs rejected by the
     * crawl profile are marked processed and skipped; observers are
     * notified before each request is yielded to the pool.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
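
For reference, a minimal usage sketch of the fluent API defined above. MyObserver is a hypothetical stand-in for a concrete \Spatie\Crawler\CrawlObserver implementation, and the start URL is a placeholder; everything else maps directly onto methods shown in the class.

<?php

use Spatie\Crawler\Crawler;

Crawler::create()                        // builds a Guzzle client with the default options
    ->setConcurrency(5)                  // passed to the Guzzle pool in startCrawlingQueue()
    ->setMaximumCrawlCount(100)          // consulted via maximumCrawlCountReached()
    ->setMaximumDepth(3)
    ->respectRobots()                    // the default; ignoreRobots() disables the robots.txt check
    ->addCrawlObserver(new MyObserver()) // hypothetical CrawlObserver subclass
    ->startCrawling('https://example.com'); // plain strings are wrapped in a Uri internally

The chain works because every setter returns $this. In the same spirit, here is a sketch of a custom crawl profile, assuming CrawlProfile can be extended and declares the shouldCrawl(UriInterface $url): bool hook that getCrawlRequests() and addToCrawlQueue() invoke above; the class name SameHostProfile is made up for this example.

<?php

use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlProfile;

class SameHostProfile extends CrawlProfile
{
    /** @var string */
    protected $host;

    public function __construct(string $host)
    {
        $this->host = $host;
    }

    // Keep the crawl on a single host: anything else is rejected by
    // addToCrawlQueue() and skipped by getCrawlRequests().
    public function shouldCrawl(UriInterface $url): bool
    {
        return $url->getHost() === $this->host;
    }
}

Pass an instance through setCrawlProfile() to replace the CrawlAllUrls default that the constructor installs.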