<?php

namespace Spatie\Crawler;

use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;
class Crawler
{
    /** @var \GuzzleHttp\Client HTTP client used for all crawl requests. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface URL the crawl was started from. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection Observers notified of crawl lifecycle events. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which discovered URLs should be crawled. */
    protected $crawlProfile;

    /** @var int Maximum number of concurrent requests per pool. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue of pending/processed crawl URLs. */
    protected $crawlQueue;

    /** @var int Number of URLs accepted into the queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Stop adding URLs after this many (null = unlimited). */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth from the base URL (null = unlimited). */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt directives are honored. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Tree tracking the link depth of every discovered URL. */
    protected $depthTree;

    /** @var bool Whether pages are rendered with Browsershot before parsing. */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Parsed robots.txt; populated in startCrawling(). */
    protected $robotsTxt = null;

    /** @var array Guzzle options used when create() receives none. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler backed by a new Guzzle client.
     *
     * @param array $clientOptions Guzzle options; falls back to
     *                             static::$defaultClientOptions when empty.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        // Sensible defaults: crawl everything, keep the queue in memory.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();
    }

    public function setConcurrency(int $concurrency): self
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): self
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): self
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    public function ignoreRobots(): self
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): self
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): $robotsTxt is only populated in startCrawling(); calling
     * this beforehand returns null, which violates the declared return type.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): self
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): self
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): self
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * Set the crawl observer(s). A single observer is wrapped in an array.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): self
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): self
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): self
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): self
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function getBrowsershot(): Browsershot
    {
        // Lazily create a default instance when none was injected.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Start crawling from the given URL and block until the queue is drained.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize: default to http scheme and root path.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Insert $url into the depth tree as a child of the node whose value
     * matches $parentUrl, searching depth-first from $node (root by default).
     *
     * @return \Tree\Node\Node|null The newly created node, or null when the
     *                              parent was not found in this subtree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue by running request pools until no pending URLs
     * remain (handlers may add new URLs while a pool runs).
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new CrawlRequestFulfilled($this),
                'rejected' => new CrawlRequestFailed($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * Determine if $haystack ends with $needle.
     *
     * Fixed: the previous strrpos()-based check produced a false positive when
     * the needle did not occur at all but had the same length as the haystack
     * (strrpos() returns false, which the arithmetic coerced to 0).
     *
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        return $needle === '' || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Yield a GET request for every pending URL that the crawl profile
     * accepts; each yielded key is the crawl URL's id so pool callbacks can
     * map responses back to their CrawlUrl.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Add a URL to the queue unless the profile rejects it or it is already
     * queued; increments the crawled-URL counter on acceptance.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        // No limit configured means the crawl never stops on count.
        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}