Completed
Pull Request — master (#150)
by Brent
02:13
created

Crawler::respectRobots()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Spatie\Crawler\Handlers\CrawlRequestFailed;
7
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
8
use Tree\Node\Node;
9
use GuzzleHttp\Pool;
10
use GuzzleHttp\Client;
11
use GuzzleHttp\Psr7\Uri;
12
use GuzzleHttp\Psr7\Request;
13
use Spatie\Robots\RobotsTxt;
14
use InvalidArgumentException;
15
use GuzzleHttp\RequestOptions;
16
use Psr\Http\Message\UriInterface;
17
use Spatie\Browsershot\Browsershot;
18
use Symfony\Component\DomCrawler\Link;
19
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
20
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
21
22
class Crawler
{
    use CrawlerProperties;

    /** @var \GuzzleHttp\Client HTTP client used to perform every request. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Normalized URL the crawl started from. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] Observers notified before each crawl and when crawling finishes. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which discovered URLs should be crawled. */
    protected $crawlProfile;

    /** @var int Maximum number of concurrent requests per pool. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Holds pending and processed URLs. */
    protected $crawlQueue;

    /** @var int Number of URLs added to the crawl queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Stop queueing new URLs once this many have been queued; null means unlimited. */
    protected $maximumCrawlCount = null;

    /** @var int 2 MB — presumably caps response body size; confirm usage in the request handlers. */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth from the base URL; null means unlimited. */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt rules are honored. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Tree tracking the depth at which each URL was discovered. */
    protected $depthTree;

    /** @var bool Render pages with a headless browser before extracting links. */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created headless-browser wrapper. */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Parsed robots.txt for the base URL's host. */
    protected $robotsTxt = null;

    /** @var array Guzzle options used when create() is called without any. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
77
78
    /**
     * Build a new Crawler around a Guzzle client.
     *
     * @param array $clientOptions Guzzle request options; when empty the
     *                             class-wide defaults are used instead.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (! count($clientOptions)) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
93
94
    /**
     * @param \GuzzleHttp\Client $client      Client used for all HTTP traffic.
     * @param int                $concurrency Maximum number of concurrent requests.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        // Defaults: crawl every URL, keep the queue in memory.
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CollectionCrawlQueue();
    }
104
105
    /**
     * Start crawling from the given base URL and block until the queue drains.
     *
     * Missing scheme defaults to http and a missing path to '/'. All observers
     * receive finishedCrawling() once the queue is empty.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Only consult robots.txt when asked to respect it. Previously the
        // robots check ran unconditionally, so a disallowed base URL was
        // never queued even with respectRobots disabled.
        if (! $this->respectRobots || $this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
140
141
    /**
     * Drain the crawl queue: send pending requests in concurrent batches
     * until no pending URLs remain.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $requestPool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new CrawlRequestFulfilled($this),
                'rejected' => new CrawlRequestFailed($this),
            ]);

            $requestPool->promise()->wait();
        }
    }
156
157
    /**
     * Determine whether $haystack ends with $needle.
     *
     * The previous strrpos-based check reported true for needles that do not
     * occur in the haystack at all: strrpos() returns false on no match,
     * which coerces to 0, so e.g. endsWith('ab', 'zb') was true.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        // Empty needle is a suffix of everything; substr(..., -0) would
        // otherwise return the whole haystack and compare unequal to ''.
        return $needle === '' || substr($haystack, -strlen($needle)) === $needle;
    }
162
163
    /**
     * Fetch and parse the robots.txt that governs the given URI's host.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        $robotsTxtUri = $uri->withPath('/robots.txt');

        return RobotsTxt::create($robotsTxtUri);
    }
167
168
    /**
     * Yield a GET request for every pending URL that passes the crawl profile.
     *
     * URLs rejected by the profile or already processed are marked/skipped.
     * Each request is keyed by the CrawlUrl id so the pool handlers can map
     * responses back to their queue entries.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($pendingUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($pendingUrl->url)) {
                $this->crawlQueue->markAsProcessed($pendingUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($pendingUrl)) {
                continue;
            }

            // Give every observer a chance to react before the request goes out.
            foreach ($this->crawlObservers as $observer) {
                $observer->willCrawl($pendingUrl->url);
            }

            $this->crawlQueue->markAsProcessed($pendingUrl);

            yield $pendingUrl->getId() => new Request('GET', $pendingUrl->url);
        }
    }
189
190
    /**
     * Extract every link from the given HTML and add the crawlable ones to
     * the queue.
     *
     * Links are kept only when they use http/https, pass the crawl profile,
     * are not already queued, honor robots/depth rules, and the maximum crawl
     * count has not been reached.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Page the links were found on.
     */
    public function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                // addToDepthTree() returns null when $foundOnUrl is not in the
                // depth tree; shouldCrawl(Node $node) would then fatal with a
                // TypeError, so skip such links instead.
                if (is_null($node)) {
                    return;
                }

                // Relative tel: links resolve to a path like '/tel:123'; skip them.
                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
227
228
    /**
     * Decide whether the URL held by this depth-tree node should be crawled.
     *
     * Both constraints now apply together: robots.txt (when respected) can
     * veto a URL, and the maximum depth is enforced regardless. Previously
     * the method returned early on the robots answer, so maximumDepth was
     * silently ignored whenever respectRobots was enabled.
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }
240
241
    /**
     * Pull every anchor link out of the HTML found on the given URL.
     *
     * Links marked rel="nofollow" are dropped, as are hrefs that cannot be
     * parsed into a valid URI. When JavaScript execution is enabled the page
     * is re-rendered in a headless browser first.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            // Re-render so links injected by client-side JS are present.
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $links = (new DomCrawler($html, $foundOnUrl))->filterXpath('//a')->links();

        return collect($links)
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Unparseable href: map to null, removed by filter() below.
                    return;
                }
            })
            ->filter();
    }
268
269
    /**
     * Strip the fragment so URLs differing only by anchor (#...) are
     * treated as the same page.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
273
274
    /**
     * Determine whether the URI uses a scheme we can actually fetch
     * (http or https); weeds out mailto:, tel:, javascript:, etc.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // strict: true avoids in_array()'s loose-comparison surprises.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
278
279
    /**
     * Insert $url into the depth tree as a child of the node whose value
     * equals $parentUrl, searching the subtree rooted at $node depth-first.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when the
     *                              parent URL is not present in this subtree.
     */
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $childNode) {
            $inserted = $this->addToDepthTree($childNode, $url, $parentUrl);

            if (! is_null($inserted)) {
                return $inserted;
            }
        }

        return null;
    }
301
302
    /**
     * Render the page in a headless browser and return its body HTML with
     * HTML entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $renderedHtml = $this->getBrowsershot()
            ->setUrl((string) $foundOnUrl)
            ->bodyHtml();

        return html_entity_decode($renderedHtml);
    }
310
311
    /**
     * Lazily create (and cache) the Browsershot instance used for
     * JavaScript rendering.
     */
    protected function getBrowsershot(): Browsershot
    {
        if (is_null($this->browsershot)) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
319
320
    /**
     * Use a custom Browsershot instance (e.g. with preconfigured
     * binary paths or options).
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
326
327
    /**
     * Queue a URL and count it toward the maximum crawl count.
     *
     * NOTE(review): the counter increments when a URL is *queued*, not when
     * it is actually fetched, so maximumCrawlCount effectively limits queued
     * URLs — confirm this is the intended semantics.
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
335
336
    /**
     * True when a maximum crawl count is configured and the number of
     * queued URLs has reached it.
     */
    protected function maximumCrawlCountReached(): bool
    {
        if ($this->maximumCrawlCount === null) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
344
}
345