Completed
Push — master ( f55f25...8f63cd )
by Freek
01:50
created

CrawlRequestFulfilled::__invoke()   B

Complexity

Conditions 6
Paths 11

Size

Total Lines 32

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
nc 11
nop 2
dl 0
loc 32
rs 8.7857
c 0
b 0
f 0
1
<?php
2
3
namespace Spatie\Crawler\Handlers;
4
5
use Spatie\Crawler\Crawler;
6
use Spatie\Crawler\CrawlUrl;
7
use Spatie\Crawler\LinkAdder;
8
use Spatie\Crawler\CrawlerRobots;
9
use Psr\Http\Message\UriInterface;
10
use Spatie\Crawler\CrawlSubdomains;
11
use Psr\Http\Message\StreamInterface;
12
use Psr\Http\Message\ResponseInterface;
13
use function GuzzleHttp\Psr7\stream_for;
14
15
/**
 * Invokable handler for a crawl request that completed successfully.
 *
 * It reports the crawled URL and its response to the crawl observers and,
 * when the robots rules and the crawl profile allow it, passes the response
 * body to LinkAdder::addFromHtml().
 */
class CrawlRequestFulfilled
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /** @var \Spatie\Crawler\LinkAdder */
    protected $linkAdder;

    /**
     * @param \Spatie\Crawler\Crawler $crawler The crawler whose queue, profile and settings this handler uses.
     */
    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;

        $this->linkAdder = new LinkAdder($this->crawler);
    }

    /**
     * Process the response of a fulfilled request.
     *
     * @param \Psr\Http\Message\ResponseInterface $response The received response.
     * @param mixed $index Identifier used to look the crawled URL up in the crawl queue.
     */
    public function __invoke(ResponseInterface $response, $index)
    {
        $robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());

        // Nothing is reported to the observers when the robots rules forbid indexing.
        if (! $robots->mayIndex()) {
            return;
        }

        $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

        if ($this->crawler->mayExecuteJavaScript()) {
            // Replace the response body with the markup obtained after running JavaScript.
            $html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);

            $response = $response->withBody(stream_for($html));
        }

        $this->handleCrawled($response, $crawlUrl);

        // Unless the crawl profile is CrawlSubdomains, links are not extracted
        // from pages whose host differs from the base URL's host.
        if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
            if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
                return;
            }
        }

        if (! $robots->mayFollow()) {
            return;
        }

        $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());

        $this->linkAdder->addFromHtml($body, $crawlUrl->url);
    }

    /**
     * Notify the crawl observers that the given URL was crawled.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
    }

    /**
     * Read at most $readMaximumBytes bytes of the given stream into a string.
     *
     * Seekable streams are read from the start; streams that cannot seek are
     * read from their current position.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes Upper bound on the number of bytes returned.
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        // rewind() throws on streams that cannot seek, so only rewind when it is supported.
        if ($bodyStream->isSeekable()) {
            $bodyStream->rewind();
        }

        $body = '';
        $bytesLeft = $readMaximumBytes;

        // A single read() may legitimately return fewer bytes than requested,
        // so keep reading until the limit is reached or no more data arrives.
        while ($bytesLeft > 0 && ! $bodyStream->eof()) {
            $chunk = $bodyStream->read($bytesLeft);

            // An empty read means there is nothing more to fetch right now;
            // stop instead of spinning on a stream that never reports EOF.
            if ((string) $chunk === '') {
                break;
            }

            $body .= $chunk;
            $bytesLeft -= strlen($chunk);
        }

        return $body;
    }

    /**
     * Fetch the body HTML of the given URL through the crawler's Browsershot instance.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return string The body HTML with HTML entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $url)->bodyHtml();

        return html_entity_decode($html);
    }
}
86