CrawlRequestFulfilled   A
last analyzed

Complexity

Total Complexity 12

Size/Duplication

Total Lines 83
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 11

Importance

Changes 0
Metric Value
wmc 12
lcom 1
cbo 11
dl 0
loc 83
rs 10
c 0
b 0
f 0

6 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 6 1
B __invoke() 0 33 6
A getBaseUrl() 0 10 2
A handleCrawled() 0 4 1
A convertBodyToString() 0 8 1
A getBodyAfterExecutingJavaScript() 0 8 1
1
<?php
2
3
namespace Spatie\Crawler\Handlers;
4
5
use function GuzzleHttp\Psr7\stream_for;
6
use GuzzleHttp\Psr7\Uri;
7
use GuzzleHttp\RedirectMiddleware;
8
use Psr\Http\Message\ResponseInterface;
9
use Psr\Http\Message\StreamInterface;
10
use Psr\Http\Message\UriInterface;
11
use Spatie\Crawler\Crawler;
12
use Spatie\Crawler\CrawlerRobots;
13
use Spatie\Crawler\CrawlSubdomains;
14
use Spatie\Crawler\CrawlUrl;
15
use Spatie\Crawler\LinkAdder;
16
17
/**
 * Invokable handler that is run for a crawl request that completed
 * successfully.
 *
 * It reports the response to the crawler's observers (when the robots rules
 * allow indexing) and hands the response body to the LinkAdder so that links
 * found in it can be added (when the robots rules allow following).
 */
class CrawlRequestFulfilled
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /** @var \Spatie\Crawler\LinkAdder */
    protected $linkAdder;

    /**
     * @param \Spatie\Crawler\Crawler $crawler Crawler whose configuration, queue and observers are used.
     */
    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;

        $this->linkAdder = new LinkAdder($this->crawler);
    }

    /**
     * Handle a fulfilled request.
     *
     * @param \Psr\Http\Message\ResponseInterface $response Response received for the request.
     * @param mixed $index Identifier used to look the crawled URL up in the crawl queue.
     *
     * @return void
     */
    public function __invoke(ResponseInterface $response, $index)
    {
        // The robots rules are built from the response as received, i.e.
        // before the body is possibly replaced by the JavaScript-rendered one.
        $robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());

        $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

        if ($this->crawler->mayExecuteJavaScript()) {
            $html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);

            $response = $response->withBody(stream_for($html));
        }

        if ($robots->mayIndex()) {
            $this->handleCrawled($response, $crawlUrl);
        }

        // Unless the crawl profile is CrawlSubdomains, links are only
        // collected from pages on the exact host of the crawler's base URL.
        if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
            if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
                return;
            }
        }

        if (! $robots->mayFollow()) {
            return;
        }

        $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
        $baseUrl = $this->getBaseUrl($response, $crawlUrl);

        $this->linkAdder->addFromHtml($body, $baseUrl);

        // NOTE: the delay is only applied on this path; the early returns
        // above skip it.
        usleep($this->crawler->getDelayBetweenRequests());
    }

    /**
     * Determine the URL that links found in the response are resolved against.
     *
     * When the response carries a redirect history header, the last entry of
     * that history is used; otherwise the URL that was originally queued.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);

        if (empty($redirectHistory)) {
            return $crawlUrl->url;
        }

        // end() can return false, while Uri::__construct() only accepts a
        // string. Handle that case explicitly instead of passing it through.
        $lastRedirect = end($redirectHistory);

        if (! is_string($lastRedirect)) {
            return $crawlUrl->url;
        }

        return new Uri($lastRedirect);
    }

    /**
     * Notify the crawl observers that a URL has been crawled.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return void
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
    }

    /**
     * Read at most $readMaximumBytes bytes from the start of a body stream.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes Upper bound on the number of bytes returned (defaults to 2 MiB).
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        // rewind() throws on streams that cannot seek, so only rewind when
        // the stream supports it.
        if ($bodyStream->isSeekable()) {
            $bodyStream->rewind();
        }

        $body = '';
        $bytesLeft = (int) $readMaximumBytes;

        // StreamInterface::read() may return fewer bytes than requested, so
        // keep reading until the limit is reached or the stream is exhausted.
        while ($bytesLeft > 0 && ! $bodyStream->eof()) {
            $chunk = $bodyStream->read($bytesLeft);

            if ($chunk === '') {
                break;
            }

            $body .= $chunk;
            $bytesLeft -= strlen($chunk);
        }

        return $body;
    }

    /**
     * Fetch the page with the crawler's Browsershot instance and return its
     * body HTML with HTML entities decoded.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $url)->bodyHtml();

        return html_entity_decode($html);
    }
}
100