CrawlRequestFulfilled - Code Metrics - spatie/crawler - Measure and Improve Code Quality continuously with Scrutinizer

CrawlRequestFulfilled A
last analyzed 2019-11-23 19:50 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	83
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	11

Importance

Changes

Metric	Value
wmc	12
lcom	1
cbo	11
dl	0
loc	83
rs	10
c	0
b	0
f	0

6 Methods

Rating	Name	Size	Complexity
A	__construct()	6	1
B	__invoke()	33	6
A	getBaseUrl()	10	2
A	handleCrawled()	4	1
A	convertBodyToString()	8	1
A	getBodyAfterExecutingJavaScript()	8	1

<?php

namespace Spatie\Crawler\Handlers;

use function GuzzleHttp\Psr7\stream_for;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RedirectMiddleware;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlerRobots;
use Spatie\Crawler\CrawlSubdomains;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\LinkAdder;

/**
 * Invokable handler that processes a single fetched response.
 *
 * For each response it notifies the crawl observers (when
 * CrawlerRobots::mayIndex() allows it) and passes the response body to
 * LinkAdder::addFromHtml() (when CrawlerRobots::mayFollow() allows it and
 * the host check passes).
 *
 * NOTE(review): the (response, index) signature suggests this is used as the
 * "fulfilled" callback of a request pool — confirm against the caller.
 */
class CrawlRequestFulfilled
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /** @var \Spatie\Crawler\LinkAdder */
    protected $linkAdder;

    /**
     * @param \Spatie\Crawler\Crawler $crawler Crawler whose queue, profile, observers and settings are consulted.
     */
    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;

        $this->linkAdder = new LinkAdder($this->crawler);
    }

    /**
     * Handle one fetched response.
     *
     * @param \Psr\Http\Message\ResponseInterface $response The response that was received.
     * @param mixed $index Identifier used to look the crawled URL up in the crawl queue.
     *
     * @return void
     */
    public function __invoke(ResponseInterface $response, $index)
    {
        // The robots rules are built from the response as received, before
        // its body is possibly replaced by the JavaScript-rendered HTML below.
        $robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());

        $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

        if ($this->crawler->mayExecuteJavaScript()) {
            $html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);

            $response = $response->withBody(stream_for($html));
        }

        if ($robots->mayIndex()) {
            $this->handleCrawled($response, $crawlUrl);
        }

        // Unless the profile is CrawlSubdomains, links are only collected
        // from pages whose host equals the crawler's base URL host.
        if (
            ! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains
            && $crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()
        ) {
            return;
        }

        if (! $robots->mayFollow()) {
            return;
        }

        $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
        $baseUrl = $this->getBaseUrl($response, $crawlUrl);

        $this->linkAdder->addFromHtml($body, $baseUrl);

        // usleep() interprets its argument as microseconds. Note that the
        // delay is only applied when links were actually collected.
        usleep($this->crawler->getDelayBetweenRequests());
    }

    /**
     * Determine the base URL handed to the link adder together with the body.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return \Psr\Http\Message\UriInterface The crawled URL, or the last entry of the redirect history header when present.
     */
    protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        // Presumably this header lists the URLs visited while following
        // redirects, in order, so the last entry is the final location —
        // verify against Guzzle's RedirectMiddleware.
        $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);

        if (empty($redirectHistory)) {
            return $crawlUrl->url;
        }

        // end() cannot return false here: the empty case returned above.
        return new Uri(end($redirectHistory));
    }

    /**
     * Notify the crawl observers that a URL has been crawled.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return void
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
    }

    /**
     * Read at most $readMaximumBytes bytes from the start of a body stream.
     *
     * The stream is only rewound when it is seekable, because rewinding a
     * non-seekable PSR-7 stream throws. Reading is done in a loop because a
     * single read() call may return fewer bytes than requested even though
     * more data is still available.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes Upper bound on the number of bytes returned (defaults to 2 MiB).
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        if ($bodyStream->isSeekable()) {
            $bodyStream->rewind();
        }

        $body = '';
        $remainingBytes = $readMaximumBytes;

        while ($remainingBytes > 0 && ! $bodyStream->eof()) {
            $chunk = $bodyStream->read($remainingBytes);

            // An empty chunk without EOF would otherwise loop forever.
            if ($chunk === '') {
                break;
            }

            $body .= $chunk;
            $remainingBytes -= strlen($chunk);
        }

        return $body;
    }

    /**
     * Fetch the page's HTML through the crawler's Browsershot instance.
     *
     * @param \Psr\Http\Message\UriInterface $url URL to load; it is cast to a string for Browsershot.
     *
     * @return string The body HTML reported by Browsershot, with HTML entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $url)->bodyHtml();

        return html_entity_decode($html);
    }
}


1			<?php
2
3			namespace Spatie\Crawler\Handlers;
4
5			use function GuzzleHttp\Psr7\stream_for;
6			use GuzzleHttp\Psr7\Uri;
7			use GuzzleHttp\RedirectMiddleware;
8			use Psr\Http\Message\ResponseInterface;
9			use Psr\Http\Message\StreamInterface;
10			use Psr\Http\Message\UriInterface;
11			use Spatie\Crawler\Crawler;
12			use Spatie\Crawler\CrawlerRobots;
13			use Spatie\Crawler\CrawlSubdomains;
14			use Spatie\Crawler\CrawlUrl;
15			use Spatie\Crawler\LinkAdder;
16
17			class CrawlRequestFulfilled
18			{
19			/** @var \Spatie\Crawler\Crawler */
20			protected $crawler;
21
22			/** @var \Spatie\Crawler\LinkAdder */
23			protected $linkAdder;
24
25			public function __construct(Crawler $crawler)
26			{
27			$this->crawler = $crawler;
28
29			$this->linkAdder = new LinkAdder($this->crawler);
30			}
31
32			public function __invoke(ResponseInterface $response, $index)
33			{
34			$robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());
35
36			$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);
37
38			if ($this->crawler->mayExecuteJavaScript()) {
39			$html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);
40
41			$response = $response->withBody(stream_for($html));
42			}
43
44			if ($robots->mayIndex()) {
45			$this->handleCrawled($response, $crawlUrl);
46			}
47
48			if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
49			if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
50			return;
51			}
52			}
53
54			if (! $robots->mayFollow()) {
55			return;
56			}
57
58			$body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
59			$baseUrl = $this->getBaseUrl($response, $crawlUrl);
60
61			$this->linkAdder->addFromHtml($body, $baseUrl);
62
63			usleep($this->crawler->getDelayBetweenRequests());
64			}
65
66			protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl)
67			{
68			$redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);
69
70			if (empty($redirectHistory)) {
71			return $crawlUrl->url;
72			}
73
74			return new Uri(end($redirectHistory));
			0 ignored issues – show Security Bug introduced 2019-06-06 07:59 UTC by Report Bug Copy Issue Report It seems that `end($redirectHistory)` (the return value of `end()`) can also be of type `false`; however, `GuzzleHttp\Psr7\Uri::__construct()` only seems to accept `string`. Did you maybe forget to handle an error condition? Loading history...
75			}
76
77			protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
78			{
79			$this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
80			}
81
82			protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
83			{
84			$bodyStream->rewind();
85
86			$body = $bodyStream->read($readMaximumBytes);
87
88			return $body;
89			}
90
91			protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
92			{
93			$browsershot = $this->crawler->getBrowsershot();
94
95			$html = $browsershot->setUrl((string) $url)->bodyHtml();
96
97			return html_entity_decode($html);
98			}
99			}
100

spatie / crawler

CrawlRequestFulfilled A last analyzed 2019-11-23 19:50 UTC

Complexity

Size/Duplication

Coupling/Cohesion

Importance

6 Methods

Duplication Side-by-Side

Filter issues like

CrawlRequestFulfilled A
last analyzed 2019-11-23 19:50 UTC