Completed
Pull Request — master (#145)
by Brent
10:17 queued 06:31
created

Crawler   Rating: F

Complexity

Total Complexity 69

Size/Duplication

Total Lines 546
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 20

Importance

Changes 6
Bugs 2 Features 0
Metric                                 Value
wmc (weighted method count)            69
c (changes)                            6
b (bugs)                               2
f (features)                           0
lcom (lack of cohesion of methods)     2
cbo (coupling between objects)         20
dl (duplicated lines)                  0
loc (lines of code)                    546
rs                                     2.1568

35 Methods

Rating   Name   Duplication   Size   Complexity  
A create() 0 10 2
A setConcurrency() 0 6 1
A setMaximumResponseSize() 0 6 1
A setMaximumCrawlCount() 0 6 1
A setMaximumDepth() 0 6 1
A ignoreRobots() 0 6 1
A setCrawlQueue() 0 6 1
A executeJavaScript() 0 6 1
A doNotExecuteJavaScript() 0 6 1
A setCrawlObserver() 0 8 2
A setCrawlObservers() 0 6 1
A addCrawlObserver() 0 6 1
A setCrawlProfile() 0 6 1
B startCrawlingQueue() 0 49 6
A endsWith() 0 5 1
A convertBodyToString() 0 8 1
A createRobotsTxt() 0 4 1
A handleCrawled() 0 10 2
A handleCrawlFailed() 0 10 2
B getCrawlRequests() 0 21 5
B addAllLinksToCrawlQueue() 0 37 4
A shouldCrawl() 0 12 4
A extractAllLinks() 0 21 3
A normalizeUrl() 0 4 1
A hasCrawlableScheme() 0 4 1
B addtoDepthTree() 0 22 4
A getBodyAfterExecutingJavaScript() 0 8 1
A getBrowsershot() 0 10 2
A setBrowsershot() 0 6 1
A addToCrawlQueue() 0 8 1
A maximumCrawlCountReached() 0 8 2
A mayIndex() 0 8 3
A mayFollow() 0 8 3
B startCrawling() 0 30 5
A __construct() 0 10 1

How to fix: Complexity

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot unconnected or weakly connected components.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on those observations, apply Extract Interface as well.
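
As one illustration of such a split, the robots-related behaviour of Crawler (the $robotsTxt and $ignoreRobots fields together with mayIndex(), mayFollow() and the robots check inside shouldCrawl()) looks like a cohesive component. The sketch below shows what an Extract Class plus Extract Interface could produce; the RobotsPolicy and ObeysRobots names and the mayCrawl() method are hypothetical, assumed for illustration, and are not part of this pull request:

<?php

namespace Spatie\Crawler;

use Psr\Http\Message\UriInterface;
use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;
use Spatie\Robots\RobotsTxt;

// Hypothetical interface extracted from the robots-related behaviour of Crawler.
interface RobotsPolicy
{
    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool;

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool;

    public function mayCrawl(UriInterface $url): bool;
}

// Hypothetical extracted class: owns $robotsTxt and $ignoreRobots, which Crawler
// would then delegate to instead of carrying the fields itself.
class ObeysRobots implements RobotsPolicy
{
    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt;

    /** @var bool */
    protected $ignoreRobots;

    public function __construct(RobotsTxt $robotsTxt, bool $ignoreRobots = false)
    {
        $this->robotsTxt = $robotsTxt;
        $this->ignoreRobots = $ignoreRobots;
    }

    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // Same rule as the current Crawler::mayIndex(), relocated.
        return $this->ignoreRobots || ($robotsHeaders->mayIndex() && $robotsMeta->mayIndex());
    }

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // Same rule as the current Crawler::mayFollow(), relocated.
        return $this->ignoreRobots || ($robotsHeaders->mayFollow() && $robotsMeta->mayFollow());
    }

    public function mayCrawl(UriInterface $url): bool
    {
        // Covers the robots.txt check currently embedded in Crawler::shouldCrawl().
        return $this->ignoreRobots || $this->robotsTxt->allows((string) $url);
    }
}

Crawler would then hold a single RobotsPolicy collaborator instead of the $robotsTxt/$ignoreRobots pair and delegate to it from mayIndex(), mayFollow() and shouldCrawl(), which trims its field count and lets the robots rules be tested in isolation.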

1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use Spatie\Robots\Robots;
11
use GuzzleHttp\Psr7\Request;
12
use Spatie\Robots\RobotsTxt;
13
use InvalidArgumentException;
14
use Spatie\Robots\RobotsMeta;
15
use GuzzleHttp\RequestOptions;
16
use Spatie\Robots\RobotsHeaders;
17
use Psr\Http\Message\UriInterface;
18
use Spatie\Browsershot\Browsershot;
19
use Psr\Http\Message\StreamInterface;
20
use Symfony\Component\DomCrawler\Link;
21
use Psr\Http\Message\ResponseInterface;
22
use Spatie\Crawler\CrawlQueue\CrawlQueue;
23
use GuzzleHttp\Exception\RequestException;
24
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
25
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
26
27
class Crawler
28
{
29
    /** @var \GuzzleHttp\Client */
30
    protected $client;
31
32
    /** @var \Psr\Http\Message\UriInterface */
33
    protected $baseUrl;
34
35
    /** @var array[\Spatie\Crawler\CrawlObserver] */
36
    protected $crawlObservers;
37
38
    /** @var \Spatie\Crawler\CrawlProfile */
39
    protected $crawlProfile;
40
41
    /** @var int */
42
    protected $concurrency;
43
44
    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
45
    protected $crawlQueue;
46
47
    /** @var int */
48
    protected $crawledUrlCount = 0;
49
50
    /** @var int|null */
51
    protected $maximumCrawlCount = null;
52
53
    /** @var int */
54
    protected $maximumResponseSize = 1024 * 1024 * 2;
55
56
    /** @var int|null */
57
    protected $maximumDepth = null;
58
59
    /** @var bool */
60
    protected $ignoreRobots = false;
61
62
    /** @var \Tree\Node\Node */
63
    protected $depthTree;
64
65
    /** @var bool */
66
    protected $executeJavaScript = false;
67
68
    /** @var Browsershot */
69
    protected $browsershot = null;
70
71
    /** @var \Spatie\Robots\RobotsTxt */
72
    private $robotsTxt = null;
73
74
    protected static $defaultClientOptions = [
75
        RequestOptions::COOKIES => true,
76
        RequestOptions::CONNECT_TIMEOUT => 10,
77
        RequestOptions::TIMEOUT => 10,
78
        RequestOptions::ALLOW_REDIRECTS => false,
79
    ];
80
81
    /**
82
     * @param array $clientOptions
83
     *
84
     * @return static
85
     */
86
    public static function create(array $clientOptions = [])
87
    {
88
        $clientOptions = (count($clientOptions))
89
            ? $clientOptions
90
            : self::$defaultClientOptions;
91
92
        $client = new Client($clientOptions);
93
94
        return new static($client);
95
    }
96
97
    public function __construct(Client $client, int $concurrency = 10)
98
    {
99
        $this->client = $client;
100
101
        $this->concurrency = $concurrency;
102
103
        $this->crawlProfile = new CrawlAllUrls();
104
105
        $this->crawlQueue = new CollectionCrawlQueue();
106
    }
107
108
    /**
109
     * @param int $concurrency
110
     *
111
     * @return $this
112
     */
113
    public function setConcurrency(int $concurrency)
114
    {
115
        $this->concurrency = $concurrency;
116
117
        return $this;
118
    }
119
120
    /**
121
     * Responses that are larger that then specified value will be ignored.
122
     *
123
     * @param int $maximumResponseSizeInBytes
124
     *
125
     * @return $this
126
     */
127
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
128
    {
129
        $this->maximumResponseSize = $maximumResponseSizeInBytes;
130
131
        return $this;
132
    }
133
134
    /**
135
     * @param int $maximumCrawlCount
136
     *
137
     * @return $this
138
     */
139
    public function setMaximumCrawlCount(int $maximumCrawlCount)
140
    {
141
        $this->maximumCrawlCount = $maximumCrawlCount;
142
143
        return $this;
144
    }
145
146
    /**
147
     * @param int $maximumDepth
148
     *
149
     * @return $this
150
     */
151
    public function setMaximumDepth(int $maximumDepth)
152
    {
153
        $this->maximumDepth = $maximumDepth;
154
155
        return $this;
156
    }
157
158
    /**
159
     * @param bool $ignoreRobots
160
     *
161
     * @return $this
162
     */
163
    public function ignoreRobots(bool $ignoreRobots = true)
164
    {
165
        $this->ignoreRobots = $ignoreRobots;
166
167
        return $this;
168
    }
169
170
    /**
171
     * @param CrawlQueue $crawlQueue
172
     *
173
     * @return $this
174
     */
175
    public function setCrawlQueue(CrawlQueue $crawlQueue)
176
    {
177
        $this->crawlQueue = $crawlQueue;
178
179
        return $this;
180
    }
181
182
    /**
183
     * @return $this
184
     */
185
    public function executeJavaScript()
186
    {
187
        $this->executeJavaScript = true;
188
189
        return $this;
190
    }
191
192
    /**
193
     * @return $this
194
     */
195
    public function doNotExecuteJavaScript()
196
    {
197
        $this->executeJavaScript = false;
198
199
        return $this;
200
    }
201
202
    /**
203
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
[Issue] Documentation: The doc-type \Spatie\Crawler\CrawlObs...\Crawler\CrawlObserver] could not be parsed: Expected "]" at position 4, but found "\Spatie\Crawler\CrawlObserver". This check marks PHPDoc comments that could not be parsed by the analyser; its documentation on supported doc-types lists the comment annotations it can parse.
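A doc-type written in the conventional PHPDoc array notation (Type[]) would presumably satisfy the parser; the line below is a suggested rewrite, not something present in the pull request:

     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers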
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index),
                        $exception
[Issue] Unused code: The call to Crawler::handleCrawlFailed() has too many arguments, starting with $exception. This check compares calls to functions or methods with their definitions and raises an issue when a call passes more arguments than are defined. If a function is defined several times with different numbers of parameters, the check may pick up the wrong definition and report a false positive (WordPress is one codebase where this is known to happen); in that case, adding the @ignore PhpDoc annotation to the duplicate definition will suppress the report.
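Since Crawler::handleCrawlFailed() is defined with only two parameters (the exception and the crawl URL), dropping the duplicated third argument would presumably clear this issue; the call below is a suggested rewrite, not something present in the pull request:

                    $this->handleCrawlFailed($exception, $this->crawlQueue->getUrlById($index));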
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        return strrpos($haystack, $needle) + strlen($needle) ===
            strlen($haystack);
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface|null $response
     * @param CrawlUrl               $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if (! $this->ignoreRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
    }
}