Completed
Pull Request — master (#145)
Created by Freek at 01:32

Crawler::mayIndex() (rated A)

Complexity

    Conditions 4
    Paths      4

Size

    Total Lines 16
    Code Lines  8

Duplication

    Lines 16
    Ratio 100 %

Importance

    Changes 0

    Metric  Value
    c       0
    b       0
    f       0
    dl      16
    loc     16
    rs      9.2
    cc      4
    eloc    8
    nc      4
    nop     2
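
For context, a minimal usage sketch of the class under review; MyObserver is a hypothetical implementation of Spatie\Crawler\CrawlObserver and not part of this pull request:

    Crawler::create()
        ->setCrawlObserver(new MyObserver()) // hypothetical observer
        ->setConcurrency(5)
        ->setMaximumDepth(2)
        ->respectRobots()
        ->startCrawling('https://example.com');

The full file under review follows.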
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return $this
     */
    public function ignoreRobots()
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots()
    {
        $this->respectRobots = true;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

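    /*
     * The crawl loop: startCrawlingQueue() drives a Guzzle Pool that keeps up
     * to $concurrency requests in flight. Each fulfilled response is checked
     * against robots headers/meta directives before it is handed to the
     * observers and before its links are queued; failed requests are reported
     * through handleCrawlFailed().
     */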
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when $needle is absent, which made the
        // previous arithmetic comparison unreliable; compare the suffix directly.
        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

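    /*
     * getCrawlRequests() is a generator: the Pool in startCrawlingQueue()
     * consumes it lazily, so URLs added to the crawl queue while responses
     * are being handled are still yielded during the same crawl.
     */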
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots) {
            return $this->robotsTxt->allows($node->getValue());
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    // Flagged by the review as duplicated: mayIndex() and mayFollow() are
    // structurally identical. See the extraction sketch after the listing.
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // When robots are ignored, everything may be indexed.
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayIndex()) {
            return false;
        }

        if (! $robotsMeta->mayIndex()) {
            return false;
        }

        return true;
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // When robots are ignored, every link may be followed.
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayFollow()) {
            return false;
        }

        if (! $robotsMeta->mayFollow()) {
            return false;
        }

        return true;
    }
}
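
The report flags mayIndex() with a duplication ratio of 100 %: it and mayFollow() differ only in which robots check they call. A minimal sketch of one way to extract the shared structure; the helper name satisfiesRobots() is hypothetical and not part of the package:

    protected function satisfiesRobots(callable $check): bool
    {
        // Robots directives only apply while the crawler respects robots.
        if (! $this->respectRobots) {
            return true;
        }

        return $check();
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->satisfiesRobots(function () use ($robotsHeaders, $robotsMeta) {
            return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
        });
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->satisfiesRobots(function () use ($robotsHeaders, $robotsMeta) {
            return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
        });
    }

This preserves the behavior of both methods while removing the duplicated lines the report measures.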