Pull Request — master (#145)
Created by Brent · 01:54 · Completed

Crawler::mayIndex()   Grade: A

Complexity
    Conditions: 3
    Paths: 3

Size
    Total Lines: 8
    Code Lines: 4

Duplication
    Duplicated Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
c         0
b         0
f         0
dl        0
loc       8
rs        9.4285
cc        3
eloc      4
nc        3
nop       2
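
For reference, the grade above describes the mayIndex() method reviewed near the end of this file. Assuming the analyzer counts the if guard plus each operand of the && as a condition, the three conditions and three paths line up with the method body like so:

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;            // path 1: robots rules are ignored outright
        }

        // path 2: the headers forbid indexing and the && short-circuits to false
        // path 3: the headers allow indexing, so the meta tag decides
        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }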
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers = [];

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $ignoreRobots = false;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @param bool $ignoreRobots
     *
     * @return $this
     */
    public function ignoreRobots(bool $ignoreRobots = true)
    {
        $this->ignoreRobots = $ignoreRobots;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
Documentation issue introduced by this line:
The doc-type \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] could not be parsed: Expected "]" at position 4, but found "\Spatie\Crawler\CrawlObserver". This check marks PHPDoc comments that could not be parsed by the analyzer's parser.
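A doc-type the parser can handle expresses "one observer or an array of observers" with PHPDoc's postfix array notation rather than array[...]; a corrected line would read:

     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers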
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    // Drains the crawl queue with a Guzzle request pool until no pending URLs remain.
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index),
                        $exception
Unused-code issue introduced by this line:
The call to Crawler::handleCrawlFailed() has too many arguments, starting with the duplicated $exception. This check compares calls to functions or methods with their respective definitions; if a call has more arguments than are defined, it raises an issue. If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress. In that case you can add the @ignore PHPDoc annotation to the duplicate definition and it will be ignored.
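handleCrawlFailed() is defined further down with exactly two parameters, the exception and the failing CrawlUrl, so the fix suggested by this check is simply to drop the duplicated third argument:

                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },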
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        // True when $haystack ends with $needle.
        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    // Lazily yields a GET request per pending URL, keyed by its queue id.
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if (! $this->ignoreRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    // Recursively finds the node holding $parentUrl and attaches $url beneath it.
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
    }
}
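
For context, a minimal usage sketch of the fluent API defined above; MyCrawlObserver and the target URL are hypothetical stand-ins:

    use Spatie\Crawler\Crawler;

    Crawler::create()
        ->setConcurrency(5)                          // crawl up to 5 URLs in parallel
        ->setMaximumDepth(3)                         // do not follow links deeper than 3 levels
        ->setMaximumResponseSize(1024 * 1024 * 3)    // skip response bodies larger than ~3 MB
        ->setCrawlObservers([new MyCrawlObserver()]) // hypothetical CrawlObserver implementation
        ->startCrawling('https://example.com');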