Completed
Push — master (016fca...0d1e60) by Brent
Duration: 03:20 (queued 01:42)

Crawler::mayFollow() (rating: A)

Complexity
    Conditions: 4
    Paths: 4

Size
    Total Lines: 16
    Code Lines: 8

Duplication
    Lines: 16
    Ratio: 100 %

Importance
    Changes: 0
Metric   Value
c        0
b        0
f        0
dl       16
loc      16
rs       9.2
cc       4
eloc     8
nc       4
nop      2

The abbreviated metrics line up with the labeled figures above: loc = total lines (16), eloc = code lines (8), dl = duplicated lines (16), cc = cyclomatic complexity (4), nc = number of paths (4), and nop = number of parameters (2, matching mayFollow()'s two arguments). The values of 4 for cc and nc follow directly from the method body shown below: three guard clauses plus one fall-through give a cyclomatic complexity of 3 + 1 = 4 and four distinct execution paths.
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }
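
Worth noting in create(): the defaults are used only when $clientOptions is completely empty. Supplying any option replaces the whole default set rather than merging with it, so a usage sketch under that assumption looks like this:

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;

// Any custom options bypass $defaultClientOptions entirely,
// so the timeouts have to be restated here.
$crawler = Crawler::create([
    RequestOptions::TIMEOUT => 30,
    RequestOptions::CONNECT_TIMEOUT => 10,
    RequestOptions::ALLOW_REDIRECTS => false,
]);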
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return $this
     */
    public function ignoreRobots()
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots()
    {
        $this->respectRobots = true;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
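
The observer hooks invoked throughout this class (willCrawl(), crawled(), crawlFailed(), finishedCrawling()) imply an observer along the following lines. The base CrawlObserver signatures are inferred from the call sites in this file, so treat this as a sketch:

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver;

// Minimal observer that logs crawl progress to stdout.
class LoggingCrawlObserver extends CrawlObserver
{
    public function willCrawl(UriInterface $url)
    {
        echo "Crawling: {$url}\n";
    }

    public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null)
    {
        echo "Crawled {$url} with status {$response->getStatusCode()}\n";
    }

    public function crawlFailed(UriInterface $url, RequestException $exception, ?UriInterface $foundOnUrl = null)
    {
        echo "Failed {$url}: {$exception->getMessage()}\n";
    }

    public function finishedCrawling()
    {
        echo "Crawl finished.\n";
    }
}

// $crawler->addCrawlObserver(new LoggingCrawlObserver());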
    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
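
Because startCrawling() fills in a missing scheme and path before queueing, string and UriInterface inputs converge on the same normalized base URL, and the crawl only begins if robots.txt allows that URL. For example:

use GuzzleHttp\Psr7\Uri;

// Both are normalized to http://example.com/ before the robots.txt check:
$crawler->startCrawling('http://example.com');      // empty path becomes '/'
$crawler->startCrawling(new Uri('//example.com/')); // empty scheme becomes 'http'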
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }
    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when the needle is absent, and arithmetic
        // would coerce that to 0; comparing the tail directly avoids the
        // resulting false positives.
        return $needle === ''
            || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }
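
A caveat on convertBodyToString(): bodies over the limit are truncated to the first $readMaximumBytes, not skipped, even though setMaximumResponseSize() speaks of responses being "ignored". Raising the 2 MB default is a one-liner:

// Parse links from up to 10 MB of each response body.
$crawler->setMaximumResponseSize(1024 * 1024 * 10);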
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
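
getCrawlRequests() yields each pending request keyed by the CrawlUrl id, and Guzzle's Pool hands that key back as $index to the fulfilled and rejected callbacks; that is how the crawler matches a response to the URL it requested. A minimal standalone sketch of the same pattern (client and URL hypothetical):

$client = new \GuzzleHttp\Client();

$requests = function () {
    // The yield key becomes $index in the pool callbacks.
    foreach (['id-1' => 'https://example.com/'] as $id => $url) {
        yield $id => new \GuzzleHttp\Psr7\Request('GET', $url);
    }
};

$pool = new \GuzzleHttp\Pool($client, $requests(), [
    'concurrency' => 10,
    'fulfilled' => function ($response, $index) {
        // $index is 'id-1' here, so the response maps back to its URL.
    },
    'rejected' => function ($reason, $index) {
        // The same key arrives on failure.
    },
]);

$pool->promise()->wait();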
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots) {
            return $this->robotsTxt->allows($node->getValue());
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }
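
A subtlety in shouldCrawl(): when $respectRobots is true, the method returns on the robots.txt verdict alone, so $maximumDepth is only enforced while robots are being ignored. If both constraints were meant to apply together, a combined guard would look roughly like this (a sketch, not the package's current behavior):

    protected function shouldCrawl(Node $node): bool
    {
        // Hypothetical variant: apply the robots.txt check AND the depth limit.
        if ($this->respectRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }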
    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
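
addToDepthTree() does a depth-first search for the node whose value matches the page the link was found on and attaches the new URL beneath it; setMaximumDepth() is then enforced against Node::getDepth(). Illustrated with hypothetical URLs:

// Starting from $depthTree = new Node('http://example.com/'):
//
//   http://example.com/            depth 0 (root)
//   └── http://example.com/a       depth 1, found on the root page
//       └── http://example.com/b   depth 2, found on /a
//
// With setMaximumDepth(1) and robots ignored, /b fails
// getDepth() <= 1 in shouldCrawl() and is never queued.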
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
    // Inspection: flagged as duplicated code; mayFollow() below mirrors this body.
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayIndex()) {
            return false;
        }

        if (! $robotsMeta->mayIndex()) {
            return false;
        }

        return true;
    }

    // Inspection: flagged as duplicated code (the 100 % duplication ratio in the metrics above).
    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayFollow()) {
            return false;
        }

        if (! $robotsMeta->mayFollow()) {
            return false;
        }

        return true;
    }
}
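
The duplication flagged above comes from mayFollow() mirroring mayIndex() line for line. One possible extraction, sketched here rather than taken from the package, collapses both into a single dynamic check (the helper name is hypothetical):

    protected function passesRobotChecks(string $check, RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        // $check is 'mayIndex' or 'mayFollow'; both methods exist on
        // RobotsHeaders and RobotsMeta, as the original bodies show.
        return $robotsHeaders->{$check}() && $robotsMeta->{$check}();
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesRobotChecks('mayIndex', $robotsHeaders, $robotsMeta);
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesRobotChecks('mayFollow', $robotsHeaders, $robotsMeta);
    }

Whether the indirection is worth it for two small methods is a judgment call; the check's own guidance only insists on consolidation once the same code appears in three or more places.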