LinkAdder::__construct() - Code Metrics - Inspection of "Code refactor" - spatie/crawler - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#150)

by Brent

created 2018-05-17 07:27 UTC

LinkAdder::__construct() A

↳ Parent: LinkAdder

Complexity

Conditions	1
Paths	1

Size

Total Lines	4
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
c	0
b	0
f	0
dl	0
loc	4
rs	10
cc	1
eloc	2
nc	1
nop	1

<?php

namespace Spatie\Crawler;

use Tree\Node\Node;
use GuzzleHttp\Psr7\Uri;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Link;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

/**
 * Extracts the links found in a crawled page and queues the ones that are
 * eligible for crawling on the owning crawler.
 */
class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /**
     * @param \Spatie\Crawler\Crawler $crawler Crawler whose depth tree and crawl queue are fed.
     */
    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract every link from the given HTML and add the crawlable ones to the crawl queue.
     *
     * A link is queued only when it uses the http or https scheme, passes
     * shouldCrawl() and does not contain "/tel:" in its path. Fragments are
     * stripped before the URL is registered in the depth tree.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page the HTML belongs to.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) use ($foundOnUrl) {
                // Registering the URL in the depth tree yields the node that
                // shouldCrawl() evaluates (robots rules and depth limit).
                $node = $this->crawler->addToDepthTree($url, $foundOnUrl);

                return $this->shouldCrawl($node);
            })
            ->filter(function (UriInterface $url) {
                // Presumably guards against relative "tel:" hrefs that were
                // resolved into an http(s) path — confirm against upstream history.
                return strpos($url->getPath(), '/tel:') === false;
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect the URIs of all anchor elements in the page, skipping nofollow links.
     *
     * When the crawler may execute JavaScript, the supplied HTML is replaced by
     * the body rendered for $foundOnUrl. Links whose URI cannot be parsed
     * (InvalidArgumentException) are dropped.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        if ($this->crawler->mayExecuteJavaScript()) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $this->hasNofollowRel($link);
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Unparsable URI: yield null so the trailing filter() drops it.
                    return;
                }
            })
            ->filter();
    }

    /**
     * Tell whether the link's rel attribute contains the "nofollow" token.
     *
     * The rel attribute is a whitespace-separated, case-insensitive token
     * list, so values such as "nofollow noopener" or "NoFollow" must match too.
     */
    protected function hasNofollowRel(Link $link): bool
    {
        $rel = strtolower((string) $link->getNode()->getAttribute('rel'));

        $tokens = preg_split('/\s+/', $rel, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return in_array('nofollow', $tokens, true);
    }

    /**
     * Only http and https URLs can be crawled.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }

    /**
     * Strip the fragment so URLs that differ only by fragment compare as the same URL.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Decide whether the URL held by the given depth-tree node may be crawled.
     *
     * robots.txt acts as a veto only: a URL it allows must still respect the
     * configured maximum depth (a null maximum depth means "no limit").
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->crawler->mustRespectRobots()
            && ! $this->crawler->getRobotsTxt()->allows($node->getValue())
        ) {
            return false;
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }

    /**
     * Fetch the body HTML for the URL through the crawler's Browsershot
     * instance and return it with HTML entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }
}


1			<?php
2
3			namespace Spatie\Crawler;
4
5			use Tree\Node\Node;
6			use GuzzleHttp\Psr7\Uri;
7			use InvalidArgumentException;
8			use Psr\Http\Message\UriInterface;
9			use Symfony\Component\DomCrawler\Link;
10			use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
12			class LinkAdder
13			{
14			/** @var \Spatie\Crawler\Crawler */
15			protected $crawler;
16
17			public function __construct(Crawler $crawler)
18			{
19			$this->crawler = $crawler;
20			}
21
22			public function addFromHtml(string $html, UriInterface $foundOnUrl)
23			{
24			$allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
25
26			collect($allLinks)
27			->filter(function (UriInterface $url) {
28			return $this->hasCrawlableScheme($url);
29			})
30			->map(function (UriInterface $url) {
31			return $this->normalizeUrl($url);
32			})
33			->filter(function (UriInterface $url) use ($foundOnUrl) {
34			$node = $this->crawler->addToDepthTree($url, $foundOnUrl);
35
36			return $this->shouldCrawl($node);
37			})
38			->filter(function (UriInterface $url) {
39			return strpos($url->getPath(), '/tel:') === false;
40			})
41			->each(function (UriInterface $url) use ($foundOnUrl) {
42			if ($this->crawler->maximumCrawlCountReached()) {
43			return;
44			}
45
46			$crawlUrl = CrawlUrl::create($url, $foundOnUrl);
47
48			$this->crawler->addToCrawlQueue($crawlUrl);
49			});
50			}
51
52			/**
53			* @param string $html
54			* @param \Psr\Http\Message\UriInterface $foundOnUrl
55			*
56			* @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
57			*/
58			protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
59			{
60			if ($this->crawler->mayExecuteJavaScript()) {
61			$html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
62			}
63
64			$domCrawler = new DomCrawler($html, $foundOnUrl);
65
66			return collect($domCrawler->filterXpath('//a')->links())
67			->reject(function (Link $link) {
68			return $link->getNode()->getAttribute('rel') === 'nofollow';
69			})
70			->map(function (Link $link) {
71			try {
72			return new Uri($link->getUri());
73			} catch (InvalidArgumentException $exception) {
74			return;
75			}
76			})
77			->filter();
78			}
79
80			protected function hasCrawlableScheme(UriInterface $uri): bool
81			{
82			return in_array($uri->getScheme(), ['http', 'https']);
83			}
84
85			protected function normalizeUrl(UriInterface $url): UriInterface
86			{
87			return $url->withFragment('');
88			}
89
90			protected function shouldCrawl(Node $node): bool
91			{
92			if ($this->crawler->mustRespectRobots()) {
93			return $this->crawler->getRobotsTxt()->allows($node->getValue());
94			}
95
96			$maximumDepth = $this->crawler->getMaximumDepth();
97
98			if (is_null($maximumDepth)) {
99			return true;
100			}
101
102			return $node->getDepth() <= $maximumDepth;
103			}
104
105			protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
106			{
107			$browsershot = $this->crawler->getBrowsershot();
108
109			$html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();
110
111			return html_entity_decode($html);
112			}
113			}
114

spatie / crawler

Pull Request — master (#150)

LinkAdder::__construct() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like