Pull Request — master (#150) · by Brent · created 02:10 · Completed

LinkAdder::hasCrawlableScheme() — rating: A

Complexity     Conditions: 1    Paths: 1
Size           Total Lines: 4   Code Lines: 2
Duplication    Lines: 0         Ratio: 0 %
Importance     Changes: 0

Metric   Value
c        0
b        0
f        0
dl       0
loc      4
rs       10
cc       1
eloc     2
nc       1
nop      1

The raw metrics line up with the summary: loc 4 matches Total Lines, eloc 2 matches Code Lines, cc 1 (cyclomatic complexity) matches the single condition and path, and nop 1 reflects the method's one parameter.
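To make the grade concrete: cyclomatic complexity counts decision points plus one, and this method has none, hence cc 1. Below is a hedged, standalone illustration (hypothetical code, not part of this pull request) of how a single extra guard would move the numbers to Conditions 2 / Paths 2 / cc 2:

<?php
// Hypothetical standalone variant, for illustration only — not in this PR.
// The added null guard introduces one extra decision point, so a metrics
// tool would report Conditions 2 / Paths 2 / cc 2 instead of the 1s above.
function hasCrawlableScheme(?string $scheme): bool
{
    if ($scheme === null) { // +1 condition, +1 path
        return false;
    }

    return in_array($scheme, ['http', 'https'], true);
}

var_dump(hasCrawlableScheme('https')); // bool(true)
var_dump(hasCrawlableScheme('ftp'));   // bool(false)
var_dump(hasCrawlableScheme(null));    // bool(false)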
Source listing:

<?php

namespace Spatie\Crawler;

use Tree\Node\Node;
use GuzzleHttp\Psr7\Uri;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Link;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

// Extracts links from a fetched page and feeds the crawlable ones into the crawl queue.
class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->crawler->addToDepthTree($url, $foundOnUrl);

                // Skip tel: hrefs that were resolved as relative paths (e.g. "/tel:+123").
                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        if ($this->crawler->mayExecuteJavaScript()) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXPath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Unparseable hrefs become null and are dropped by ->filter() below.
                    return;
                }
            })
            ->filter();
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        // Drop the fragment so URLs differing only by #anchor count as one page.
        return $url->withFragment('');
    }

    protected function shouldCrawl(Node $node): bool
    {
        // When robots.txt must be respected, it alone decides; otherwise fall back to the depth limit.
        if ($this->crawler->mustRespectRobots()) {
            return $this->crawler->getRobotsTxt()->allows($node->getValue());
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        // Render the page with Browsershot (headless Chrome), then decode HTML entities.
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }
}
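For context, a minimal usage sketch, assuming spatie/crawler's public Crawler::create() factory (LinkAdder is normally wired up internally by the Crawler; constructing it directly, as below, is for illustration only):

<?php
// Minimal sketch, assuming Crawler::create() and Guzzle's PSR-7 Uri.
// LinkAdder is an internal collaborator of the Crawler; driving it by hand
// like this is only meant to show the flow of addFromHtml().
use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\LinkAdder;

$crawler = Crawler::create();
$linkAdder = new LinkAdder($crawler);

// Hand over a page body plus the URL it was fetched from. LinkAdder extracts
// every <a>, rejects rel="nofollow" and non-http(s) links, strips fragments,
// and queues whatever passes the depth, robots, and crawl-count checks.
$html = '<a href="https://example.com/next">next</a>'
      . '<a href="mailto:hi@example.com">mail</a>';

$linkAdder->addFromHtml($html, new Uri('https://example.com/'));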