Pull Request — master (#150) · by Brent · created 02:40 · Completed

LinkAdder::__construct()   A

Complexity:   Conditions 1 · Paths 1
Size:         Total Lines 4 · Code Lines 2
Duplication:  Lines 0 · Ratio 0 %
Importance:   Changes 0

Metric  Value
------  -----
c       0
b       0
f       0
dl      0
loc     4
rs      10
cc      1
eloc    2
nc      1
nop     1
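
A note on the raw metric table, inferred from this page's own numbers rather than stated by the report: loc and eloc line up with Total Lines (4) and Code Lines (2), cc with Conditions (1), nc with Paths (1), dl with the duplicated Lines (0), and nop is presumably the constructor's single parameter. The remaining abbreviations (c, b, f, rs) are not labeled anywhere on this page and are left uninterpreted.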
<?php

namespace Spatie\Crawler;

use Tree\Node\Node;
use GuzzleHttp\Psr7\Uri;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Link;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract links from the given HTML and add the crawlable ones
     * to the crawl queue.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                // Only http and https links are crawlable.
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                // Strip the fragment so equivalent URLs are deduplicated.
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) use ($foundOnUrl) {
                // Register the link in the depth tree, then apply the
                // robots.txt / maximum-depth rules.
                $node = $this->crawler->addToDepthTree($url, $foundOnUrl);

                return $this->shouldCrawl($node);
            })
            ->filter(function (UriInterface $url) {
                // Skip tel: links that were resolved as relative paths.
                return strpos($url->getPath(), '/tel:') === false;
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        // Render the page in a headless browser first when JavaScript
        // execution is enabled.
        if ($this->crawler->mayExecuteJavaScript()) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXPath('//a')->links())
            ->reject(function (Link $link) {
                // Honour rel="nofollow".
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Invalid URIs become null and are removed below.
                    return;
                }
            })
            ->filter();
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function shouldCrawl(Node $node): bool
    {
        // When robots.txt must be respected, it alone decides; otherwise
        // the configured maximum depth (if any) is enforced.
        if ($this->crawler->mustRespectRobots()) {
            return $this->crawler->getRobotsTxt()->allows($node->getValue());
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }
}
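
For orientation, here is a minimal sketch of how the checks above are typically switched on from the crawler side. LinkAdder is constructed by the package itself while a crawl runs, so this is illustrative only; the setter names are assumptions based on the package's documented API of this era, since the listing above only shows the corresponding getters (getMaximumDepth(), mustRespectRobots(), maximumCrawlCountReached(), mayExecuteJavaScript()).

<?php

use Spatie\Crawler\Crawler;

// Assumed configuration API: only the matching getters appear in the
// listing above, so treat these setter names as illustrative.
Crawler::create()
    ->setMaximumDepth(2)         // consulted by LinkAdder::shouldCrawl()
    ->setMaximumCrawlCount(100)  // consulted before queueing each link
    ->respectRobots()            // shouldCrawl() defers to robots.txt
    ->executeJavaScript()        // extractLinksFromHtml() renders via Browsershot
    ->startCrawling('https://example.com');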