Completed
Pull Request — master (#166)
by Brent
03:57
created

LinkAdder::getBodyAfterExecutingJavaScript()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 8
rs 9.4285
cc 1
eloc 4
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Tree\Node\Node;
6
use GuzzleHttp\Psr7\Uri;
7
use InvalidArgumentException;
8
use Psr\Http\Message\UriInterface;
9
use Symfony\Component\DomCrawler\Link;
10
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
12
class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract every crawlable link from the given HTML and add it to the
     * crawler's queue, honoring scheme, robots.txt, depth, and crawl-count limits.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) use ($foundOnUrl) {
                // Register the url in the depth tree first; shouldCrawl() needs
                // the resulting node to check robots.txt and depth limits.
                $node = $this->crawler->addToDepthTree($url, $foundOnUrl);

                return $this->shouldCrawl($node);
            })
            ->filter(function (UriInterface $url) {
                // A relative "tel:..." href resolves against the page url into a
                // path like "/tel:123"; such pseudo-links must not be crawled.
                return strpos($url->getPath(), '/tel:') === false;
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Parse the HTML and return the urls of all <a> tags, resolved against
     * $foundOnUrl, excluding rel="nofollow" links and unparseable hrefs.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        // Note: correct API casing is filterXPath(); the lowercase variant only
        // worked because PHP method names are case-insensitive.
        return collect($domCrawler->filterXPath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Invalid hrefs map to null and are dropped by ->filter() below.
                    return;
                }
            })
            ->filter();
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison: UriInterface::getScheme() always returns a string,
        // and non-strict in_array() is a classic PHP type-juggling footgun.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        // Strip the fragment so urls differing only by "#..." count as one url.
        return $url->withFragment('');
    }

    /**
     * Decide whether the url represented by $node may be crawled.
     *
     * When robots.txt must be respected it is the sole authority; otherwise
     * the configured maximum depth (if any) decides.
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->crawler->mustRespectRobots()) {
            return $this->crawler->getRobotsTxt()->allows($node->getValue());
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }
}
101