LinkAdder::extractLinksFromHtml()   A
last analyzed

Complexity

Conditions 2
Paths 1

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 1
nop 2
dl 0
loc 17
rs 9.7
c 0
b 0
f 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use GuzzleHttp\Psr7\Uri;
6
use InvalidArgumentException;
7
use Psr\Http\Message\UriInterface;
8
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
9
use Symfony\Component\DomCrawler\Link;
10
use Tree\Node\Node;
11
12
class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract all links from the given HTML and add the crawlable ones
     * to the crawler's queue.
     *
     * @param string $html Raw HTML body of the page that was just crawled.
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL the HTML was fetched from
     *                                                   (used to resolve relative links).
     *
     * @return void
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) use ($foundOnUrl) {
                // Only keep URLs that fit into the depth tree and pass the
                // robots.txt / maximum-depth checks in shouldCrawl().
                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
                    return false;
                }

                return $this->shouldCrawl($node);
            })
            ->filter(function (UriInterface $url) {
                // Skip `tel:` pseudo-links that ended up in the path component.
                return strpos($url->getPath(), '/tel:') === false;
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect every <a> tag (plus rel="next"/rel="prev" <link> tags) from the
     * HTML as absolute URIs, dropping rel="nofollow" links and URLs that
     * Guzzle's Uri cannot parse.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Base URL used by DomCrawler
     *                                                   to resolve relative links.
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links())
            ->reject(function (Link $link) {
                // NOTE(review): this only matches rel="nofollow" exactly; a
                // multi-valued attribute such as rel="nofollow noopener" is NOT
                // rejected — confirm whether that is intended.
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Malformed URL: map to null so the trailing filter() drops it.
                    return;
                }
            })
            ->filter();
    }

    /**
     * Only http(s) URLs are worth queueing.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison avoids loose in_array() type juggling.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }

    /**
     * Strip the fragment so `page#a` and `page#b` are treated as the same URL.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Decide whether a node in the depth tree should actually be crawled,
     * based on robots.txt rules (when respected) and the configured
     * maximum depth (null means unlimited).
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->crawler->mustRespectRobots() && ! $this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) {
            return false;
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }
}