Completed
Push — master ( b295dd...e532eb )
by Sebastian
04:13
created

Crawler::crawlAllLinks()   A

Complexity

Conditions 3
Paths 1

Size

Total Lines 22
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 22
rs 9.2
cc 3
eloc 14
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\RequestException;
7
use GuzzleHttp\RequestOptions;
8
use Spatie\Crawler\Exceptions\InvalidBaseUrl;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * Crawls a site starting from a base url.
 *
 * Every fetched url is reported to the configured CrawlObserver, and the
 * links found on pages that share the base url's host are crawled in turn,
 * as long as the configured CrawlProfile allows them.
 */
class Crawler
{
    /**
     * The HTTP client used to fetch every url.
     *
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * The url the crawl was started from.
     *
     * @var \Spatie\Crawler\Url
     */
    protected $baseUrl;

    /**
     * The urls that have been requested so far.
     *
     * @var \Illuminate\Support\Collection
     */
    protected $crawledUrls;

    /**
     * The observer that is notified before and after each request.
     *
     * @var \Spatie\Crawler\CrawlObserver
     */
    protected $crawlObserver;

    /**
     * The profile that decides whether a url should be crawled.
     *
     * @var \Spatie\Crawler\CrawlProfile
     */
    protected $crawlProfile;

    /**
     * Create a crawler with a default Guzzle client (redirects disabled,
     * cookies enabled).
     *
     * @return static
     */
    public static function create()
    {
        $client = new Client([
            RequestOptions::ALLOW_REDIRECTS => false,
            RequestOptions::COOKIES         => true,
        ]);

        return new static($client);
    }

    /**
     * @param \GuzzleHttp\Client $client
     */
    public function __construct(Client $client)
    {
        $this->client = $client;

        // Crawl every url unless a different profile is set.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawledUrls = collect();
    }

    /**
     * Set the crawl observer.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }

    /**
     * Set the crawl profile.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * Start the crawling process.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     *
     * @throws \Spatie\Crawler\Exceptions\InvalidBaseUrl When the base url is relative.
     * @throws \LogicException                           When no crawl observer has been set.
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        if ($baseUrl->isRelative()) {
            throw new InvalidBaseUrl();
        }

        // Fail with a clear message before any request is made, instead of
        // crashing on a method call on null further down.
        if (null === $this->crawlObserver) {
            throw new \LogicException(
                'No crawl observer has been set. Call setCrawlObserver() before startCrawling().'
            );
        }

        $this->baseUrl = $baseUrl;

        $this->crawlUrl($baseUrl);

        $this->crawlObserver->finishedCrawling();
    }

    /**
     * Crawl the given url.
     *
     * The url is skipped when the crawl profile rejects it or when it has
     * already been crawled. Links are only followed for urls on the same
     * host as the base url.
     *
     * @param \Spatie\Crawler\Url $url
     */
    protected function crawlUrl(Url $url)
    {
        if (! $this->crawlProfile->shouldCrawl($url)) {
            return;
        }

        if ($this->hasAlreadyCrawled($url)) {
            return;
        }

        $this->crawlObserver->willCrawl($url);

        try {
            $response = $this->client->request('GET', (string) $url);
        } catch (RequestException $exception) {
            // May be null when the request failed without any response.
            $response = $exception->getResponse();
        }

        $this->crawlObserver->hasBeenCrawled($url, $response);

        // Registered before following links so pages that link back to this
        // url do not trigger it again.
        $this->crawledUrls->push($url);

        if (! $response) {
            return;
        }

        if ($url->host !== $this->baseUrl->host) {
            return;
        }

        $body = $response->getBody();

        // The observer received the same response and may have read its body;
        // rewind so link discovery sees the whole document, not the remainder.
        if ($body->isSeekable()) {
            $body->rewind();
        }

        $this->crawlAllLinks($body->getContents());
    }

    /**
     * Crawl all links in the given html.
     *
     * Mail, tel and javascript links are dropped, the remaining urls are
     * normalized against the base url and crawled if the profile allows it.
     *
     * @param string $html
     */
    protected function crawlAllLinks($html)
    {
        $allLinks = $this->getAllLinks($html);

        collect($allLinks)
            ->reject(function (Url $url) {
                return (
                    $url->isEmailUrl() ||
                    $url->isTelUrl() ||
                    $url->isJavascript()
                );
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->each(function (Url $url) {
                $this->crawlUrl($url);
            });
    }

    /**
     * Get all links in the given html.
     *
     * Collects the href attribute of every <a> element and wraps each one
     * in a Url.
     *
     * @param string $html
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url instances.
     */
    protected function getAllLinks($html)
    {
        $domCrawler = new DomCrawler($html);

        return collect($domCrawler->filterXpath('//a')
            ->extract(['href']))
            ->map(function ($url) {
                return Url::create($url);
            });
    }

    /**
     * Determine if the crawler has already crawled the given url.
     *
     * Urls are compared by their string representation.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return bool
     */
    protected function hasAlreadyCrawled(Url $url)
    {
        // Cast once instead of on every iteration.
        $needle = (string) $url;

        foreach ($this->crawledUrls as $crawledUrl) {
            if ((string) $crawledUrl === $needle) {
                return true;
            }
        }

        return false;
    }

    /**
     * Normalize the given url.
     *
     * Relative urls get the scheme, host and port of the base url, protocol
     * independent urls get its scheme, and the fragment is removed.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url The result of removeFragment() on the given url.
     */
    protected function normalizeUrl(Url $url)
    {
        if ($url->isRelative()) {
            $url->setScheme($this->baseUrl->scheme)
                ->setHost($this->baseUrl->host)
                ->setPort($this->baseUrl->port);
        }

        if ($url->isProtocolIndependent()) {
            $url->setScheme($this->baseUrl->scheme);
        }

        return $url->removeFragment();
    }
}
237