Completed
Push — master ( e33465...b295dd )
by Sebastian
10s
created

Crawler::crawlAllLinks()   B

Complexity

Conditions 1
Paths 1

Size

Total Lines 24
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 24
rs 8.9713
c 0
b 0
f 0
cc 1
eloc 15
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\RequestException;
7
use GuzzleHttp\RequestOptions;
8
use Spatie\Crawler\Exceptions\InvalidBaseUrl;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
class Crawler
{
    /**
     * HTTP client used to request every url.
     *
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * The url the crawl was started from; set by startCrawling().
     *
     * @var \Spatie\Crawler\Url
     */
    protected $baseUrl;

    /**
     * Every url that has been requested so far.
     *
     * @var \Illuminate\Support\Collection
     */
    protected $crawledUrls;

    /**
     * Optional observer that is notified about crawl events.
     * Stays null until setCrawlObserver() is called.
     *
     * @var \Spatie\Crawler\CrawlObserver|null
     */
    protected $crawlObserver;

    /**
     * Profile that decides which urls should be crawled.
     *
     * @var \Spatie\Crawler\CrawlProfile
     */
    protected $crawlProfile;

    /**
     * Create a crawler backed by a Guzzle client that has redirects
     * disabled and cookies enabled.
     *
     * @return static
     */
    public static function create()
    {
        $client = new Client([
            RequestOptions::ALLOW_REDIRECTS => false,
            RequestOptions::COOKIES         => true,
        ]);

        return new static($client);
    }

    /**
     * Store the client, default the crawl profile to CrawlAllUrls and
     * start with an empty collection of crawled urls.
     *
     * @param \GuzzleHttp\Client $client
     */
    public function __construct(Client $client)
    {
        $this->client = $client;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawledUrls = collect();
    }

    /**
     * Set the crawl observer.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }

    /**
     * Set the crawl profile.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * Start the crawling process.
     *
     * Anything that is not already a Url instance is converted with
     * Url::create(). The base url is crawled first; once crawling has
     * completed the observer (when one is set) is told so.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     *
     * @throws \Spatie\Crawler\Exceptions\InvalidBaseUrl When the base url is relative.
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        if ($baseUrl->isRelative()) {
            throw new InvalidBaseUrl();
        }

        $this->baseUrl = $baseUrl;

        $this->crawlUrl($baseUrl);

        // The observer is optional: crawling without one must not fail.
        if ($this->crawlObserver !== null) {
            $this->crawlObserver->finishedCrawling();
        }
    }

    /**
     * Crawl the given url.
     *
     * Urls rejected by the crawl profile and urls that were crawled before
     * are skipped. Links are only followed for responses whose url has the
     * same host as the base url.
     *
     * @param \Spatie\Crawler\Url $url
     */
    protected function crawlUrl(Url $url)
    {
        if (! $this->crawlProfile->shouldCrawl($url)) {
            return;
        }

        if ($this->hasAlreadyCrawled($url)) {
            return;
        }

        if ($this->crawlObserver !== null) {
            $this->crawlObserver->willCrawl($url);
        }

        try {
            $response = $this->client->request('GET', (string) $url);
        } catch (RequestException $exception) {
            // The exception may carry no response; that case is handled below.
            $response = $exception->getResponse();
        }

        if ($this->crawlObserver !== null) {
            $this->crawlObserver->hasBeenCrawled($url, $response);
        }

        $this->crawledUrls->push($url);

        if (! $response) {
            return;
        }

        if ($url->host === $this->baseUrl->host) {
            // Cast the stream instead of calling getContents(): the observer
            // received the response first and may already have read the body,
            // in which case getContents() would only return what is left.
            $this->crawlAllLinks((string) $response->getBody());
        }
    }

    /**
     * Crawl all links in the given html.
     *
     * Links flagged by isEmailUrl(), isTelUrl() or isJavascript() are
     * dropped, the remaining ones are normalized, checked against the crawl
     * profile and then crawled one by one.
     *
     * @param string $html
     */
    protected function crawlAllLinks($html)
    {
        // collect() is kept so an overridden getAllLinks() that returns a
        // plain array still works.
        collect($this->getAllLinks($html))
            ->filter(function (Url $url) {
                return ! ($url->isEmailUrl() || $url->isTelUrl() || $url->isJavascript());
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->each(function (Url $url) {
                $this->crawlUrl($url);
            });
    }

    /**
     * Get all links in the given html.
     *
     * Extracts the href attribute of every <a> element and wraps each value
     * with Url::create().
     *
     * @param string $html
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url
     */
    protected function getAllLinks($html)
    {
        $domCrawler = new DomCrawler($html);

        $hrefs = $domCrawler->filterXpath('//a')->extract(['href']);

        return collect($hrefs)
            ->map(function ($url) {
                return Url::create($url);
            });
    }

    /**
     * Determine if the crawler has already crawled the given url.
     *
     * Urls are compared by their string representation.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return bool
     */
    protected function hasAlreadyCrawled(Url $url)
    {
        // Convert once instead of on every iteration.
        $needle = (string) $url;

        foreach ($this->crawledUrls as $crawledUrl) {
            if ((string) $crawledUrl === $needle) {
                return true;
            }
        }

        return false;
    }

    /**
     * Normalize the given url.
     *
     * Relative urls receive the scheme, host and port of the base url,
     * protocol independent urls receive the scheme of the base url, and the
     * fragment is removed in every case.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url
     */
    protected function normalizeUrl(Url $url)
    {
        if ($url->isRelative()) {
            $url->setScheme($this->baseUrl->scheme)
                ->setHost($this->baseUrl->host)
                ->setPort($this->baseUrl->port);
        }

        if ($url->isProtocolIndependent()) {
            $url->setScheme($this->baseUrl->scheme);
        }

        return $url->removeFragment();
    }
}
239