Completed
Pull Request — master (#65)
by
unknown
03:05
created

Crawler::addAllLinksToCrawlQueue()   B

Complexity

Conditions 3
Paths 1

Size

Total Lines 30
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 30
rs 8.8571
cc 3
eloc 17
nc 1
nop 2
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Symfony\Component\DomCrawler\Link;
12
use Psr\Http\Message\ResponseInterface;
13
use GuzzleHttp\Exception\RequestException;
14
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
15
use Tree\Node\Node;
16
use Tree\Visitor\PreOrderVisitor;
17
18
class Crawler
19
{
20
    /** @var \GuzzleHttp\Client HTTP client handed to the request pool; its config is reused as the pool's request options. */
21
    protected $client;
22
23
    /** @var \Spatie\Crawler\Url URL the crawl started from; only responses from its host have their links followed. */
24
    protected $baseUrl;
25
26
    /** @var \Spatie\Crawler\CrawlObserver Receives the willCrawl, hasBeenCrawled and finishedCrawling notifications. */
27
    protected $crawlObserver;
28
29
    /** @var \Spatie\Crawler\CrawlProfile Decides through shouldCrawl() whether a URL is requested; defaults to CrawlAllUrls. */
30
    protected $crawlProfile;
31
32
    /** @var int Value passed as the request pool's 'concurrency' option. */
33
    protected $concurrency;
34
35
    /** @var \Spatie\Crawler\CrawlQueue Queue of pending and processed crawl URLs. */
36
    protected $crawlQueue;
37
38
    /** @var int Maximum link-tree depth at which links are still queued; 0 disables the limit. */
39
    protected $depth = 0;
40
41
    /** @var \Tree\Node\Node Root of the tree of discovered links; created from the base URL when crawling starts. */
42
    protected $linkTree;
43
44
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * @param array $clientOptions Options for the Guzzle client. When empty, a
     *                             default set is used instead: cookies enabled,
     *                             connect timeout and timeout set to 10, and
     *                             redirects not followed.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        // Caller-supplied options replace the defaults entirely; they are not merged.
        if ($clientOptions === []) {
            $clientOptions = [
                RequestOptions::COOKIES => true,
                RequestOptions::CONNECT_TIMEOUT => 10,
                RequestOptions::TIMEOUT => 10,
                RequestOptions::ALLOW_REDIRECTS => false,
            ];
        }

        return new static(new Client($clientOptions));
    }
61
62
    /**
     * Set up the crawler with an empty queue and a profile that crawls every URL.
     *
     * @param \GuzzleHttp\Client $client      Client used to send the crawl requests.
     * @param int                $concurrency Value for the request pool's 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        // Collaborators supplied by the caller.
        $this->concurrency = $concurrency;
        $this->client = $client;

        // Defaults; the profile can be replaced through setCrawlProfile().
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CrawlQueue();
    }
72
73
    /**
     * Change the value passed as the request pool's 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this Fluent interface.
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
84
85
	/**
	 * Limit how deep in the link tree discovered links are still queued.
	 *
	 * @param int $depth Maximum link-tree depth; 0 (the default) disables the limit.
	 *
	 * @return $this Fluent interface.
	 */
	public function setDepth(int $depth)
	{
		$this->depth = $depth;

		return $this;
	}
96
97
    /**
     * Register the observer that is notified about crawl progress.
     *
     * The observer is called unconditionally while crawling, so it has to be
     * set before startCrawling() is invoked.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this Fluent interface.
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
108
109
    /**
     * Replace the profile that decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile Profile whose shouldCrawl() is consulted for every URL.
     *
     * @return $this Fluent interface.
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
120
121
    /**
     * Crawl starting at the given URL and notify the observer once the queue is empty.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl Anything that is not already a Url is converted with Url::create().
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        // Seed the queue and the link tree with the starting URL.
        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));
        $this->linkTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
142
143
    /**
     * Work through the crawl queue until no pending URLs remain.
     *
     * Each pass sends the requests produced by getCrawlRequests() through a
     * Guzzle pool, reports every outcome to the observer and, for responses
     * served by the base URL's host, queues the links found in the body.
     * Processed URLs are removed from the pending list after each pass.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Pages on other hosts are reported but their links are not followed.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function ($reason, int $index) {
                    // Not every rejection reason is a RequestException (connection
                    // failures may be reported with a different exception class),
                    // so only ask for a response when the reason can carry one.
                    $response = $reason instanceof RequestException
                        ? $reason->getResponse()
                        : null;

                    $this->handleResponse($response, $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
174
175
    /**
     * Tell the observer that the pending URL at the given index has been crawled.
     *
     * @param ResponseInterface|null $response Response received, or null when none is available.
     * @param int $index Position of the URL in the crawl queue's pending list.
     */
    protected function handleResponse($response, int $index)
    {
        $pending = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $pending->url,
            $response,
            $pending->foundOnUrl
        );
    }
185
186
    /**
     * Lazily produce a GET request for every pending URL that should be crawled.
     *
     * Pending URLs rejected by the crawl profile or already marked as processed
     * are skipped. For the others the observer is notified and the URL is marked
     * as processed before its request is yielded.
     *
     * Each request is yielded under its index in the pending list. The pool
     * callbacks use the key they receive to look the URL up again with
     * getPendingUrlAtIndex(), so the key must stay aligned with the queue even
     * when earlier entries were skipped.
     *
     * @return \Generator Pending-list index => \GuzzleHttp\Psr7\Request
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
208
209
    /**
     * Queue the links found in an HTML document that qualify for crawling.
     *
     * A link is considered only when it has a crawlable scheme, is accepted by
     * the crawl profile after normalization and is not in the crawl queue yet.
     * Every such link is recorded in the link tree. It is then queued when no
     * depth limit is set (depth 0) or when its depth in the tree does not
     * exceed the limit.
     *
     * @param string $html Body of the page that was crawled.
     * @param \Spatie\Crawler\Url $foundOnUrl URL of the page the HTML came from.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $this->extractAllLinks($html, $foundOnUrl)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                $newNode = null;

                $this->addToLinkTree($this->linkTree, (string) $url, $foundOnUrl, $newNode);

                $unlimited = $this->depth === 0;

                // $newNode stays null when the page the link was found on is not
                // part of the tree. Its depth is unknown then, so with a depth
                // limit in place the link is left out rather than dereferencing null.
                $withinLimit = $newNode !== null && $newNode->getDepth() <= $this->depth;

                if ($unlimited || $withinLimit) {
                    $this->crawlQueue->add(
                        CrawlUrl::create($url, $foundOnUrl)
                    );
                }
            });
    }
239
240
    /**
     * Collect the target of every anchor element in the HTML as a Url.
     *
     * @param string $html Document to scan for anchor elements.
     * @param \Spatie\Crawler\Url $foundOnUrl Passed to the DOM crawler as the document's URI.
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url instances.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        $anchors = (new DomCrawler($html, $foundOnUrl))->filterXpath('//a');

        $toUrl = function (Link $link) {
            return Url::create($link->getUri());
        };

        return collect($anchors->links())->map($toUrl);
    }
249
250
    /**
     * Normalize a discovered URL by removing its fragment.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url Result of removeFragment() on the given URL.
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
259
260
261
	/**
	 * Add $url as a child of every node in the (sub)tree whose value equals $parentUrl.
	 *
	 * The tree may hold several nodes for the same URL, so more than one child
	 * can be created. $newNode then refers to the shallowest one, so that the
	 * caller's depth check is based on the shortest known path to the link.
	 *
	 * @param \Tree\Node\Node $node Root of the (sub)tree to search.
	 * @param string $url URL stored in the newly created child node(s).
	 * @param \Spatie\Crawler\Url|string $parentUrl URL of the page the link was found on; compared as a string.
	 * @param \Tree\Node\Node|null $newNode Pass null. Receives the shallowest node created, or stays null when no node matched.
	 */
	protected function addToLinkTree($node, $url, $parentUrl, &$newNode)
	{
		// Take the children before a new child is attached: descending into the
		// node created below would never terminate for a link that points to
		// the page it was found on.
		$children = $node->getChildren();

		if ((string) $node->getValue() === (string) $parentUrl) {
			$createdNode = new Node($url);
			$node->addChild($createdNode);

			if ($newNode === null || $createdNode->getDepth() < $newNode->getDepth()) {
				$newNode = $createdNode;
			}
		}

		foreach ($children as $currentNode) {
			$this->addToLinkTree($currentNode, $url, $parentUrl, $newNode);
		}
	}
278
279
}
280