Completed: Pull Request — master (#150), created by Brent at 03:17

Crawler::addToDepthTree() (rating B)

Complexity
  Conditions: 4
  Paths: 4

Size
  Total Lines: 24
  Code Lines: 12

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0
Metric   Value
c        0
b        0
f        0
dl       0
loc      24
rs       8.6845
cc       4
eloc     12
nc       4
nop      3

(Here loc matches the method's total lines, eloc its code lines, cc its cyclomatic complexity, nc its path count, and nop its three parameters; the remaining abbreviations are tool-specific scores.)

<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
class Crawler
{
    use CrawlerProperties;

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot|null */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }
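
    // Usage sketch (illustrative): pass only the Guzzle options you need.
    // Note that any non-empty $clientOptions array replaces
    // self::$defaultClientOptions wholesale; the two arrays are not merged.
    //
    //     $crawler = Crawler::create([RequestOptions::TIMEOUT => 30]);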

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
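
    // Note: the base URL is queued only when robots.txt allows it, and the
    // depth tree is rooted at the normalized base URL before the queue is
    // processed.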

    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
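
    // Illustration of the recursion above (URLs are made up): with the tree
    // rooted at "https://example.com/", calling
    //
    //     $crawler->addToDepthTree(new Uri('https://example.com/a'), new Uri('https://example.com/'));
    //     $crawler->addToDepthTree(new Uri('https://example.com/a/b'), new Uri('https://example.com/a'));
    //
    // hangs /a off the root (depth 1) and /a/b off /a (depth 2). When no node
    // matches the parent URL, the method returns null and nothing is added.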

    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new CrawlRequestFulfilled($this),
                'rejected' => new CrawlRequestFailed($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }
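
    // The while loop matters: new links can be enqueued while a pool is
    // draining (presumably by the CrawlRequestFulfilled handler), so
    // hasPendingUrls() is re-checked after every pool completes.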

    public function endsWith($haystack, $needle)
    {
        $position = strrpos($haystack, $needle);

        // strrpos() returns false when the needle is absent; guard against
        // that before comparing offsets.
        return $position !== false && $position + strlen($needle) === strlen($haystack);
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        // RobotsTxt::create() expects a string source, so cast the PSR-7 Uri.
        return RobotsTxt::create((string) $uri->withPath('/robots.txt'));
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
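
    // Each Request is keyed by its CrawlUrl id; Guzzle's Pool passes that key
    // to the fulfilled/rejected handlers as the $index argument, which lets
    // them map a response back to the CrawlUrl it belongs to. Because this is
    // a lazy generator, URLs enqueued mid-crawl are still picked up by the
    // pool that is currently draining.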

    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
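
    // crawledUrlCount is incremented when a URL is accepted into the queue,
    // not when its response arrives; maximumCrawlCountReached() below
    // compares against this enqueue-time count.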

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
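
For orientation, a minimal end-to-end usage sketch (not part of the analyzed file). MyObserver is a hypothetical observer class, and the setCrawlObserver() and setMaximumDepth() setters are assumed to come from the CrawlerProperties trait, which this report does not show:

<?php

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;

Crawler::create([RequestOptions::TIMEOUT => 30])   // replaces the default client options
    ->setCrawlObserver(new MyObserver())           // hypothetical observer
    ->setMaximumDepth(2)                           // assumed trait setter
    ->startCrawling('https://example.com');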