Completed
Push — master (4bb569...381ee2) by Brent · 10s · created

SitemapGenerator::configureCrawler() (rated A)

Complexity: Conditions 1 · Paths 1
Size: Total Lines 6 · Code Lines 3
Duplication: Lines 0 · Ratio 0 %
Importance: Changes 0

Metric   Value
dl       0
loc      6
rs       9.4285
c        0
b        0
f        0
cc       1
eloc     3
nc       1
nop      1
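
configureCrawler() hands the package's underlying Spatie\Crawler\Crawler instance to a caller-supplied closure before crawling starts. A minimal usage sketch in a Laravel app, based only on the public methods visible in the listing below (the URL and output path are placeholders, and ignoreRobots() is assumed to be available on spatie/crawler):

use Spatie\Crawler\Crawler;
use Spatie\Sitemap\SitemapGenerator;

SitemapGenerator::create('https://example.com')
    ->configureCrawler(function (Crawler $crawler) {
        // Adjust the crawler before crawling starts; ignoreRobots() is
        // assumed here purely for illustration.
        $crawler->ignoreRobots();
    })
    ->maxTagsPerSitemap(50000)
    ->writeToFile(public_path('sitemap.xml'));

The full source under review follows.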
<?php

namespace Spatie\Sitemap;

use Closure;
use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\Crawler;
use Spatie\Sitemap\Tags\Url;
use Spatie\Crawler\CrawlProfile;
use Illuminate\Support\Collection;
use Psr\Http\Message\UriInterface;
use Spatie\Sitemap\Crawler\Profile;
use Spatie\Sitemap\Crawler\Observer;
use Psr\Http\Message\ResponseInterface;

class SitemapGenerator
{
    /** @var \Illuminate\Support\Collection */
    protected $sitemaps;

    /** @var \GuzzleHttp\Psr7\Uri */
    protected $urlToBeCrawled = '';

    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /** @var callable */
    protected $shouldCrawl;

    /** @var callable */
    protected $hasCrawled;

    /** @var int */
    protected $concurrency = 10;

    /** @var bool|int $maximumTagsPerSitemap */
    protected $maximumTagsPerSitemap = false;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /**
     * @param string $urlToBeCrawled
     *
     * @return static
     */
    public static function create(string $urlToBeCrawled)
    {
        return app(static::class)->setUrl($urlToBeCrawled);
    }

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;

        $this->sitemaps = new Collection([new Sitemap]);

        $this->hasCrawled = function (Url $url, ResponseInterface $response = null) {
[Issue: Unused Code] The parameter $response is not used and could be removed.
This check looks for parameters that have been defined for a function or method but which are not used in the method body. (A short illustration follows the listing.)
            return $url;
        };
    }

    public function configureCrawler(Closure $closure): self
    {
        call_user_func_array($closure, [$this->crawler]);

        return $this;
    }

    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;
    }

    public function maxTagsPerSitemap(int $maximumTagsPerSitemap = 50000): self
    {
        $this->maximumTagsPerSitemap = $maximumTagsPerSitemap;

        return $this;
    }

    public function setUrl(string $urlToBeCrawled)
    {
        $this->urlToBeCrawled = new Uri($urlToBeCrawled);

        if ($this->urlToBeCrawled->getPath() === '') {
            $this->urlToBeCrawled = $this->urlToBeCrawled->withPath('/');
        }

        return $this;
    }

    public function shouldCrawl(callable $shouldCrawl)
    {
        $this->shouldCrawl = $shouldCrawl;

        return $this;
    }

    public function hasCrawled(callable $hasCrawled)
    {
        $this->hasCrawled = $hasCrawled;

        return $this;
    }

    public function getSitemap(): Sitemap
    {
        if (config('sitemap.execute_javascript')) {
            $this->crawler->executeJavaScript(config('sitemap.chrome_binary_path'));
[Issue: Unused Code] The call to Crawler::executeJavaScript() has too many arguments, starting with config('sitemap.chrome_binary_path').
This check compares calls to functions or methods with their respective definitions; if a call has more arguments than are defined, it raises an issue. If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives (WordPress is one codebase where this has been known to happen). In that case you can add the @ignore PhpDoc annotation to the duplicate definition and it will be ignored. (A short illustration follows the listing.)
        }

        if (! is_null($this->maximumCrawlCount)) {
            $this->crawler->setMaximumCrawlCount($this->maximumCrawlCount);
        }

        $this->crawler
            ->setCrawlProfile($this->getCrawlProfile())
            ->setCrawlObserver($this->getCrawlObserver())
            ->setConcurrency($this->concurrency)
            ->startCrawling($this->urlToBeCrawled);

        return $this->sitemaps->first();
    }

    /**
     * @param string $path
     *
     * @return $this
     */
    public function writeToFile(string $path)
    {
        $sitemap = $this->getSitemap();

        if ($this->maximumTagsPerSitemap) {
            $sitemap = SitemapIndex::create();
            $format = str_replace('.xml', '_%d.xml', $path);

            // Write each sub-sitemap to its own file and add it to the
            // sitemap index
            $this->sitemaps->each(function (Sitemap $item, int $key) use ($sitemap, $format) {
                $path = sprintf($format, $key);

                $item->writeToFile(sprintf($format, $key));
                $sitemap->add(last(explode('public', $path)));
            });
        }

        $sitemap->writeToFile($path);

        return $this;
    }

    protected function getCrawlProfile(): CrawlProfile
    {
        $shouldCrawl = function (UriInterface $url) {
            if ($url->getHost() !== $this->urlToBeCrawled->getHost()) {
                return false;
            }

            if (! is_callable($this->shouldCrawl)) {
                return true;
            }

            return ($this->shouldCrawl)($url);
        };

        $profileClass = config('sitemap.crawl_profile', Profile::class);
        $profile = new $profileClass($this->urlToBeCrawled);

        if (method_exists($profile, 'shouldCrawlCallback')) {
            $profile->shouldCrawlCallback($shouldCrawl);
        }

        return $profile;
    }

    protected function getCrawlObserver(): Observer
    {
        $performAfterUrlHasBeenCrawled = function (UriInterface $crawlerUrl, ResponseInterface $response = null) {
            $sitemapUrl = ($this->hasCrawled)(Url::create((string) $crawlerUrl), $response);

            if ($this->shouldStartNewSitemapFile()) {
                $this->sitemaps->push(new Sitemap);
            }

            if ($sitemapUrl) {
                $this->sitemaps->last()->add($sitemapUrl);
            }
        };

        return new Observer($performAfterUrlHasBeenCrawled);
    }

    protected function shouldStartNewSitemapFile(): bool
    {
        if (! $this->maximumTagsPerSitemap) {
            return false;
        }

        $currentNumberOfTags = count($this->sitemaps->last()->getTags());

        return $currentNumberOfTags >= $this->maximumTagsPerSitemap;
    }
}
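
Regarding the first annotation above (unused parameter): a minimal illustrative sketch of the pattern that check flags; greet() and greetFixed() are hypothetical functions used only for illustration.

<?php

// Flagged: $salutation is declared but never read in the body.
function greet(string $name, string $salutation): string
{
    return "Hello, {$name}!";
}

// Not flagged: the unused parameter is removed (alternatively, it could
// simply be used in the body).
function greetFixed(string $name): string
{
    return "Hello, {$name}!";
}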
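
Regarding the second annotation above (too many arguments): a minimal illustrative sketch of the flagged pattern and of where the @ignore PhpDoc annotation mentioned by the check goes; notify() and render_widget() are hypothetical.

<?php

// (a) The flagged pattern: a call passes more arguments than the definition declares.
function notify(string $message)
{
    // ...
}

notify('Build finished', 'admin@example.com'); // the second argument would be reported

// (b) When duplicate definitions with different arities cause false positives,
// the check's description says to put @ignore on the duplicate definition.
if (! function_exists('render_widget')) {
    /**
     * Duplicate definition kept for a legacy code path.
     *
     * @ignore
     */
    function render_widget(string $id, array $options = [])
    {
        // ...
    }
}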