Completed
Push — master ( ff6fd6...9fafdc )
by Dev
02:28
created

Crawler::updateInboundLinksCounter()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 14
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 5.1158

Importance

Changes 0
Metric Value
eloc 12
dl 0
loc 14
ccs 10
cts 12
cp 0.8333
rs 9.5555
c 0
b 0
f 0
cc 5
nc 4
nop 3
crap 5.1158
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use PiedWeb\UrlHarvester\Link;
8
use Spatie\Robots\RobotsTxt;
9
10
/**
 * Crawls a single site level by level, starting from one URL.
 *
 * Each call to crawl() processes the URLs known at that moment; internal links
 * found while harvesting them are queued with a depth of "current level + 1"
 * and handled by the next pass. Results are persisted through a Recorder.
 */
class Crawler
{
    const FOLLOW = 1;
    const NOFOLLOW = 2;

    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id
     */
    protected $id;

    /**
     * @var RobotsTxt page to ignore during the crawl
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var bool
     */
    protected $fromCache;

    /**
     * @var Recorder persists the crawl data (created in the constructor)
     */
    protected $recorder;

    /**
     * @var mixed robots.txt taken from the first successful Harvest and re-injected in the following ones
     */
    protected $robotsTxt;

    /**
     * @var mixed last non-null request returned by a Harvest response, handed to the next Harvest::fromUrl() call
     */
    protected $request;

    /**
     * @var int pause, in microseconds, observed after each processed URL
     */
    protected $wait = 0;

    /**
     * @var int depth level currently being crawled (incremented after each crawl() pass)
     */
    protected $currentClick = 0;

    /**
     * @var int number of URLs processed so far
     */
    protected $counter = 0;

    /**
     * @var array URI (relative to $base) => Url, or null while the URL has not been processed yet
     */
    protected $urls = [];

    /**
     * Prepares a crawl and writes its configuration to `config.json` in the data folder.
     *
     * @param string $startUrl           absolute URL where the crawl starts
     * @param string $ignore             rules handed to RobotsTxt; URLs it disallows are not harvested
     * @param int    $limit              maximum depth to crawl
     * @param string $userAgent          user agent used for the requests and for the ignore rules
     * @param int    $cacheMethod        forwarded to the Recorder
     * @param int    $waitInMicroSeconds pause after each processed URL
     *
     * @throws \Exception when $startUrl is not a valid URL
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000
    ) {
        $startUrl = $this->setBaseAndReturnNormalizedStartUrl($startUrl);
        $this->urls[$startUrl] = null;
        // The id depends on $this->base, which was set by the call above.
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->wait = $waitInMicroSeconds;

        $this->recorder = new Recorder($this->getDataFolder(), $cacheMethod);

        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $startUrl,
            'base' => $this->base,
            'ignore' => $ignore,
            'limit' => $limit,
            'userAgent' => $userAgent,
            'cacheMethod' => $cacheMethod,
            'wait' => $waitInMicroSeconds,
        ]));
    }

    /**
     * @return string the crawl id (timestamp followed by the host of the start URL)
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Validates the start URL, stores its `scheme://host` part in $this->base
     * and returns the remainder as a URI that always starts with `/`.
     *
     * @param string $url absolute start URL
     *
     * @return string URI relative to $this->base (`/` when the URL has no path)
     *
     * @throws \Exception when $url does not pass FILTER_VALIDATE_URL
     */
    protected function setBaseAndReturnNormalizedStartUrl(string $url): string
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;
        $path = (string) substr($url, strlen($this->base));

        // A URL without path (https://example.com) leaves an empty remainder:
        // test for it explicitly instead of reading offset 0 of an empty string.
        if ('' === $path || '/' !== $path[0]) {
            return '/'.$path;
        }

        return $path;
    }

    /**
     * @return string folder where the data of this crawl is stored
     */
    public function getDataFolder()
    {
        return __DIR__.'/../data/'.$this->id;
    }

    /**
     * Processes every pending URL of the current level, records the result and
     * calls itself for the next level until nothing new was processed or the
     * depth limit is reached.
     *
     * @param bool $debug when true, progress is echoed
     *
     * @return null
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach works on the array as it is now: URLs discovered during this
        // pass are only visited by the next crawl() call.
        foreach ($this->urls as $urlToParse => $url) {
            if (null !== $url && is_bool($url->can_be_crawled)) { // already crawled
                continue;
            }

            if ($this->currentClick > $this->limit) {
                continue;
            }

            if ($debug) {
                echo $this->counter.'/'.count($this->urls).'    '.$this->base.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $harvest = $this->harvest($urlToParse);
            $this->urls[$urlToParse]->setDiscovered(count($this->urls));

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->wait);

            // Auto-save every 500 processed URLs.
            if (0 === $this->counter % 500) {
                if ($debug) {
                    echo '    --- auto-save'.PHP_EOL;
                }
                $this->recorder->record($this->urls);
            }
        }

        ++$this->currentClick;

        // Record after each Level:
        $this->recorder->record($this->urls);

        $record = $nothingUpdated || $this->currentClick >= $this->limit;

        return $record ? null : $this->crawl($debug);
    }

    /**
     * Keeps the robots.txt of the first successful harvest for later reuse.
     *
     * @param Harvest|mixed $harvest result of harvest(); ignored unless it is a Harvest
     *
     * @return $this
     */
    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    /**
     * Keeps the request of the given harvest so the next one can reuse it.
     *
     * @param Harvest|mixed $harvest result of harvest(); ignored unless it is a Harvest
     *
     * @return $this
     */
    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    /**
     * Injects the cached robots.txt (if any) into the given harvest.
     *
     * @return $this
     */
    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    /**
     * Requests the given URL.
     *
     * @return Harvest|mixed whatever Harvest::fromUrl() returns; harvest() treats
     *                       any non-Harvest value as a network error
     */
    protected function getHarvest(Url $url)
    {
        return Harvest::fromUrl(
            $this->base.$url->uri,
            $this->userAgent,
            'en,en-US;q=0.5',
            $this->request
        );
    }

    /**
     * Harvests one URL, fills its Url entry and queues the internal links found.
     *
     * @param string $urlToParse URI relative to $this->base
     *
     * @return Harvest|null null when the URL is excluded by the ignore rules or
     *                      when the request did not produce a Harvest
     */
    protected function harvest(string $urlToParse)
    {
        $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($this->base.$urlToParse, $this->currentClick);
        $url = $this->urls[$urlToParse];

        $url->can_be_crawled = $this->ignore->allows($this->base.$urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return null;
        }

        $harvest = $this->getHarvest($url);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return null;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable(); // slow ~30%

        // Internal links to queue; stays null when there is nothing to follow
        // (redirection without a usable target).
        $links = null;

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                // Only an internal redirection target is queued.
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [new Link($redir)] : [];
            }
        } else {
            $this->recorder->cache($harvest, $url);

            $mimeType = $harvest->getResponse()->getMimeType();
            $url->mime_type = 'text/html' == $mimeType ? 1 : $mimeType;

            $allLinks = $harvest->getLinks();
            $this->recorder->recordOutboundLink($url, $allLinks); // ~10%
            $url->links = count($allLinks);
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);
            $url->links_internal = count($links);
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));

            // ratio_text_code is deliberately not computed: getRatioTxtCode() is slow (~30%).
            $url->words_count = $harvest->getTextAnalysis()->getWordNumber();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', array_keys($harvest->getKws())).','; // Slow ~20%
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
            // An h1 identical to the title is stored as '='.
            $url->h1 = $url->title == $url->h1 ? '=' : $url->h1;
        }

        if (null !== $links) {
            $this->updateInboundLinksCounter($url, $links, $harvest);
        }

        return $harvest;
    }

    /**
     * Queues the target of each link and counts one inbound link per distinct
     * target, as follow or nofollow.
     *
     * @param Url     $url     page the links were found on
     * @param array   $links   links found on that page
     * @param Harvest $harvest harvest of that page; when it may not be followed,
     *                         every link is counted as nofollow
     */
    public function updateInboundLinksCounter(Url $url, array $links, Harvest $harvest)
    {
        $everAdd = [];

        foreach ($links as $link) {
            $newUri = substr($link->getPageUrl(), strlen($this->base));
            // Targets seen for the first time are queued for the next level.
            $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($link->getPageUrl(), ($this->currentClick + 1));

            // Several links from this page to the same target count once.
            if (isset($everAdd[$newUri])) {
                continue;
            }
            $everAdd[$newUri] = 1;

            if (!$link->mayFollow() || !$harvest->mayFollow()) {
                ++$this->urls[$newUri]->inboundlinks_nofollow;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::NOFOLLOW);
            } else {
                ++$this->urls[$newUri]->inboundlinks;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::FOLLOW);
            }
        }
    }
}
279