Passed
Pull Request — master (#1)
by
unknown
03:40
created

Crawler::getId()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use PiedWeb\UrlHarvester\Link;
8
use Spatie\Robots\RobotsTxt;
9
10
/**
 * Crawls one website level by level, starting from a single URL.
 *
 * Every discovered URI is kept in {@see Crawler::$urls} (keyed by the URI relative to
 * {@see Crawler::$base}) and the collected data is handed to a Recorder after each level.
 */
class Crawler
{
    /** Flag passed to the recorder for an inbound link that may be followed. */
    const FOLLOW = 1;

    /** Flag passed to the recorder for an inbound link that must not be followed. */
    const NOFOLLOW = 2;

    /**
     * @var string user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id, built from the start date (`ymdHi`) and the crawled host
     */
    protected $id;

    /**
     * @var RobotsTxt robots.txt-like rules describing the pages to ignore during the crawl
     */
    protected $ignore;

    /**
     * @var int maximum depth (number of clicks from the start URL) to crawl
     */
    protected $limit;

    /**
     * @var string scheme and host of the start URL (e.g. https://domain.tld), without trailing slash
     */
    protected $base;

    /**
     * @var bool not read anywhere in this class
     */
    protected $fromCache;

    /**
     * @var string directory (without trailing slash) under which the crawl folder lives
     */
    protected $dataDirectoryBasePath;

    /**
     * @var Recorder persists the crawl data inside the data folder
     */
    protected $recorder;

    /**
     * @var mixed robots.txt taken from the first successful harvest and given to the following ones
     */
    protected $robotsTxt;

    /**
     * @var mixed request object of the latest successful harvest, passed to the next Harvest::fromUrl() call
     */
    protected $request;

    /**
     * @var int pause between two requests, in microseconds
     */
    protected $wait = 0;

    /**
     * @var int depth currently being crawled
     */
    protected $currentClick = 0;

    /**
     * @var int number of URLs processed so far
     */
    protected $counter = 0;

    /**
     * @var array known URLs keyed by their URI relative to the base; the value is null until
     *            the URI has been processed, then a Url instance
     */
    protected $urls = [];

    /**
     * Prepares a new crawl and writes its settings to `config.json` in the data folder.
     *
     * @param string      $startUrl              absolute URL where the crawl starts
     * @param string      $ignore                robots.txt-like content listing what must not be crawled
     * @param int         $limit                 maximum depth to crawl
     * @param string      $userAgent             user agent sent with every request
     * @param int         $cacheMethod           caching strategy handed to the Recorder
     * @param int         $waitInMicroSeconds    pause between two requests
     * @param string|null $dataDirectoryBasePath where to store crawl data (defaults to `../data`)
     *
     * @throws \Exception when $startUrl is not a valid URL
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        string $dataDirectoryBasePath = null
    ) {
        $startUrl = $this->setBaseAndReturnNormalizedStartUrl($startUrl);
        $this->urls[$startUrl] = null;
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->wait = $waitInMicroSeconds;

        $this->initDataDirectory($dataDirectoryBasePath);

        $this->recorder = new Recorder($this->getDataFolder(), $cacheMethod);

        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $startUrl,
            'base' => $this->base,
            'ignore' => $ignore,
            'limit' => $limit,
            'userAgent' => $userAgent,
            'cacheMethod' => $cacheMethod,
            'wait' => $waitInMicroSeconds,
        ]));
    }

    /**
     * Stores the base path of the data directory, without trailing slash.
     *
     * @param string|null $dataDirectoryBasePath custom location, or null to use `../data` next to this file
     */
    protected function initDataDirectory(string $dataDirectoryBasePath = null)
    {
        $this->dataDirectoryBasePath = rtrim($dataDirectoryBasePath ?? __DIR__.'/../data', '/');
    }

    /**
     * @return string the crawl id
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Extracts the base (scheme + host) from the start URL and returns the remaining URI.
     *
     * @param string $url absolute start URL
     *
     * @return string URI relative to the base, always starting with a slash
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndReturnNormalizedStartUrl(string $url): string
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        // substr() returns false before PHP 8 (and '' since) when nothing follows the base:
        // cast it and test for emptiness before reading the first character.
        $uri = (string) substr($url, strlen($this->base));

        return ('' === $uri || '/' !== $uri[0] ? '/' : '').$uri;
    }

    /**
     * @return string folder holding everything related to this crawl
     */
    public function getDataFolder()
    {
        return $this->dataDirectoryBasePath.'/'.$this->id;
    }

    /**
     * Crawls the URLs known at the current depth, then recurses for the next depth
     * until nothing new was processed or the depth limit is reached.
     *
     * @param bool $debug print progress information on the standard output
     *
     * @return null
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach works on the array as it was when the loop started: URLs discovered
        // during this pass are handled by the next (recursive) call.
        foreach ($this->urls as $urlToParse => $url) {
            // A boolean can_be_crawled means harvest() already handled this URL.
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            if ($this->currentClick > $this->limit) {
                continue;
            }

            if ($debug) {
                echo $this->counter.'/'.count($this->urls).'    '.$this->base.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $harvest = $this->harvest($urlToParse);
            $this->urls[$urlToParse]->setDiscovered(count($this->urls));

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->wait);

            // Auto-save every 500 processed URLs.
            if (0 === $this->counter % 500) {
                if ($debug) {
                    echo '    --- auto-save'.PHP_EOL;
                }
                $this->recorder->record($this->urls);
            }
        }

        ++$this->currentClick;

        // Record after each level.
        $this->recorder->record($this->urls);

        $finished = $nothingUpdated || $this->currentClick >= $this->limit;

        return $finished ? null : $this->crawl($debug);
    }

    /**
     * Keeps the robots.txt of the first successful harvest for reuse.
     *
     * @param Harvest|null $harvest result of {@see Crawler::harvest()}
     *
     * @return $this
     */
    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    /**
     * Keeps the request of the latest successful harvest so the next one can be built from it.
     *
     * @param Harvest|null $harvest result of {@see Crawler::harvest()}
     *
     * @return $this
     */
    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    /**
     * Gives the cached robots.txt (if any) to the harvest.
     *
     * @return $this
     */
    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    /**
     * Requests the page behind the given Url.
     *
     * @return mixed the value returned by Harvest::fromUrl(); {@see Crawler::harvest()} treats
     *               anything that is not a Harvest instance as a network error
     */
    protected function getHarvest(Url $url)
    {
        return Harvest::fromUrl(
            $this->base.$url->uri,
            $this->userAgent,
            'en,en-US;q=0.5',
            $this->request
        );
    }

    /**
     * Requests one URI, fills its Url entry with the collected data and registers its internal links.
     *
     * @param string $urlToParse URI relative to the base
     *
     * @return Harvest|null null when the URI is ignored or when the request failed
     */
    protected function harvest(string $urlToParse)
    {
        $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($this->base.$urlToParse, $this->currentClick);
        $url = $this->urls[$urlToParse];

        $url->can_be_crawled = $this->ignore->allows($this->base.$urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return null;
        }

        $harvest = $this->getHarvest($url);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return null;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable(); // slow ~30%

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            // A redirection only contributes its target, and only when the target is internal.
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [new Link($redir)] : [];
            }
        } else {
            $this->recorder->cache($harvest, $url);

            $mimeType = $harvest->getResponse()->getMimeType();
            // text/html is stored as 1, any other mime type is stored as is.
            $url->mime_type = 'text/html' == $mimeType ? 1 : $mimeType;

            $this->recorder->recordOutboundLink($url, $harvest->getLinks()); // ~10%
            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);

            //$url->ratio_text_code = $harvest->getRatioTxtCode(); // Slow ~30%
            $url->words_count = $harvest->getTextAnalysis()->getWordNumber();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', array_keys($harvest->getKws())).','; // Slow ~20%
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
            // An h1 identical to the title is stored as '='.
            $url->h1 = $url->title == $url->h1 ? '=' : $url->h1;
        }

        // $links stays undefined for a redirection without target.
        if (isset($links)) {
            $this->updateInboundLinksCounter($url, $links, $harvest);
        }

        return $harvest;
    }

    /**
     * Registers the pages linked from $url and updates their inbound link counters.
     *
     * A target linked several times from the same page is counted once.
     *
     * @param Url     $url     page containing the links
     * @param array   $links   links found on the page
     * @param Harvest $harvest harvest of the page, used to know whether its links may be followed
     */
    public function updateInboundLinksCounter(Url $url, array $links, Harvest $harvest)
    {
        $everAdd = [];
        foreach ($links as $link) {
            $pageUrl = $link->getPageUrl();

            // A link to the bare base leaves nothing after substr(): false before PHP 8 (which
            // would become the array key 0), '' since. Both designate the root page.
            $newUri = (string) substr($pageUrl, strlen($this->base));
            if ('' === $newUri) {
                $newUri = '/';
            }

            $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($pageUrl, ($this->currentClick + 1));

            if (isset($everAdd[$newUri])) {
                continue;
            }
            $everAdd[$newUri] = 1;

            if (!$link->mayFollow() || !$harvest->mayFollow()) {
                ++$this->urls[$newUri]->inboundlinks_nofollow;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::NOFOLLOW);
            } else {
                ++$this->urls[$newUri]->inboundlinks;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::FOLLOW);
            }
        }
    }
}
295