Completed
Push — master ( 31ed5b...1b5be9 )
by Dev
08:32 queued 07:20
created

Crawler   A

Complexity

Total Complexity 30

Size/Duplication

Total Lines 168
Duplicated Lines 0 %

Test Coverage

Coverage 83.87%

Importance

Changes 0
Metric Value
eloc 92
dl 0
loc 168
ccs 78
cts 93
cp 0.8387
rs 10
c 0
b 0
f 0
wmc 30
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use Spatie\Robots\RobotsTxt;
8
9
/**
 * Crawls a site pass by pass, starting from a single URL.
 *
 * Every pass harvests the URLs that have not been processed yet; the internal
 * links found on those pages are queued for the following pass. When a pass
 * processes nothing, or the pass limit is reached, the collected Url entries
 * are handed to the Recorder.
 */
class Crawler
{
    /** @var string User agent given to Harvest::fromUrl() and to the robots rules check. */
    protected $userAgent;

    /** @var string Start URL stripped of the characters rejected by the constructor's regex; used as folder name. */
    protected $project;

    /** @var RobotsTxt Rules built from the `$ignore` constructor argument. */
    protected $ignore;

    /** @var int Number of passes after which crawl() stops and records. */
    protected $limit;

    /** @var Recorder Receives the outbound/inbound links and the final list of Url entries. */
    protected $recorder;

    /** @var int Number of completed passes; also passed as second argument to new Url objects. */
    protected $currentClick = 0;

    /** @var int Number of URLs handed to harvest() so far. */
    protected $counter = 0;

    /** @var mixed Result of Harvest::getDomainAndSchemeFrom() for the start URL (not read inside this class). */
    protected $base;

    /** @var array Map of URL string => Url entry; a null value means the URL has not been harvested yet. */
    protected $urls = [];

    /**
     * @param string $startUrl  First URL to crawl
     * @param string $ignore    Rules handed to the RobotsTxt constructor (presumably robots.txt syntax)
     * @param int    $limit     Maximum number of passes
     * @param string $userAgent User agent used for the requests and for the rules check
     */
    public function __construct(string $startUrl, string $ignore, int $limit, string $userAgent)
    {
        $this->urls[$startUrl] = null;
        $this->base = Harvest::getDomainAndSchemeFrom($startUrl);
        $this->project = preg_replace("([^\w\s\d\-_~,;\[\]\(\).])", '', $startUrl);
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;

        $this->initRecorderAndCache();
    }

    /**
     * @return string Path of the folder holding the recorded data for this project
     */
    public function getDataFolder()
    {
        return __DIR__.'/../data/'.$this->project;
    }

    /**
     * @return string Path of the folder holding the cached pages for this project
     */
    public function getCacheFolder()
    {
        return __DIR__.'/../cache/'.$this->project;
    }

    /**
     * Creates the recorder, wipes the data and cache folders left by a previous
     * run and recreates them empty (including the `links` sub-folder).
     */
    protected function initRecorderAndCache()
    {
        $dataFolder = $this->getDataFolder();
        $cacheFolder = $this->getCacheFolder();

        $this->recorder = new Recorder($dataFolder);

        // The project name may still contain whitespace, ';' or parentheses
        // (the constructor's regex keeps them), so the paths must be quoted
        // before they reach the shell.
        exec('rm -rf '.escapeshellarg($dataFolder));
        exec('rm -rf '.escapeshellarg($cacheFolder));

        // Recursive creation also covers a missing `data/` or `cache/` root,
        // and each folder is checked on its own in case one removal failed.
        foreach ([$dataFolder.'/links', $cacheFolder] as $folder) {
            if (!is_dir($folder)) {
                mkdir($folder, 0777, true);
            }
        }
    }

    /**
     * Runs one pass over the known URLs, then either starts the next pass or
     * records the result.
     *
     * @param bool $debug When true, progress is echoed to the output
     *
     * @return mixed Whatever Recorder::record() returns
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach iterates over a copy of the array: URLs queued by harvest()
        // during this pass are only visited by the next pass.
        foreach ($this->urls as $urlToParse => $url) {
            // already crawled: harvest() has set can_be_crawled to a boolean
            if (null !== $url && (false === $url->can_be_crawled || true === $url->can_be_crawled)) {
                continue;
            }

            if ($debug) {
                echo '    '.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $this->harvest($urlToParse);
        }

        ++$this->currentClick;

        $record = $nothingUpdated || $this->currentClick >= $this->limit;

        return $record ? $this->recorder->record($this->urls) : $this->crawl($debug);
    }

    /**
     * Stores the response body of an HTML page under the cache folder, using
     * the URL's path segments as sub-folders. An empty last segment is stored
     * as `index.html`. Non-HTML responses are skipped.
     *
     * @param Harvest $harvest Harvest whose response is written to disk
     */
    protected function cache($harvest)
    {
        if (false === strpos($harvest->getResponse()->getContentType(), 'text/html')) {
            return;
        }

        $url = ltrim($harvest->getAbsoluteInternalLink($harvest->getResponse()->getEffectiveUrl()), '/');
        $urlPart = explode('/', $url);
        $folder = $this->getCacheFolder();

        $lastIndex = count($urlPart) - 1;
        for ($i = 0; $i <= $lastIndex; ++$i) {
            if ($i === $lastIndex) {
                // Last segment: this is the file itself.
                $filename = empty($urlPart[$i]) ? 'index.html' : $urlPart[$i];
                file_put_contents($folder.'/'.$filename, $harvest->getResponse()->getContent());
            } else {
                // Intermediate segment: make sure the matching folder exists.
                $folder .= '/'.$urlPart[$i];
                if (!file_exists($folder)) {
                    mkdir($folder);
                }
            }
        }
    }

    /**
     * Fetches one URL, fills its Url entry with the collected metrics and
     * queues the internal links found on the page.
     *
     * Returns early (leaving the metrics untouched) when the rules disallow
     * the URL or when the request does not yield a Harvest instance.
     *
     * @param string $urlToParse URL to fetch; must be a key of $this->urls
     */
    protected function harvest(string $urlToParse)
    {
        $url = $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($urlToParse, $this->currentClick);

        $url->updated_at = date('Ymd');
        $url->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return;
        }

        $harvest = Harvest::fromUrl($urlToParse, $this->userAgent);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return;
        }

        $url->indexable = $harvest->isIndexable();

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                // NOTE(review): $links is computed here but never read below, so an
                // internal redirect target is not queued by this method — confirm
                // whether it was meant to feed the loop at the end.
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [$redir] : [];
            }
        } else {
            $this->cache($harvest);

            $this->recorder->recordOutboundLink($url, $harvest->getLinks());

            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));

            $url->ratio_text_code = $harvest->getRatioTxtCode();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                // Property name spelling ("fisrt") kept as-is: renaming it would
                // change the property exposed on the Url entry.
                $url->breadcrumb_fisrt = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', $harvest->getKws()).',';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
        }

        // Queue every internal link for the next pass (unless already known)
        // and count it as an inbound link of its target.
        foreach ($harvest->getLinks(Harvest::LINK_INTERNAL) as $link) {
            $linkUrl = $link->getPageUrl();
            $this->urls[$linkUrl] = $this->urls[$linkUrl] ?? new Url($linkUrl, ($this->currentClick + 1));
            $this->recorder->recordInboundLink($url, $this->urls[$linkUrl]);
            ++$this->urls[$linkUrl]->inboundlinks;
        }
    }
}
177