Completed
Push — master ( a2b567...2ce769 )
by Dev
02:37 queued 57s
created

Crawler::crawl()   B

Complexity

Conditions 9
Paths 32

Size

Total Lines 29
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 9.0197

Importance

Changes 0
Metric Value
cc 9
eloc 15
nc 32
nop 1
dl 0
loc 29
ccs 15
cts 16
cp 0.9375
crap 9.0197
rs 8.0555
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use Spatie\Robots\RobotsTxt;
8
9
class Crawler
10
{
11
    protected $userAgent;
12
    protected $project;
13
    protected $ignore;
14
    protected $limit;
15
    protected $recorder;
16
17
    protected $currentClick = 0;
18
19
    protected $counter = 0;
20
21
    protected $base;
22
    protected $urls = [];
23
24 3
    /**
     * Prepares a crawl starting from a single URL.
     *
     * @param string $startUrl  first URL to crawl; also used to derive the base (scheme + domain)
     *                          and the project name used in the data/cache folder paths
     * @param string $ignore    rules handed to Spatie's RobotsTxt (presumably robots.txt-formatted
     *                          content — confirm against the caller)
     * @param int    $limit     value `crawl()` compares the click counter against to stop crawling
     * @param string $userAgent user agent used for the rule checks and for the requests
     */
    public function __construct(string $startUrl, string $ignore, int $limit, string $userAgent)
    {
        // Plain settings first; none of these assignments can fail.
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->urls[$startUrl] = null; // null = known URL that has not been harvested yet

        $this->base = Harvest::getDomainAndSchemeFrom($startUrl);

        // The project name is the start URL stripped of every character outside the whitelist.
        $forbiddenChars = "([^\w\s\d\-_~,;\[\]\(\).])";
        $this->project = preg_replace($forbiddenChars, '', $startUrl);

        $this->ignore = new RobotsTxt($ignore);

        $this->initRecorderAndCache();
    }
35
36 3
    /**
     * Returns the folder that holds the crawl data of the current project.
     *
     * @return string `data/<project>` path, expressed relative to the parent of this file's directory
     */
    public function getDataFolder()
    {
        $dataRoot = __DIR__.'/../data/';

        return $dataRoot.$this->project;
    }
40
41 3
    /**
     * Returns the folder that holds the cached pages of the current project.
     *
     * @return string `cache/<project>` path, expressed relative to the parent of this file's directory
     */
    public function getCacheFolder()
    {
        $cacheRoot = __DIR__.'/../cache/';

        return $cacheRoot.$this->project;
    }
45
46 3
    /**
     * Creates the recorder and resets the project's data and cache folders.
     *
     * Any previous content of both folders is deleted, then the data folder (with its
     * `links` sub-folder) and the cache folder are created again.
     */
    protected function initRecorderAndCache()
    {
        $dataFolder = $this->getDataFolder();
        $cacheFolder = $this->getCacheFolder();

        $this->recorder = new Recorder($dataFolder);

        // The project name is derived from a caller-supplied URL and its whitelist still lets
        // through whitespace, ';', '(', ')' and '~'. Escape each path so it reaches `rm` as a
        // single argument and can never be interpreted by the shell.
        exec('rm -rf '.escapeshellarg($dataFolder));
        exec('rm -rf '.escapeshellarg($cacheFolder));

        // Recursive creation also builds a missing parent `data/` or `cache/` directory.
        // Each folder is checked on its own so one leftover folder does not block the other.
        foreach ([$dataFolder.'/links', $cacheFolder] as $folder) {
            if (!is_dir($folder)) {
                mkdir($folder, 0777, true);
            }
        }
    }
59
60 3
    /**
     * Crawls the known URLs pass after pass, then records the result.
     *
     * Each pass harvests every URL that has not been processed yet. URLs discovered during a
     * pass are picked up by the next one, because the loop iterates over the URL list as it was
     * when the pass started. Crawling stops when a pass harvested nothing or when the click
     * counter reaches the configured limit.
     *
     * @param bool $debug when true, progress is echoed to the output
     *
     * @return mixed whatever the recorder's `record()` returns
     */
    public function crawl(bool $debug = false)
    {
        do {
            $harvestedSomething = false;

            if ($debug) {
                echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                    .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
            }

            foreach ($this->urls as $urlToParse => $url) {
                // A boolean `can_be_crawled` means the URL went through harvest(): already crawled.
                $alreadyCrawled = null !== $url && is_bool($url->can_be_crawled);
                if ($alreadyCrawled) {
                    continue;
                }

                if ($debug) {
                    echo '    '.$urlToParse.PHP_EOL;
                }

                $harvestedSomething = true;
                ++$this->counter;

                $this->harvest($urlToParse);
            }

            ++$this->currentClick;
        } while ($harvestedSomething && $this->currentClick < $this->limit);

        return $this->recorder->record($this->urls);
    }
90
91 2
    /**
     * Stores the body of an HTML response in the project's cache folder.
     *
     * The file path mirrors the path of the response's effective URL: every segment but the
     * last becomes a directory, the last one becomes the file name (`index.html` when the URL
     * ends with a slash). Responses whose content type does not contain `text/html` are skipped.
     *
     * @param Harvest $harvest harvest whose response must be cached
     */
    protected function cache($harvest)
    {
        $response = $harvest->getResponse();

        // Cast guards strpos() against a missing (null) content type.
        if (false === strpos((string) $response->getContentType(), 'text/html')) {
            return;
        }

        $url = ltrim($harvest->getAbsoluteInternalLink($response->getEffectiveUrl()), '/');
        $segments = explode('/', $url);

        // explode() always returns at least one element, so the last segment is a string.
        // Compare strictly with '': empty() would also turn a segment named "0" into index.html.
        $filename = array_pop($segments);
        if ('' === $filename) {
            $filename = 'index.html';
        }

        $folder = $this->getCacheFolder();
        foreach ($segments as $segment) {
            $folder .= '/'.$segment;
            if (!file_exists($folder)) {
                mkdir($folder);
            }
        }

        file_put_contents($folder.'/'.$filename, $response->getContent());
    }
114
115 3
    /**
     * Fetches one URL, stores what was learned about it and queues the internal links it contains.
     *
     * The method returns early, after flagging the URL accordingly, when the ignore rules forbid
     * crawling it or when the request did not produce a Harvest instance. For responses that are
     * not 3xx redirects, the page is cached and its link counts, size, timing, breadcrumb, title,
     * keywords and h1 are recorded.
     *
     * @param string $urlToParse URL to fetch; it is registered in the URL list when not known yet
     */
    protected function harvest(string $urlToParse)
    {
        $url = $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($urlToParse, $this->currentClick);

        $url->updated_at = date('Ymd');
        $url->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return;
        }

        $harvest = Harvest::fromUrl($urlToParse, $this->userAgent);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return;
        }

        $url->indexable = $harvest->isIndexable();

        // NOTE(review): the target of a 3xx redirect is not queued for crawling. The previous
        // code built a `$links` list for it but never used it (dead assignment, removed).
        // Confirm whether redirect targets should be added to the URL list.
        if (Indexable::NOT_INDEXABLE_3XX !== $url->indexable) {
            $this->cache($harvest);

            $allLinks = $harvest->getLinks();
            $this->recorder->recordOutboundLink($url, $allLinks);

            $url->links = count($allLinks);
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));

            $url->ratio_text_code = $harvest->getRatioTxtCode();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                // Property name fixed: it used to be misspelled `breadcrumb_fisrt`.
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', $harvest->getKws()).',';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
        }

        // Register every internal link one click deeper and count it as an inbound link.
        foreach ($harvest->getLinks(Harvest::LINK_INTERNAL) as $link) {
            $linkUrl = $link->getPageUrl();
            $this->urls[$linkUrl] = $this->urls[$linkUrl] ?? new Url($linkUrl, ($this->currentClick + 1));
            $this->recorder->recordInboundLink($url, $this->urls[$linkUrl]);
            ++$this->urls[$linkUrl]->inboundlinks;
        }
    }
176
}
177