Completed
Push — master ( b5c69a...6cbd96 )
by Dev
02:24
created

Crawler::cacheRequest()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 3
nc 2
nop 1
dl 0
loc 7
ccs 4
cts 4
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use Spatie\Robots\RobotsTxt;
8
9
class Crawler
10
{
11
    protected $userAgent;
12
    protected $project;
13
    protected $ignore;
14
    protected $limit;
15
    protected $recorder;
16
    protected $robotsTxt;
17
    protected $request;
18
    protected $wait = 0;
19
20
    protected $currentClick = 0;
21
22
    protected $counter = 0;
23
24
    protected $urls = [];
25
26 3
    /**
     * Prepares a crawl session starting from a single URL.
     *
     * The start URL is registered as the first (not yet harvested) entry of the
     * URL list, and the recorder plus data/cache folders are initialised.
     *
     * @param string $startUrl  first URL to crawl; also used to derive the project name
     * @param string $ignore    rules handed to RobotsTxt and consulted before each harvest
     * @param int    $limit     threshold compared against the pass counter in crawl()
     * @param string $userAgent user agent used for requests and for rule matching
     */
    public function __construct(string $startUrl, string $ignore, int $limit, string $userAgent)
    {
        // Project name: the start URL stripped of every character outside the
        // whitelisted set, suffixed with the current date and time.
        $sanitizedStartUrl = preg_replace("([^\w\s\d\-_~,;\[\]\(\).])", '', $startUrl);

        $this->limit = $limit;
        $this->userAgent = $userAgent;
        $this->ignore = new RobotsTxt($ignore);
        $this->project = $sanitizedStartUrl.'-'.date('ymd-Hi');
        $this->urls[$startUrl] = null;

        $this->initRecorderAndCache();
    }
36
37 3
    /**
     * Returns the folder holding the recorded data of the current project.
     *
     * @return string path built from this file's directory, `/../data/` and the project name
     */
    public function getDataFolder()
    {
        $dataRoot = __DIR__.'/../data/';

        return $dataRoot.$this->project;
    }
41
42 3
    /**
     * Returns the folder where fetched pages of the current project are cached.
     *
     * @return string path built from this file's directory, `/../cache/` and the project name
     */
    public function getCacheFolder()
    {
        $cacheRoot = __DIR__.'/../cache/';

        return $cacheRoot.$this->project;
    }
46
47 3
    /**
     * Creates the recorder and resets the data and cache folders of the project.
     *
     * Any previous content of both folders is removed, then the data folder,
     * its `links` sub-folder and the cache folder are (re)created.
     */
    protected function initRecorderAndCache()
    {
        $dataFolder = $this->getDataFolder();
        $cacheFolder = $this->getCacheFolder();

        $this->recorder = new Recorder($dataFolder);

        // The project name is derived from the start URL and may still contain
        // whitespace, `;`, `(`, `)` or `~`; the paths must therefore be quoted
        // before being handed to the shell.
        exec('rm -rf '.escapeshellarg($dataFolder));
        exec('rm -rf '.escapeshellarg($cacheFolder));

        // Recursive creation also covers a missing parent (`data/`, `cache/`);
        // creating `<data>/links` implicitly creates the data folder itself.
        foreach ([$dataFolder.'/links', $cacheFolder] as $folder) {
            if (!file_exists($folder)) {
                mkdir($folder, 0777, true);
            }
        }
    }
60
61
    /**
     * Sets the pause applied after each harvested URL.
     *
     * The value is stored in the `wait` property, which crawl() passes to
     * usleep() after every harvested URL.
     *
     * @param int $microSeconds delay in microseconds
     */
    public function setWaitBetweenRequest(int $microSeconds = 100)
    {
        $this->wait = $microSeconds;
    }
65
66 3
    /**
     * Runs one crawl pass over the known URLs, then either records or recurses.
     *
     * Every URL that has not been evaluated yet is harvested. After the pass,
     * the pass counter is incremented; when nothing was harvested or the
     * counter reached the limit, the collected URLs are handed to the
     * recorder, otherwise another pass is started.
     *
     * @param bool $debug when true, progress is echoed to the output
     *
     * @return mixed whatever the recorder's record() call returns
     */
    public function crawl(bool $debug = false)
    {
        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        $harvestedDuringPass = 0;

        foreach ($this->urls as $urlToParse => $url) {
            // Already crawled: the crawlability flag has been decided either way.
            $alreadyCrawled = null !== $url && is_bool($url->can_be_crawled);
            if ($alreadyCrawled) {
                continue;
            }

            if ($debug) {
                echo '    '.$urlToParse.PHP_EOL;
            }

            ++$harvestedDuringPass;
            ++$this->counter;

            $harvest = $this->harvest($urlToParse);

            // Keep the robots.txt and the request around for later harvests.
            $this->cacheRobotsTxt($harvest);
            $this->cacheRequest($harvest);

            usleep($this->wait);
        }

        ++$this->currentClick;

        if (0 === $harvestedDuringPass || $this->currentClick >= $this->limit) {
            return $this->recorder->record($this->urls);
        }

        return $this->crawl($debug);
    }
102
103 3
    /**
     * Remembers the robots.txt of the first successful harvest.
     *
     * Nothing happens when a robots.txt is already stored or when the given
     * value is not a Harvest instance.
     *
     * @param mixed $harvest result of harvest()
     *
     * @return $this
     */
    protected function cacheRobotsTxt($harvest)
    {
        $alreadyStored = null !== $this->robotsTxt;

        if ($alreadyStored || !($harvest instanceof Harvest)) {
            return $this;
        }

        $this->robotsTxt = $harvest->getRobotsTxt();

        return $this;
    }
111
112 3
    /**
     * Stores the request object of the given harvest for later harvests.
     *
     * The stored request is passed to Harvest::fromUrl() by harvest(). Values
     * that are not a Harvest instance are ignored.
     *
     * @param mixed $harvest result of harvest()
     *
     * @return $this
     */
    protected function cacheRequest($harvest)
    {
        if (!($harvest instanceof Harvest)) {
            return $this;
        }

        $response = $harvest->getResponse();
        $this->request = $response->getRequest();

        return $this;
    }
120
121 2
    /**
     * Injects the previously stored robots.txt, if any, into the given harvest.
     *
     * @param Harvest $harvest harvest that should receive the stored robots.txt
     *
     * @return $this
     */
    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null === $this->robotsTxt) {
            return $this;
        }

        $harvest->setRobotsTxt($this->robotsTxt);

        return $this;
    }
129
130 2
    /**
     * Writes the content of an HTML response below the cache folder.
     *
     * Responses whose content type does not contain `text/html` are skipped.
     * The link derived from the effective URL is split on `/`: every segment
     * but the last becomes a directory (created when missing), and the last
     * one is the file name, replaced by `index.html` when it is empty.
     *
     * @param Harvest $harvest harvest whose response should be cached
     */
    protected function cache($harvest)
    {
        $isHtml = false !== strpos($harvest->getResponse()->getContentType(), 'text/html');
        if (!$isHtml) {
            return;
        }

        $relativeLink = ltrim(
            $harvest->getAbsoluteInternalLink($harvest->getResponse()->getEffectiveUrl()),
            '/'
        );

        // explode() always yields at least one element, so there is always a last segment.
        $directories = explode('/', $relativeLink);
        $lastSegment = array_pop($directories);

        $targetFolder = $this->getCacheFolder();
        foreach ($directories as $directory) {
            $targetFolder .= '/'.$directory;
            if (!file_exists($targetFolder)) {
                mkdir($targetFolder);
            }
        }

        if (empty($lastSegment)) {
            $lastSegment = 'index.html';
        }

        file_put_contents($targetFolder.'/'.$lastSegment, $harvest->getResponse()->getContent());
    }
153
154 3
    /**
     * Fetches one URL, fills its Url record and registers the links it exposes.
     *
     * The Url record is created on first sight. URLs rejected by the ignore
     * rules, and URLs for which Harvest::fromUrl() does not return a Harvest,
     * end the method early without a return value.
     *
     * @param string $urlToParse URL to fetch and analyse
     *
     * @return Harvest|null the harvest, or null when the URL was not harvested
     */
    protected function harvest(string $urlToParse)
    {
        if (!isset($this->urls[$urlToParse])) {
            $this->urls[$urlToParse] = new Url($urlToParse, $this->currentClick);
        }
        $page = $this->urls[$urlToParse];

        $page->updated_at = date('Ymd');
        $page->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);

        if (false === $page->can_be_crawled) {
            return;
        }

        $harvest = Harvest::fromUrl($urlToParse, $this->userAgent, 'en,en-US;q=0.5', $this->request);

        if (!($harvest instanceof Harvest)) {
            $page->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return;
        }

        $this->loadRobotsTxt($harvest);

        $page->indexable = $harvest->isIndexable();

        // Stays null when there is nothing to follow (redirection without target).
        $linksToFollow = null;

        if (Indexable::NOT_INDEXABLE_3XX !== $page->indexable) {
            $this->cache($harvest);

            $this->recorder->recordOutboundLink($page, $harvest->getLinks());

            // Link statistics.
            $page->links = count($harvest->getLinks());
            $page->links_duplicate = $harvest->getNbrDuplicateLinks();
            $page->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $page->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $page->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $page->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $linksToFollow = $harvest->getLinks(Harvest::LINK_INTERNAL);

            // Response metrics.
            $page->ratio_text_code = $harvest->getRatioTxtCode();
            $page->load_time = $harvest->getResponse()->getInfo('total_time');
            $page->size = $harvest->getResponse()->getInfo('size_download');

            $trail = $harvest->getBreadCrumb();
            if (is_array($trail)) {
                $page->breadcrumb_level = count($trail);
                $page->breadcrumb_first = isset($trail[1]) ? $trail[1]->getCleanName() : '';
                $page->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $page->title = $harvest->getUniqueTag('head title') ?? '';
            $page->kws = ','.implode(',', $harvest->getKws()).',';
            $page->h1 = $harvest->getUniqueTag('h1') ?? '';
        } else {
            // Redirection: only the target is followed, and only when it is internal.
            $redirection = $harvest->getRedirection();
            if (false !== $redirection) {
                $linksToFollow = Harvest::LINK_INTERNAL === $harvest->getType($redirection)
                    ? [$redirection]
                    : [];
            }
        }

        if (null !== $linksToFollow) {
            foreach ($linksToFollow as $link) {
                $target = $link->getPageUrl();

                // Newly discovered pages are registered one click deeper than the current pass.
                if (!isset($this->urls[$target])) {
                    $this->urls[$target] = new Url($target, $this->currentClick + 1);
                }

                $this->recorder->recordInboundLink($page, $this->urls[$target]);
                ++$this->urls[$target]->inboundlinks;
            }
        }

        return $harvest;
    }
227
}
228