Completed
Push — master ( 6cbd96...c237cf )
by Dev
02:36
created

Crawler::cache()   A

Complexity

Conditions 6
Paths 6

Size

Total Lines 19
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 6.8395

Importance

Changes 0
Metric Value
cc 6
eloc 14
nc 6
nop 1
dl 0
loc 19
ccs 10
cts 14
cp 0.7143
crap 6.8395
rs 9.2222
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use Spatie\Robots\RobotsTxt;
8
9
/**
 * Crawls a site level by level ("click" depth), starting from a single URL,
 * and hands every harvested page to a Recorder.
 *
 * Each call to crawl() processes the URLs discovered so far that have not been
 * harvested yet, then recurses until nothing new was processed or the click
 * limit is reached.
 */
class Crawler
{
    /** @var string User agent used both for the ignore rules and for the HTTP requests. */
    protected $userAgent;

    /** @var string Sanitised start URL suffixed with a timestamp; names the data folder. */
    protected $project;

    /** @var RobotsTxt Robots.txt-style rules built from the `$ignore` constructor argument. */
    protected $ignore;

    /** @var int Maximum number of crawl passes (click depth) before recording. */
    protected $limit;

    /** @var Recorder Receives cached pages, link relations and the final URL list. */
    protected $recorder;

    /** @var mixed Robots.txt taken from the first successful harvest, reused for later ones. */
    protected $robotsTxt;

    /** @var mixed Request of the last successful harvest, passed on to the next Harvest::fromUrl(). */
    protected $request;

    /** @var int Pause between two requests, in microseconds (given to usleep()). */
    protected $wait = 0;

    /** @var int Index of the crawl pass currently running (0 for the start URL). */
    protected $currentClick = 0;

    /** @var int Number of URLs processed so far (only used by the debug output). */
    protected $counter = 0;

    /** @var array<string, Url|null> Known URLs keyed by their string form; null until harvested. */
    protected $urls = [];

    /**
     * Content type of the last page that was harvested and cached.
     *
     * harvest() used to create this property dynamically, which static
     * analysis flagged and which is deprecated since PHP 8.2. It is declared
     * public because that is the visibility a dynamic property had.
     *
     * NOTE(review): every neighbouring assignment targets `$url`; this value was
     * possibly meant to be stored on the Url as well. Confirm against Url.
     *
     * @var mixed
     */
    public $content_type;

    /**
     * @param string $startUrl    First URL to crawl
     * @param string $ignore      Robots.txt-formatted rules listing what must not be crawled
     * @param int    $limit       Maximum number of crawl passes (click depth)
     * @param string $userAgent   User agent used for the rules and the requests
     * @param int    $cacheMethod Cache strategy forwarded to the Recorder
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID
    ) {
        $this->urls[$startUrl] = null;
        // Strip every character that is not safe in a folder name, then append a timestamp.
        $this->project = preg_replace("([^\w\s\d\-_~,;\[\]\(\).])", '', $startUrl).'-'.date('ymd-Hi');
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;

        // Must come after $this->project is set: getDataFolder() depends on it.
        $this->recorder = new Recorder($this->getDataFolder(), $cacheMethod);
    }

    /**
     * @return string Path of the folder holding this crawl's data
     */
    public function getDataFolder()
    {
        return __DIR__.'/../data/'.$this->project;
    }

    /**
     * Sets the pause observed after each processed URL.
     *
     * @param int $microSeconds Pause length in microseconds
     */
    public function setWaitBetweenRequest(int $microSeconds = 100000)
    {
        $this->wait = $microSeconds;
    }

    /**
     * Runs one crawl pass over the known URLs, then either records the result
     * or recurses for the next click level.
     *
     * @param bool $debug When true, progress is echoed to standard output
     *
     * @return mixed Whatever Recorder::record() returns once crawling stops
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach iterates over a copy of the array, so URLs discovered during
        // this pass are only visited by the next (recursive) pass.
        foreach ($this->urls as $urlToParse => $url) {
            // Already crawled: harvest() assigns a boolean to can_be_crawled.
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            if ($debug) {
                echo '    '.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $harvest = $this->harvest($urlToParse);

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->wait);
        }

        ++$this->currentClick;

        // Stop when the pass found nothing left to process or the depth limit is hit.
        $record = $nothingUpdated || $this->currentClick >= $this->limit;

        return $record ? $this->recorder->record($this->urls) : $this->crawl($debug);
    }

    /**
     * Remembers the robots.txt of the first successful harvest.
     *
     * @param Harvest|null $harvest Result of harvest(); ignored unless it is a Harvest
     *
     * @return $this
     */
    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    /**
     * Remembers the request of the latest successful harvest so that the next
     * Harvest::fromUrl() call can receive it.
     *
     * @param Harvest|null $harvest Result of harvest(); ignored unless it is a Harvest
     *
     * @return $this
     */
    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    /**
     * Injects the cached robots.txt (if any) into a fresh harvest.
     *
     * @return $this
     */
    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    /**
     * Fetches one URL, fills its Url record and registers the internal links found.
     *
     * @param string $urlToParse URL to fetch; must be a key of $this->urls or will be added
     *
     * @return Harvest|null The harvest, or null when the URL is ignored or the request failed
     */
    protected function harvest(string $urlToParse)
    {
        $url = $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($urlToParse, $this->currentClick);

        $url->updated_at = date('Ymd');
        $url->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return null;
        }

        $harvest = Harvest::fromUrl(
            $urlToParse,
            $this->userAgent,
            'en,en-US;q=0.5',
            $this->request
        );

        // Anything other than a Harvest instance is treated as a network failure.
        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return null;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable();

        // Internal links to follow on the next pass; stays empty when there are none.
        $links = [];

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                // NOTE(review): static analysis reported this comparison as always
                // false (type of Harvest::getType() vs. Harvest::LINK_INTERNAL).
                // NOTE(review): if getRedirection() returns a plain string, the loop
                // below will fail on ->getPageUrl(); confirm the expected type.
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [$redir] : [];
            }
        } else {
            $this->recorder->cache($harvest, $url);

            $this->content_type = $harvest->getResponse()->getContentType();

            $mimeType = $harvest->getResponse()->getMimeType();
            // 'text/html' is stored as 1; any other mime type is stored as-is.
            $url->mime_type = 'text/html' == $mimeType ? 1 : $mimeType;

            $this->recorder->recordOutboundLink($url, $harvest->getLinks());

            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);

            $url->ratio_text_code = $harvest->getRatioTxtCode();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', $harvest->getKws()).',';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
        }

        foreach ($links as $link) {
            $linkUrl = $link->getPageUrl();
            // Newly discovered URLs belong to the next click level.
            $this->urls[$linkUrl] = $this->urls[$linkUrl] ?? new Url($linkUrl, ($this->currentClick + 1));
            $this->recorder->recordInboundLink($url, $this->urls[$linkUrl]);
            ++$this->urls[$linkUrl]->inboundlinks;
        }

        return $harvest;
    }
}
195