Completed
Push to master (1c8751...9578ed) by Dev
10:51 (queued 09:36)

Crawler (rating: A)

Complexity        Total Complexity 41
Size/Duplication  Total Lines 254 / Duplicated Lines 0%
Test Coverage     Coverage 91.74%
Importance        Changes 0
Metric  Value
wmc     41      (weighted method count)
eloc    125     (executable lines of code)
dl      0       (duplicated lines)
loc     254     (lines of code)
ccs     111     (covered statements)
cts     121     (total statements)
cp      0.9174  (coverage, ccs / cts)
rs      9.1199
c       0
b       0
f       0

10 Methods

Rating   Name   Duplication (lines)   Size (lines)   Complexity
A getDataFolder() 0 3 1
A getId() 0 3 1
A setBaseAndReturnNormalizedStartUrl() 0 10 4
C crawl() 0 46 12
A cacheRobotsTxt() 0 7 3
A loadRobotsTxt() 0 7 2
A __construct() 0 26 1
C harvest() 0 75 13
A getHarvest() 0 7 1
A cacheRequest() 0 7 3

How to fix: Complexity

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
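In Crawler, one such group is the harvest-caching trio cacheRobotsTxt(), cacheRequest() and loadRobotsTxt() together with the $robotsTxt and $request fields. A minimal sketch of Extract Class applied to that group, assuming a hypothetical HarvestCache class in the same namespace (the name and API below are illustrative, not part of the project):

<?php

namespace PiedWeb\SeoPocketCrawler;

use PiedWeb\UrlHarvester\Harvest;

// Hypothetical class extracted from Crawler: it owns the robots.txt and
// request objects that Crawler currently caches between harvests.
class HarvestCache
{
    protected $robotsTxt;
    protected $request;

    public function cacheFrom($harvest): void
    {
        if (!$harvest instanceof Harvest) {
            return;
        }

        // Keep the first robots.txt seen (same rule as Crawler::cacheRobotsTxt()).
        if (null === $this->robotsTxt) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        // Keep the latest non-null request (same rule as Crawler::cacheRequest()).
        if (null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }
    }

    public function applyRobotsTxtTo(Harvest $harvest): void
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }
    }

    public function getRequest()
    {
        return $this->request;
    }
}

Crawler would then keep a single $harvestCache property, call $harvestCache->cacheFrom($harvest) from crawl() and pass $harvestCache->getRequest() to Harvest::fromUrl(), removing two fields and three methods from the class.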

While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, apply Extract Interface as well.
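For Crawler, the public surface reported above is small (getId(), getDataFolder(), crawl()), so an extracted interface could be as narrow as the sketch below (the CrawlerInterface name is hypothetical):

<?php

namespace PiedWeb\SeoPocketCrawler;

// Hypothetical interface limited to Crawler's current public methods;
// consumers would type-hint against it instead of the concrete class.
interface CrawlerInterface
{
    public function getId();

    public function getDataFolder();

    public function crawl(bool $debug = false);
}

Crawler would then declare class Crawler implements CrawlerInterface, so callers that only start crawls or read results no longer depend on the harvesting internals. The inspected source of the class follows.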

<?php

namespace PiedWeb\SeoPocketCrawler;

use PiedWeb\UrlHarvester\Harvest;
use PiedWeb\UrlHarvester\Indexable;
use Spatie\Robots\RobotsTxt;

class Crawler
{
    /**
     * @var string contains the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id
     */
    protected $id;

    /**
     * @var RobotsTxt pages to ignore during the crawl
     */
    protected $ignore;

    /**
     * @var int max depth to crawl
     */
    protected $limit;

    /**
     * @var string contains https://domain.tld from the start url
     */
    protected $base;

    /**
     * @var bool
     */
    protected $fromCache;

    protected $recorder;
    protected $robotsTxt;
    protected $request;
    protected $wait = 0;

    protected $currentClick = 0;

    protected $counter = 0;

    protected $urls = [];

    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000
    ) {
        $startUrl = $this->setBaseAndReturnNormalizedStartUrl($startUrl);
        $this->urls[$startUrl] = null;
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->wait = $waitInMicroSeconds;

        $this->recorder = new Recorder($this->getDataFolder(), $cacheMethod);

        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $startUrl,
            'base' => $this->base,
            'ignore' => $ignore,
            'limit' => $limit,
            'userAgent' => $userAgent,
            'cacheMethod' => $cacheMethod,
            'wait' => $waitInMicroSeconds,
        ]));
    }

    public function getId()
    {
        return $this->id;
    }

    protected function setBaseAndReturnNormalizedStartUrl(string $url): string
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;
        $url = substr($url, strlen($this->base));

        return ('/' != $url[0] ? '/' : '').$url;
    }

    public function getDataFolder()
    {
        return __DIR__.'/../data/'.$this->id;
    }

    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        foreach ($this->urls as $urlToParse => $url) {
            if (null !== $url && (false === $url->can_be_crawled || true === $url->can_be_crawled)) { // already crawled
                continue;
            } elseif ($this->currentClick > $this->limit) {
                continue;
            }

            if ($debug) {
                echo $this->counter.'/'.count($this->urls).'    '.$this->base.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $harvest = $this->harvest($urlToParse);
            $this->urls[$urlToParse]->setDiscovered(count($this->urls));

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->wait);

            if ($this->counter / 500 == round($this->counter / 500)) {
                echo $debug ? '    --- auto-save'.PHP_EOL : '';
                $this->recorder->record($this->urls);
            }
        }

        ++$this->currentClick;

        // Record after each level:
        $this->recorder->record($this->urls);

        $record = $nothingUpdated || $this->currentClick >= $this->limit;

        return $record ? null : $this->crawl($debug);
    }

    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    protected function getHarvest(Url $url)
    {
        return Harvest::fromUrl(
            $this->base.$url->uri,
            $this->userAgent,
            'en,en-US;q=0.5',
            $this->request
        );
    }

    protected function harvest(string $urlToParse)
    {
        $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($this->base.$urlToParse, $this->currentClick);
        $url = $this->urls[$urlToParse];

        $url->can_be_crawled = $this->ignore->allows($this->base.$urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return;
        }

        $harvest = $this->getHarvest($url);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable();

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [$redir] : [];
                // Analysis note: "The condition Harvest::LINK_INTERNAL === $harvest->getType($redir) is always false."
            }
        } else {
            $this->recorder->cache($harvest, $url);

            $mimeType = $harvest->getResponse()->getMimeType();
            $url->mime_type = 'text/html' == $mimeType ? 1 : $mimeType;

            $this->recorder->recordOutboundLink($url, $harvest->getLinks());

            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);

            $url->ratio_text_code = $harvest->getRatioTxtCode();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', array_keys($harvest->getKws())).',';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
            $url->h1 = $url->title == $url->h1 ? '=' : $url->h1;
        }

        $everAdd = [];
        if (isset($links)) {
            foreach ($links as $link) {
                $newUri = substr($link->getPageUrl(), strlen($this->base));
                $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($link->getPageUrl(), ($this->currentClick + 1));
                if (!isset($everAdd[$newUri])) {
                    $everAdd[$newUri] = 1;
                    $this->recorder->recordInboundLink($url, $this->urls[$newUri]);
                    ++$this->urls[$newUri]->inboundlinks;
                }
            }
        }

        return $harvest;
    }
}
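Finally, a minimal usage sketch of the class as defined above; the start URL, robots.txt rules, depth and user agent are placeholder values, and the output comments are only indicative:

<?php

use PiedWeb\SeoPocketCrawler\Crawler;

// Placeholder arguments for illustration only.
$crawler = new Crawler(
    'https://example.org/',              // start URL (must pass FILTER_VALIDATE_URL)
    "User-agent: *\nDisallow: /private", // robots.txt rules used as the ignore list
    3,                                   // max crawl depth (clicks from the start URL)
    'SeoPocketCrawler/Example'           // user agent sent with every request
);

$crawler->crawl(true); // true prints progress; results are saved by the Recorder

echo $crawler->getId().PHP_EOL;         // e.g. 2006241035-example.org (date + host)
echo $crawler->getDataFolder().PHP_EOL; // folder holding config.json and the recorded data

The two optional constructor arguments keep their defaults here: the cache method falls back to Recorder::CACHE_ID and the delay between requests to 100000 microseconds.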