Passed: Push on master ( 7e03c5...6dfa53 ) by Dev
34:30 queued, 19:18 created

Crawler::harvest() (rated B)

Complexity
  Conditions: 8
  Paths: 15

Size
  Total Lines: 63
  Code Lines: 32

Duplication
  Lines: 0
  Ratio: 0 %

Code Coverage
  Tests: 28
  CRAP Score: 8.0189

Importance
  Changes: 3
  Bugs: 0
  Features: 0
Metric   Value     Corresponds to (see above)
cc       8         Conditions
eloc     32        Code Lines
c        3         Changes
b        0         Bugs
f        0         Features
nc       15        Paths
nop      1
dl       0         Duplication Lines
loc      63        Total Lines
rs       8.1635
ccs      28        Tests
cts      30
cp       0.9333
crap     8.0189    CRAP Score
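
The crap value is consistent with the standard CRAP formula (cyclomatic complexity squared, scaled by the untested fraction of the code, plus the complexity itself), assuming cp above is the coverage ratio. A quick check in PHP:

<?php

// Sanity check of the reported CRAP score, assuming the standard formula
// CRAP = cc^2 * (1 - coverage)^3 + cc and that cp is the coverage ratio.
$cc = 8;            // cyclomatic complexity (Conditions above)
$coverage = 0.9333; // cp from the metric table

$crap = $cc ** 2 * (1 - $coverage) ** 3 + $cc;

echo round($crap, 4); // prints 8.019, in line with the reported 8.0189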

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. And when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method; the comment is then a good starting point for naming that new method.

Commonly applied refactorings include Extract Method (sketched below).
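
As a sketch of Extract Method applied to the harvest() method shown below: the run of link-count assignments could move into one small, well-named helper. The helper name is hypothetical; this is an illustration, not code from the package.

// Sketch only: a hypothetical helper inside the Crawler class that would
// replace the six link-count assignments currently inlined in harvest().
protected function harvestLinksStats(Url $url, Harvest $harvest): void
{
    $url->links           = count($harvest->getLinks());
    $url->links_duplicate = $harvest->getNbrDuplicateLinks();
    $url->links_internal  = count($harvest->getLinks(Harvest::LINK_INTERNAL));
    $url->links_self      = count($harvest->getLinks(Harvest::LINK_SELF));
    $url->links_sub       = count($harvest->getLinks(Harvest::LINK_SUB));
    $url->links_external  = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
}

// In harvest(), those lines then collapse to a single self-describing call:
//     $this->harvestLinksStats($url, $harvest);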

<?php

namespace PiedWeb\SeoPocketCrawler;

use PiedWeb\UrlHarvester\Harvest;
use PiedWeb\UrlHarvester\Indexable;
use PiedWeb\UrlHarvester\Link;

class Crawler
{
    const FOLLOW = 1;
    const NOFOLLOW = 2;

    protected $recorder;

    protected $robotsTxt;
    protected $request;

    protected $currentClick = 0;

    protected $counter = 0;

    protected $urls = [];

    /**
     * @var CrawlerConfig
     */
    protected $config;

    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $wait = 100000, // microseconds!
        ?string $dataDirectory = null
    ) {
        $this->config = new CrawlerConfig($startUrl, $ignore, $limit, $userAgent, $cacheMethod, $wait, $dataDirectory);

        $this->urls[$this->config->getStartUrl()] = null;

        $this->recorder = new Recorder($this->config->getDataFolder(), $this->config->getCacheMethod());

        $this->config->recordConfig();
    }

    public function getConfig(): CrawlerConfig
    {
        return $this->config;
    }

    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        foreach ($this->urls as $urlToParse => $url) {
            if (null !== $url && (false === $url->can_be_crawled || true === $url->can_be_crawled)) { // already crawled
                continue;
            } elseif ($this->currentClick > $this->config->getLimit()) {
                continue;
            }

            if ($debug) {
                echo $this->counter.'/'.count($this->urls).'    '.$this->config->getBase().$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            if (null === $this->urls[$urlToParse]) {
                $url = $this->urls[$urlToParse] = new Url($this->config->getBase().$urlToParse, $this->currentClick);
            }

            $harvest = false === $this->canBeCrawled($url) ? null : $this->harvest($url);
            $this->urls[$urlToParse]->setDiscovered(count($this->urls));

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->config->getWait());

            if ($this->counter / 500 == round($this->counter / 500)) {
                echo $debug ? '    --- auto-save'.PHP_EOL : '';
                $this->recorder->record($this->urls);
            }
        }

        ++$this->currentClick;

        // Record after each level:
        $this->recorder->record($this->urls);

        $record = $nothingUpdated || $this->currentClick >= $this->config->getLimit();

        return $record ? null : $this->crawl($debug);
    }

    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    protected function getHarvester(Url $url)
    {
        return Harvest::fromUrl(
            $this->config->getBase().$url->uri,
            $this->config->getUserAgent(),
            'en,en-US;q=0.5',
            $this->request
        );
    }

    protected function canBeCrawled(Url $url)
    {
        if (null === $url->can_be_crawled) {
            $url->can_be_crawled = $this->config->getVirtualRobots()
            ->allows($this->config->getBase().$url->uri, $this->config->getUserAgent());
        }

        return $url->can_be_crawled;
    }

    protected function harvest(Url $url): ?Harvest
    {
        $harvest = $this->getHarvester($url);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return null;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable(); // slow ~30%

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [new Link($redir)] : [];
            }
        } else {
            $this->recorder->cache($harvest, $url);

            $mimeType = $harvest->getResponse()->getMimeType();
            $url->mime_type = 'text/html' == $mimeType ? 1 : $mimeType;

            $this->recorder->recordOutboundLink($url, $harvest->getLinks()); // ~10%
            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);

            //$url->ratio_text_code = $harvest->getRatioTxtCode(); // Slow ~30%
            $url->words_count = $harvest->getTextAnalysis()->getWordNumber();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            /*
             * I removed it from the default crawler; you can extend this class and re-enable this code
             *
            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }
            *
            $url->kws = ','.implode(',', array_keys($harvest->getKws())).','; // Slow ~20%
            /**/

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
            $url->h1 = $url->title == $url->h1 ? '=' : $url->h1;
        }

        if (isset($links)) {
            $this->updateInboundLinksCounter($url, $links, $harvest);
            $this->recorder->recordLinksIndex($this->config->getBase(), $url, $this->urls, $harvest->getLinks());
        }

        return $harvest;
    }

    public function updateInboundLinksCounter(Url $url, array $links, Harvest $harvest)
    {
        $everAdd = [];
        foreach ($links as $link) {
            $newUri = substr($link->getPageUrl(), strlen($this->config->getBase()));
            $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($link->getPageUrl(), ($this->currentClick + 1));
            if (!isset($everAdd[$newUri])) {
                $everAdd[$newUri] = 1;
                if (!$link->mayFollow() || !$harvest->mayFollow()) {
                    ++$this->urls[$newUri]->inboundlinks_nofollow;
                    $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::NOFOLLOW);
                } else {
                    ++$this->urls[$newUri]->inboundlinks;
                    $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::FOLLOW);
                }
            }
        }
    }
}
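
For context, here is a minimal usage sketch built only from the constructor and crawl() signatures above; the concrete argument values are illustrative assumptions, not documented defaults.

<?php

use PiedWeb\SeoPocketCrawler\Crawler;
use PiedWeb\SeoPocketCrawler\Recorder;

// Illustrative values only; signatures taken from the class above.
$crawler = new Crawler(
    'https://example.org/',  // $startUrl
    '',                      // $ignore
    3,                       // $limit, compared against the click depth in crawl()
    'SeoPocketCrawler/0.1',  // $userAgent (any identifying string)
    Recorder::CACHE_ID,      // $cacheMethod
    100000,                  // $wait, in microseconds
    null                     // $dataDirectory
);

$crawler->crawl(true); // true enables the debug output printed by crawl()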