Passed
Push — master ( 7e03c5...6dfa53 )
by Dev
34:30 queued 19:18
created

Crawler::getHarvest()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 7
ccs 6
cts 6
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use PiedWeb\UrlHarvester\Link;
8
9
/**
 * Level-by-level site crawler.
 *
 * Starting from the configured start URL, each call to crawl() processes every
 * known URL that has not been crawled yet, records what was harvested through
 * the Recorder, registers the internal links found as URLs for the next level,
 * then recurses until nothing new was processed or the configured limit is hit.
 */
class Crawler
{
    /** Flag passed to the recorder for an inbound link that may be followed. */
    public const FOLLOW = 1;

    /** Flag passed to the recorder for an inbound link that must not be followed. */
    public const NOFOLLOW = 2;

    /**
     * @var Recorder
     */
    protected $recorder;

    /**
     * Robots.txt data taken from the first successful harvest and re-injected
     * into every following harvest (see cacheRobotsTxt() / loadRobotsTxt()).
     *
     * @var mixed|null exact type is whatever Harvest::getRobotsTxt() returns
     */
    protected $robotsTxt;

    /**
     * Request object of the last successful harvest, handed back to
     * Harvest::fromUrl() for the next one (see cacheRequest() / getHarvester()).
     *
     * @var mixed|null exact type is whatever the harvest response's getRequest() returns
     */
    protected $request;

    /**
     * Current crawl level; incremented once per crawl() pass.
     *
     * @var int
     */
    protected $currentClick = 0;

    /**
     * Number of URLs processed so far (across all levels).
     *
     * @var int
     */
    protected $counter = 0;

    /**
     * Known URLs, keyed by their URI relative to the configured base.
     * A null value means "known but Url object not created yet".
     *
     * @var array<string, Url|null>
     */
    protected $urls = [];

    /**
     * @var CrawlerConfig
     */
    protected $config;

    /**
     * @param string      $startUrl      URL the crawl starts from
     * @param string      $ignore        forwarded as-is to CrawlerConfig
     * @param int         $limit         level limit, compared against the current level counter
     * @param string      $userAgent     user agent used for requests and robots checks
     * @param int         $cacheMethod   one of the Recorder::CACHE_* constants
     * @param int         $wait          pause between two requests, in microseconds (passed to usleep())
     * @param string|null $dataDirectory forwarded as-is to CrawlerConfig
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $wait = 100000, // microseconds!
        ?string $dataDirectory = null
    ) {
        $this->config = new CrawlerConfig($startUrl, $ignore, $limit, $userAgent, $cacheMethod, $wait, $dataDirectory);

        // Seed the queue: the start URL is known but has no Url object yet.
        $this->urls[$this->config->getStartUrl()] = null;

        $this->recorder = new Recorder($this->config->getDataFolder(), $this->config->getCacheMethod());

        $this->config->recordConfig();
    }

    public function getConfig(): CrawlerConfig
    {
        return $this->config;
    }

    /**
     * Crawl one level, then recurse into the next one until the crawl is over.
     *
     * @param bool $debug when true, progress information is echoed
     *
     * @return null always null once the crawl is finished
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach works on a snapshot of the array: URLs registered while harvesting
        // are not visited in this pass but in the next (recursive) one.
        foreach ($this->urls as $urlToParse => $url) {
            // Already crawled: the robots decision has been taken (true or false).
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            if ($this->currentClick > $this->config->getLimit()) {
                continue;
            }

            if ($debug) {
                echo $this->counter.'/'.count($this->urls).'    '.$this->config->getBase().$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            if (null === $this->urls[$urlToParse]) {
                $url = $this->urls[$urlToParse] = new Url($this->config->getBase().$urlToParse, $this->currentClick);
            }

            $harvest = false === $this->canBeCrawled($url) ? null : $this->harvest($url);
            $this->urls[$urlToParse]->setDiscovered(count($this->urls));

            $this->cacheRobotsTxt($harvest);

            $this->cacheRequest($harvest);

            usleep($this->config->getWait());

            // Auto-save every 500 processed URLs.
            if (0 === $this->counter % 500) {
                if ($debug) {
                    echo '    --- auto-save'.PHP_EOL;
                }
                $this->recorder->record($this->urls);
            }
        }

        ++$this->currentClick;

        // Record after each level.
        $this->recorder->record($this->urls);

        $finished = $nothingUpdated || $this->currentClick >= $this->config->getLimit();

        // Recursion (rather than a loop) is kept so an overridden crawl() is still dispatched.
        return $finished ? null : $this->crawl($debug);
    }

    /**
     * Keep the robots.txt data of the first successful harvest for later reuse.
     *
     * @param Harvest|null $harvest result of harvest(); anything that is not a Harvest is ignored
     *
     * @return $this
     */
    protected function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    /**
     * Keep the request object of the latest successful harvest for later reuse.
     *
     * @param Harvest|null $harvest result of harvest(); anything that is not a Harvest is ignored
     *
     * @return $this
     */
    protected function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    /**
     * Inject the cached robots.txt data (if any) into the given harvest.
     *
     * @return $this
     */
    protected function loadRobotsTxt(Harvest $harvest)
    {
        if (null !== $this->robotsTxt) {
            $harvest->setRobotsTxt($this->robotsTxt);
        }

        return $this;
    }

    /**
     * Request the given URL through Harvest::fromUrl(), reusing the cached request.
     *
     * @return Harvest|mixed harvest() treats any non-Harvest return value as a network error
     */
    protected function getHarvester(Url $url)
    {
        return Harvest::fromUrl(
            $this->config->getBase().$url->uri,
            $this->config->getUserAgent(),
            'en,en-US;q=0.5',
            $this->request
        );
    }

    /**
     * Tell whether the configured (virtual) robots rules allow crawling the URL.
     * The decision is computed once and memoised on the Url object.
     *
     * @return bool|mixed whatever the virtual robots' allows() returned
     */
    protected function canBeCrawled(Url $url)
    {
        if (null === $url->can_be_crawled) {
            $url->can_be_crawled = $this->config->getVirtualRobots()
                ->allows($this->config->getBase().$url->uri, $this->config->getUserAgent());
        }

        return $url->can_be_crawled;
    }

    /**
     * Fetch the URL, fill the Url object with the collected metrics and register
     * the internal links found (or the internal redirection target for a 3XX).
     *
     * @return Harvest|null null when the request did not yield a Harvest
     */
    protected function harvest(Url $url): ?Harvest
    {
        $harvest = $this->getHarvester($url);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return null;
        }

        $this->loadRobotsTxt($harvest);

        $url->indexable = $harvest->isIndexable(); // slow ~30%

        // Stays null when there is nothing to register (3XX without a redirection target).
        $links = null;

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            if (false !== $redir) {
                // Only an internal redirection target is queued for crawling.
                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [new Link($redir)] : [];
            }
        } else {
            $this->recorder->cache($harvest, $url);

            // 'text/html' is stored as the integer 1, any other mime type verbatim.
            $mimeType = $harvest->getResponse()->getMimeType();
            $url->mime_type = 'text/html' === $mimeType ? 1 : $mimeType;

            $this->recorder->recordOutboundLink($url, $harvest->getLinks()); // ~10%
            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));
            $links = $harvest->getLinks(Harvest::LINK_INTERNAL);

            //$url->ratio_text_code = $harvest->getRatioTxtCode(); // Slow ~30%
            $url->words_count = $harvest->getTextAnalysis()->getWordNumber();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            /*
             * Removed from the default crawler; extend this class to re-establish it:
             *
             * $breadcrumb = $harvest->getBreadCrumb();
             * if (is_array($breadcrumb)) {
             *     $url->breadcrumb_level = count($breadcrumb);
             *     $url->breadcrumb_first = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
             *     $url->breadcrumb_text = $harvest->getBreadCrumb('//');
             * }
             *
             * $url->kws = ','.implode(',', array_keys($harvest->getKws())).','; // Slow ~20%
             */

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';
            // An h1 identical to the title is stored as '=' instead of repeating it.
            $url->h1 = $url->title == $url->h1 ? '=' : $url->h1;
        }

        if (null !== $links) {
            $this->updateInboundLinksCounter($url, $links, $harvest);
            $this->recorder->recordLinksIndex($this->config->getBase(), $url, $this->urls, $harvest->getLinks());
        }

        return $harvest;
    }

    /**
     * Register every linked page as a known URL (for the next level) and count
     * one inbound link per distinct target, as follow or nofollow.
     *
     * @param Url     $url     page the links were found on
     * @param array   $links   Link objects found on that page
     * @param Harvest $harvest harvest of the source page (a page-level nofollow applies to all its links)
     */
    public function updateInboundLinksCounter(Url $url, array $links, Harvest $harvest)
    {
        // Targets already counted for this source page: a target linked twice counts once.
        $alreadyCounted = [];

        foreach ($links as $link) {
            // URLs are indexed by their URI relative to the base.
            $newUri = substr($link->getPageUrl(), strlen($this->config->getBase()));
            $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($link->getPageUrl(), ($this->currentClick + 1));

            if (isset($alreadyCounted[$newUri])) {
                continue;
            }
            $alreadyCounted[$newUri] = 1;

            if (!$link->mayFollow() || !$harvest->mayFollow()) {
                ++$this->urls[$newUri]->inboundlinks_nofollow;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::NOFOLLOW);
            } else {
                ++$this->urls[$newUri]->inboundlinks;
                $this->recorder->recordInboundLink($url, $this->urls[$newUri], self::FOLLOW);
            }
        }
    }
}
236