Crawler   A
last analyzed

Complexity

Total Complexity 27

Size/Duplication

Total Lines 160
Duplicated Lines 0 %

Test Coverage

Coverage 96.49%

Importance

Changes 3
Bugs 0 Features 0
Metric Value
eloc 65
dl 0
loc 160
ccs 55
cts 57
cp 0.9649
rs 10
c 3
b 0
f 0
wmc 27

9 Methods

Rating   Name   Duplication   Size   Complexity  
A getConfig() 0 3 1
A printDebugCrawlUrl() 0 4 2
A autosave() 0 5 4
B crawl() 0 56 10
A __construct() 0 17 1
A canBeCrawled() 0 8 2
A printDebugInitCrawlLoop() 0 5 2
A getRecorder() 0 3 1
A updateInboundLinksAndUrlsToParse() 0 14 4
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
7
/**
 * Crawls a website level by level, starting from the URL held by the CrawlerConfig.
 *
 * Every call to crawl() walks the URLs known at that moment, harvests the ones
 * whose robots verdict is still unknown, records the result, then recurses for
 * the next level until nothing new was processed or the configured limit is reached.
 */
class Crawler
{
    /**
     * @var int number of URLs processed between two automatic saves (0 = autosaving disabled)
     */
    public const AUTOSAVE = 500;

    /** @var string name of the class instantiated for each URL to crawl (built with the Url and the config) */
    protected $harvester = '\PiedWeb\SeoPocketCrawler\CrawlerUrl';

    /** @var int current crawl level, incremented at the end of each pass of crawl() */
    protected $currentClick = 0;

    /** @var int number of URLs processed so far (drives the debug output and the autosave) */
    protected $counter = 0;

    /** @var array<string, Url|null> known URLs indexed by URI; null until the Url object is created */
    protected $urls = [];

    /**
     * @var CrawlerConfig
     */
    protected $config;

    /** @var bool whether progress information is echoed while crawling */
    protected $debug = true;

    /**
     * Builds the crawl configuration, registers the start URL and records the configuration.
     *
     * Every argument except $debug is forwarded as-is to CrawlerConfig.
     *
     * @param string      $startUrl
     * @param string      $ignore
     * @param int         $limit         crawl() stops recursing once the current level reaches this value
     * @param string      $userAgent
     * @param int         $cacheMethod
     * @param int         $wait          pause after each harvested URL, in microseconds
     * @param bool        $debug         echo progress information when true
     * @param string|null $dataDirectory
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $wait = 100000, // microseconds!
        bool $debug = true,
        ?string $dataDirectory = null
    ) {
        $this->config = new CrawlerConfig($startUrl, $ignore, $limit, $userAgent, $cacheMethod, $wait, $dataDirectory);

        // The start URL is queued without a Url object; crawl() creates it on first visit.
        $this->urls[$this->config->getStartUrl()] = null;

        $this->config->recordConfig();

        $this->debug = $debug;
    }

    /**
     * @return CrawlerConfig the configuration built by the constructor
     */
    public function getConfig(): CrawlerConfig
    {
        return $this->config;
    }

    /**
     * Crawls one level, records the result, then recurses for the next level.
     *
     * The recursion stops when the pass processed no URL at all or when the
     * level counter reaches the configured limit.
     *
     * @return null
     */
    public function crawl()
    {
        $nothingUpdated = true;

        $this->printDebugInitCrawlLoop();

        // foreach iterates over a copy of the array: URLs discovered during this
        // pass are only visited by the next (recursive) call.
        foreach ($this->urls as $urlToParse => $url) {
            // Already crawled: the robots verdict of this URL has been resolved.
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            // No depth check here: the limit is enforced once per level, after the loop.

            $this->printDebugCrawlUrl($urlToParse);

            $nothingUpdated = false;
            ++$this->counter;

            if (null === $this->urls[$urlToParse]) {
                $url = $this->urls[$urlToParse] = new Url($this->config->getBase().$urlToParse, $this->currentClick);
            }

            if (false !== $this->canBeCrawled($url)) {
                $crawlerUrl = new $this->harvester($url, $this->config);

                // Links are only collected when the harvester is an actual Harvest instance.
                if ($crawlerUrl->getHarvester() instanceof Harvest) {
                    $this->updateInboundLinksAndUrlsToParse($url, $crawlerUrl->getLinks());
                    $this->config->getRecorder()->recordLinksIndex(
                        $this->config->getBase(),
                        $url,
                        $this->urls,
                        $crawlerUrl->getHarvester()->getLinks()
                    );
                }

                $this->urls[$urlToParse]->setDiscovered(count($this->urls));

                $this->config->cacheRobotsTxt($crawlerUrl->getHarvester());
                $this->config->cacheRequest($crawlerUrl->getHarvester());

                usleep($this->config->getWait());
            }

            $this->autosave();
        }

        ++$this->currentClick;

        // Record after each level:
        $this->config->getRecorder()->record($this->urls);

        $finished = $nothingUpdated || $this->currentClick >= $this->config->getLimit();

        return $finished ? null : $this->crawl();
    }

    /**
     * Records the known URLs each time the processed-URL counter hits a multiple of AUTOSAVE.
     *
     * Does nothing when AUTOSAVE is 0 (autosaving disabled) or before the first URL was processed.
     */
    protected function autosave()
    {
        // The first test short-circuits the modulo, so AUTOSAVE = 0 never divides by zero.
        if (self::AUTOSAVE <= 0 || 0 === $this->counter || 0 !== $this->counter % self::AUTOSAVE) {
            return;
        }

        if ($this->debug) {
            echo '    --- auto-save'.PHP_EOL;
        }

        $this->getRecorder()->record($this->urls);
    }

    /**
     * Tells whether the virtual robots rules allow crawling the URL for the configured user agent.
     *
     * The verdict is computed once and stored on the Url object (can_be_crawled).
     *
     * @param Url $url
     *
     * @return mixed the stored verdict, as returned by the virtual robots' allows()
     */
    protected function canBeCrawled(Url $url)
    {
        if (null === $url->can_be_crawled) {
            $url->can_be_crawled = $this->config->getVirtualRobots()
                ->allows($this->config->getBase().$url->uri, $this->config->getUserAgent());
        }

        return $url->can_be_crawled;
    }

    /**
     * Registers the targets of the links found on a page and updates their inbound link counters.
     *
     * A target not yet known is added to the URL list at the next crawl level.
     * Each target is counted and recorded at most once per call, even when the
     * page links to it several times; only the first link to a target decides
     * between the follow and nofollow counters.
     *
     * @param Url   $url   page the links were found on
     * @param array $links links harvested from that page
     */
    public function updateInboundLinksAndUrlsToParse(Url $url, array $links)
    {
        $alreadyCounted = [];

        foreach ($links as $link) {
            $newUri = $link->getUrl()->getRelativizedDocumentUrl();
            $this->urls[$newUri] = $this->urls[$newUri] ?? new Url($link->getPageUrl(), ($this->currentClick + 1));

            if (isset($alreadyCounted[$newUri])) {
                continue;
            }
            $alreadyCounted[$newUri] = 1;

            if ($link->mayFollow()) {
                ++$this->urls[$newUri]->inboundlinks;
            } else {
                ++$this->urls[$newUri]->inboundlinks_nofollow;
            }

            $this->getRecorder()->recordInboundLink($link, $url, $this->urls[$newUri]);
        }
    }

    /**
     * In debug mode, echoes the progress counters followed by the full URL about to be processed.
     *
     * @param string $urlToParse URI of the URL, appended to the configured base
     */
    protected function printDebugCrawlUrl(string $urlToParse)
    {
        if (! $this->debug) {
            return;
        }

        echo $this->counter.'/'.count($this->urls).'    '.$this->config->getBase().$urlToParse.PHP_EOL;
    }

    /**
     * In debug mode, echoes a banner with the number of URLs crawled and found so far.
     */
    protected function printDebugInitCrawlLoop()
    {
        if (! $this->debug) {
            return;
        }

        echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
            .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
    }

    /**
     * Shortcut to the recorder held by the configuration.
     *
     * @return mixed whatever CrawlerConfig::getRecorder() returns
     */
    protected function getRecorder()
    {
        return $this->config->getRecorder();
    }
}
169