Completed
Push — master ( 62aac0...15756a )
by Dev
02:10
created

CrawlerContinue   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 50
Duplicated Lines 0 %

Test Coverage

Coverage 93.75%

Importance

Changes 0
Metric Value
wmc 9
eloc 30
dl 0
loc 50
ccs 30
cts 32
cp 0.9375
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 19 2
B loadFromPreviousCrawl() 0 27 7
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use Spatie\Robots\RobotsTxt;
6
use League\Csv\Reader;
7
8
/**
 * Crawler that resumes a previous crawl from the files stored in its data folder.
 *
 * The constructor restores the settings saved in `config.json` and then rebuilds
 * the in-memory URL list from `index.csv`.
 */
class CrawlerContinue extends Crawler
{
    /**
     * Restore a crawl from its data folder.
     *
     * @param string      $id                    Identifier of the crawl to resume (surrounding whitespace is trimmed)
     * @param string|null $dataDirectoryBasePath Passed as-is to initDataDirectory()
     *
     * @throws \Exception When config.json or index.csv is missing, or when config.json
     *                    cannot be read or does not decode to an array
     */
    public function __construct(string $id, string $dataDirectoryBasePath = null)
    {
        $this->id = trim($id);
        $this->initDataDirectory($dataDirectoryBasePath);

        $configFilePath = $this->getDataFolder().'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('previous crawl not found (config.json)');
        }

        // Fail with an explicit error instead of letting a null config trigger
        // notices and a TypeError further down.
        $rawConfig = file_get_contents($configFilePath);
        $config = false === $rawConfig ? null : json_decode($rawConfig, true);
        if (!is_array($config)) {
            throw new \Exception('previous crawl config is unreadable or invalid (config.json)');
        }

        $this->ignore = new RobotsTxt($config['ignore']);
        $this->limit = $config['limit'];
        $this->userAgent = $config['userAgent'];
        $this->wait = $config['wait'];
        $this->base = $config['base'];

        $this->recorder = new Recorder($this->getDataFolder(), (int) $config['cacheMethod']);
        $this->loadFromPreviousCrawl($config['startUrl']);
    }

    /**
     * Rebuild `$this->urls`, `$this->counter` and `$this->currentClick` from index.csv.
     *
     * The first CSV row is used as header. For every record a Url is created from
     * `$this->base` + the record's `uri`, and each column is copied onto that Url as
     * a property. A non-empty `can_be_crawled` value is stored as boolean true and
     * increments the counter. `currentClick` takes the `click` value of the last
     * record, or 0 when there is no record or no such column.
     *
     * @param string $startUrl Start URL stored in the previous crawl's config
     *
     * @return string The given $startUrl, unchanged
     *
     * @throws \Exception When index.csv does not exist
     */
    protected function loadFromPreviousCrawl(string $startUrl)
    {
        $resultFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($resultFilePath)) {
            throw new \Exception('previous crawl not found (index.csv)');
        }

        $csv = Reader::createFromPath($resultFilePath, 'r');
        $csv->setHeaderOffset(0);

        $lastRecord = null;
        foreach ($csv->getRecords() as $record) {
            $uri = $record['uri'];
            $url = new Url($this->base.$uri, 0);

            foreach ($record as $column => $value) {
                // Strict comparison: an integer key 0 must not match the column name.
                if ('can_be_crawled' === $column && !empty($value)) {
                    $value = (bool) $value;
                }
                $url->$column = $value;
            }

            $this->urls[$uri] = $url;

            if (!empty($record['can_be_crawled'])) {
                ++$this->counter;
            }

            $lastRecord = $record;
        }

        // Resume from the click depth of the last recorded row.
        $this->currentClick = $lastRecord['click'] ?? 0;

        return $startUrl;
    }
}
60