Crawler::__construct()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 17
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 4
nc 1
nop 8
dl 0
loc 17
ccs 0
cts 0
cp 0
crap 2
rs 10
c 1
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
7
/**
 * Crawls a site level by level, starting from the configured start URL.
 *
 * Every call to crawl() processes the URLs known at that moment, records the
 * result through the configured recorder and then recurses for the next level
 * until nothing new was processed or the configured limit is reached.
 */
class Crawler
{
    /**
     * @var int number of URLs crawled between two automatic saves (0 = autosaving disabled)
     */
    const AUTOSAVE = 500;

    /** @var string name of the class instantiated (with the Url and the config) for every crawled URL */
    protected $harvester = '\PiedWeb\SeoPocketCrawler\CrawlerUrl';

    /** @var int current crawl level; incremented once at the end of every crawl() pass */
    protected $currentClick = 0;

    /** @var int number of URLs processed so far by the crawl loop */
    protected $counter = 0;

    /** @var array map of URI => Url instance (null until the Url object has been created) */
    protected $urls = [];

    /**
     * @var CrawlerConfig
     */
    protected $config;

    /** @var bool whether progress information is echoed */
    protected $debug = true;

    /**
     * Builds the crawler configuration, registers the start URL and records the configuration.
     *
     * @param string      $startUrl      forwarded to CrawlerConfig
     * @param string      $ignore        forwarded to CrawlerConfig
     * @param int         $limit         crawl() stops once the current level reaches this value
     * @param string      $userAgent     forwarded to CrawlerConfig
     * @param int         $cacheMethod   forwarded to CrawlerConfig
     * @param int         $wait          pause between two requests, in microseconds
     * @param bool        $debug         echo progress information when true
     * @param string|null $dataDirectory forwarded to CrawlerConfig
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $wait = 100000, // microseconds!
        bool $debug = true,
        ?string $dataDirectory = null
    ) {
        $this->config = new CrawlerConfig($startUrl, $ignore, $limit, $userAgent, $cacheMethod, $wait, $dataDirectory);

        // The start URL is queued without a Url object; crawl() creates it on first visit.
        $this->urls[$this->config->getStartUrl()] = null;

        $this->config->recordConfig();

        $this->debug = $debug;
    }

    /**
     * @return CrawlerConfig the configuration built by the constructor
     */
    public function getConfig(): CrawlerConfig
    {
        return $this->config;
    }

    /**
     * Crawls every not-yet-processed URL, records the result, then recurses for the next level.
     *
     * The recursion stops when a pass processed no URL at all or when the
     * current level has reached the configured limit.
     *
     * @return null
     */
    public function crawl()
    {
        $nothingUpdated = true;

        $this->printDebugInitCrawlLoop();

        // foreach iterates over a snapshot: URLs discovered during this pass
        // are only visited by the next (recursive) pass.
        foreach ($this->urls as $urlToParse => $url) {
            // Already crawled: the crawlability decision has been taken before.
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            $this->printDebugCrawlUrl($urlToParse);

            $nothingUpdated = false;
            ++$this->counter;

            if (null === $this->urls[$urlToParse]) {
                $url = $this->urls[$urlToParse] = new Url($this->config->getBase().$urlToParse, $this->currentClick);
            }

            if (false !== $this->canBeCrawled($url)) {
                $crawlerUrl = new $this->harvester($url, $this->config);

                if ($crawlerUrl->getHarvester() instanceof Harvest) {
                    $this->updateInboundLinksAndUrlsToParse($url, $crawlerUrl->getLinks());
                    $this->getRecorder()->recordLinksIndex(
                        $this->config->getBase(),
                        $url,
                        $this->urls,
                        $crawlerUrl->getHarvester()->getLinks()
                    );
                }

                $this->urls[$urlToParse]->setDiscovered(count($this->urls));

                $this->config->cacheRobotsTxt($crawlerUrl->getHarvester());
                $this->config->cacheRequest($crawlerUrl->getHarvester());

                usleep($this->config->getWait());
            }

            $this->autosave();
        }

        ++$this->currentClick;

        // Record after each level.
        $this->getRecorder()->record($this->urls);

        if ($nothingUpdated || $this->currentClick >= $this->config->getLimit()) {
            return null;
        }

        return $this->crawl();
    }

    /**
     * Records the known URLs every AUTOSAVE processed URLs.
     *
     * A non-positive AUTOSAVE disables autosaving. The constant is resolved
     * with late static binding so that a subclass can redefine it.
     */
    protected function autosave()
    {
        $interval = (int) static::AUTOSAVE;

        // Guarding the interval first avoids a modulo/division by zero when autosaving is disabled.
        if ($interval <= 0 || 0 === $this->counter || 0 !== $this->counter % $interval) {
            return;
        }

        echo $this->debug ? '    --- auto-save'.PHP_EOL : '';
        $this->getRecorder()->record($this->urls);
    }

    /**
     * Tells whether the given URL may be crawled, asking the virtual robots only once per Url.
     *
     * The answer is stored on the Url itself (can_be_crawled) and reused afterwards.
     *
     * @param Url $url
     *
     * @return mixed the value stored in $url->can_be_crawled
     */
    protected function canBeCrawled(Url $url)
    {
        if (null === $url->can_be_crawled) {
            $url->can_be_crawled = $this->config->getVirtualRobots()
                ->allows($this->config->getBase().$url->uri, $this->config->getUserAgent());
        }

        return $url->can_be_crawled;
    }

    /**
     * Registers the targets of the given links and updates their inbound link counters.
     *
     * A target linked several times from the same page is counted (and recorded) once.
     *
     * @param Url   $url   page the links were found on
     * @param array $links link objects exposing getUrl(), getPageUrl() and mayFollow()
     */
    public function updateInboundLinksAndUrlsToParse(Url $url, array $links)
    {
        $alreadyCounted = [];

        foreach ($links as $link) {
            $newUri = $link->getUrl()->getRelativizedDocumentUrl();

            // Unknown target (or one still queued as null): it belongs to the next level.
            if (! isset($this->urls[$newUri])) {
                $this->urls[$newUri] = new Url($link->getPageUrl(), ($this->currentClick + 1));
            }

            if (isset($alreadyCounted[$newUri])) {
                continue;
            }
            $alreadyCounted[$newUri] = 1;

            if ($link->mayFollow()) {
                ++$this->urls[$newUri]->inboundlinks;
            } else {
                ++$this->urls[$newUri]->inboundlinks_nofollow;
            }

            $this->getRecorder()->recordInboundLink($link, $url, $this->urls[$newUri]);
        }
    }

    /**
     * Echoes the progress counter and the absolute URL about to be crawled (debug mode only).
     *
     * @param string $urlToParse URI appended to the configured base
     */
    protected function printDebugCrawlUrl(string $urlToParse)
    {
        if (! $this->debug) {
            return;
        }

        echo $this->counter.'/'.count($this->urls).'    '.$this->config->getBase().$urlToParse.PHP_EOL;
    }

    /**
     * Echoes a banner with the crawled / found counters at the start of a pass (debug mode only).
     */
    protected function printDebugInitCrawlLoop()
    {
        if (! $this->debug) {
            return;
        }

        echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
            .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
    }

    /**
     * Shortcut to the recorder held by the configuration.
     */
    protected function getRecorder()
    {
        return $this->config->getRecorder();
    }
}
169