| Metric | Value |
| --- | --- |
| Total Complexity | 44 |
| Total Lines | 279 |
| Duplicated Lines | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 1 |
Complex classes like CrawlerConfig often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerConfig, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
8 | class CrawlerConfig |
||
9 | { |
||
/**
 * @var string user agent sent with every HTTP request during the crawl
 */
protected $userAgent;

/**
 * @var string crawl id — also the name of the data sub-folder for this crawl
 */
protected $id;

/**
 * @var string page(s) to ignore during the crawl
 *             (exact format — single URI or pattern — not visible here; confirm in the constructor)
 */
protected $ignore;

/**
 * @var int maximum depth to crawl
 */
protected $limit;

/**
 * @var string scheme + host extracted from the start URL,
 *             e.g. `https://domain.tld` (see setBaseAndStartUrl())
 */
protected $base;

/**
 * @var int presumably a delay applied between requests — confirm in the crawler loop
 */
protected $wait;

/**
 * @var int cache strategy selector (semantics of each value not visible here)
 */
protected $cacheMethod;

/**
 * @var string root directory where crawl data folders are stored
 */
protected $dataDirectory;

/**
 * @var string path part of the start URL; always begins with `/`
 *             (see setBaseAndStartUrl())
 */
protected $startUrl;
||
54 | |||
55 | public function __construct( |
||
73 | } |
||
74 | |||
75 | /** |
||
76 | * @return string id |
||
77 | */ |
||
/**
 * Find the id of the most recently modified crawl directory.
 *
 * Each crawl is stored in its own sub-folder of $dataDirectory, named after
 * its id; the "last" crawl is the sub-folder with the newest mtime.
 *
 * @param string $dataDirectory folder containing one sub-folder per crawl
 *
 * @return string id of the last crawl
 *
 * @throws \Exception when the directory cannot be read or no crawl exists
 */
public static function getLastCrawl(string $dataDirectory): string
{
    $entries = scandir($dataDirectory);
    // scandir() returns false (with a warning) on an unreadable path; fail
    // loudly instead of silently iterating over nothing.
    if (false === $entries) {
        throw new \Exception('Data directory not readable `'.$dataDirectory.'`');
    }

    $lastCrawl = null;
    $lastRunAt = null;

    foreach ($entries as $entry) {
        $path = $dataDirectory.'/'.$entry;
        if ('.' != $entry && '..' != $entry
            && is_dir($path)
            && filemtime($path) > $lastRunAt) {
            $lastCrawl = $entry;
            $lastRunAt = filemtime($path);
        }
    }

    if (null === $lastCrawl) {
        // Message grammar fixed ("runned" -> "run").
        throw new \Exception('No crawl previously run');
    }

    return $lastCrawl;
}
||
99 | |||
/**
 * Load a previously recorded crawl configuration from its config.json.
 *
 * @param string      $crawlId       crawl id, or the special value 'last'
 *                                   to load the most recent crawl
 * @param string|null $dataDirectory root data folder (defaults inside
 *                                   getDataFolderFrom())
 *
 * @throws \Exception when the crawl, its config file, or its JSON content
 *                    cannot be found/parsed
 */
public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
{
    if ('last' === $crawlId) {
        $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
    }

    $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
    if (!file_exists($configFilePath)) {
        throw new \Exception('Crawl `'.$crawlId.'` not found.');
    }

    $config = json_decode(file_get_contents($configFilePath), true);
    // A corrupted or hand-edited config.json previously surfaced as opaque
    // "undefined index" errors below; fail with an explicit message instead.
    if (!is_array($config)) {
        throw new \Exception('Invalid config file `'.$configFilePath.'`');
    }

    return (new self(
        $config['base'].$config['startUrl'],
        $config['ignore'],
        intval($config['limit']),
        (string) $config['userAgent'],
        intval($config['cacheMethod']),
        intval($config['wait']),
        $dataDirectory
    ))->setId($crawlId);
}
||
122 | |||
/**
 * Persist the current configuration as config.json inside the crawl's data
 * folder, so the crawl can later be re-hydrated with loadFrom().
 */
public function recordConfig()
{
    $config = [
        'startUrl' => $this->startUrl,
        'base' => $this->base,
        'ignore' => $this->ignore,
        'limit' => $this->limit,
        'userAgent' => $this->userAgent,
        'cacheMethod' => $this->cacheMethod,
        'wait' => $this->wait,
    ];

    file_put_contents($this->getDataFolder().'/config.json', json_encode($config));
}
||
135 | |||
/**
 * Split the start URL into $base (scheme + host) and $startUrl (path).
 *
 * @param string $url full start URL, e.g. https://example.org/page
 *
 * @throws \Exception when $url does not pass FILTER_VALIDATE_URL
 */
protected function setBaseAndStartUrl(string $url)
{
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        throw new \Exception('start is not a valid URL `'.$url.'`');
    }

    // Everything up to the first '/', '?' or '#' after the optional scheme,
    // e.g. `https://example.org`; falls back to the whole URL if the regex
    // does not match.
    $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

    // Remainder after the base is the path (+query/fragment).
    $url = substr($url, strlen($this->base));

    // Guarantee a leading '/': an empty remainder becomes '/'.
    $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
}
||
148 | |||
/**
 * Build the data folder path for a given crawl id.
 *
 * @param string      $id   crawl id (may be '' to address the root folder)
 * @param string|null $path root data folder; defaults to <src>/../data
 */
public static function getDataFolderFrom(string $id, ?string $path)
{
    $root = $path ?? __DIR__.'/../data';

    return $root.'/'.$id;
}
||
153 | |||
/**
 * Data folder of the current crawl: root data directory + crawl id.
 */
public function getDataFolder()
{
    return sprintf('%s/%s', $this->dataDirectory, $this->id);
}
||
158 | |||
/**
 * @return string crawl id
 */
public function getId()
{
    return $this->id;
}

/**
 * @return string scheme + host of the start URL, e.g. `https://example.org`
 */
public function getBase()
{
    return $this->base;
}
||
168 | |||
169 | public function getStartUrl() |
||
172 | } |
||
173 | |||
/**
 * @return int wait value — presumably a delay between requests; confirm at call sites
 */
public function getWait()
{
    return $this->wait;
}

/**
 * @return string user agent used during the crawl
 */
public function getUserAgent()
{
    return $this->userAgent;
}

/**
 * @return int maximum depth to crawl
 */
public function getLimit()
{
    return $this->limit;
}

/**
 * @return int cache strategy selector
 */
public function getCacheMethod()
{
    return $this->cacheMethod;
}
||
193 | |||
/**
 * @return string root directory where crawl data folders are stored
 */
public function getDataDirectory()
{
    // Bug fix: the original evaluated the property without returning it,
    // so this getter always returned null.
    return $this->dataDirectory;
}
||
198 | |||
/**
 * @var RobotsTxt in-memory robots.txt representation — presumably consulted
 *                to decide which URLs may be crawled; confirm in getVirtualRobots()
 */
protected $virtualRobots;
||
201 | |||
202 | public function getVirtualRobots() |
||
209 | } |
||
210 | |||
/**
 * @param string $id crawl id (also the name of the crawl's data sub-folder)
 *
 * @return self fluent interface
 */
public function setId(string $id): self
{
    $this->id = $id;

    return $this;
}
||
217 | |||
/**
 * Re-hydrate the URL list of a previous crawl from its data.csv file.
 *
 * @return array{urls: array<string, Url>, counter: int, currentClick: mixed}
 *               urls keyed by URI, count of crawlable URLs, and the click
 *               depth of the last record
 *
 * @throws \Exception when data.csv is missing in the crawl's data folder
 */
public function getDataFromPreviousCrawl()
{
    $dataFilePath = $this->getDataFolder().'/data.csv';
    if (!file_exists($dataFilePath)) {
        // Bug fix: the message previously referenced index.csv although this
        // method reads data.csv.
        throw new \Exception('Previous crawl\'s data not found (data.csv)');
    }

    $urls = [];
    $counter = 0;

    $csv = Reader::createFromPath($dataFilePath, 'r');
    $csv->setHeaderOffset(0);

    $records = $csv->getRecords();
    foreach ($records as $r) {
        $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
        foreach ($r as $k => $v) {
            // Only cast when non-empty so '' stays '' rather than becoming false.
            if ('can_be_crawled' == $k && !empty($v)) {
                $v = (bool) $v;
            }
            $urls[$r['uri']]->$k = $v;
        }
        if (!empty($r['can_be_crawled'])) {
            ++$counter;
        }
    }

    // $r still holds the last record after the loop; `??` also covers the
    // empty-file case where $r was never set.
    $currentClick = $r['click'] ?? 0;

    return [
        'urls' => $urls,
        'counter' => $counter,
        'currentClick' => $currentClick,
    ];
}
||
253 | |||
// NOTE: could be extracted into another class.
protected $index;

/**
 * Lazily load the id => Url index of a previous crawl from index.csv.
 *
 * The result is memoised in $this->index; subsequent calls return the
 * cached array without touching the filesystem again.
 *
 * @return array memoised index of Url objects keyed by record id
 *
 * @throws \Exception when index.csv is missing in the crawl's data folder
 */
protected function getIndexFromPreviousCrawl()
{
    if (null !== $this->index) {
        return $this->index;
    }

    $this->index = [];

    $indexFilePath = $this->getDataFolder().'/index.csv';
    if (!file_exists($indexFilePath)) {
        throw new \Exception('Previous crawl\'s data not found (index.csv)');
    }

    $reader = Reader::createFromPath($indexFilePath, 'r');
    $reader->setHeaderOffset(0);

    foreach ($reader->getRecords() as $record) {
        $url = new Url($this->base.$record['uri'], 0);
        $url->id = $record['id'];
        $this->index[$record['id']] = $url;
    }

    return $this->index;
}
||
281 | |||
282 | public function getUrlFromId($id, $base = true) |
||
287 | } |
||
288 | } |
||
289 |