Passed
Push — master ( dc854e...b085d5 )
by Dev
10:55
created

CrawlerConfig   B

Complexity

Total Complexity 44

Size/Duplication

Total Lines 279
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
eloc 111
c 1
b 0
f 1
dl 0
loc 279
rs 8.8798
wmc 44

20 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 18 1
A getStartUrl() 0 3 1
A getVirtualRobots() 0 7 2
A getDataFolderFrom() 0 3 1
A getUrlFromId() 0 5 3
A getLimit() 0 3 1
A getUserAgent() 0 3 1
A getDataFolder() 0 3 1
A setId() 0 5 1
A setBaseAndStartUrl() 0 11 5
A getDataDirectory() 0 3 1
B getDataFromPreviousCrawl() 0 33 7
A getCacheMethod() 0 3 1
A recordConfig() 0 10 1
A getWait() 0 3 1
A getIndexFromPreviousCrawl() 0 23 4
A getId() 0 3 1
A loadFrom() 0 21 3
A getBase() 0 3 1
B getLastCrawl() 0 20 7

How to fix   Complexity   

Complex Class

Complex classes like CrawlerConfig often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerConfig, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use League\Csv\Reader;
6
use Spatie\Robots\RobotsTxt;
7
8
/**
 * Holds the configuration of a single crawl run (start URL, crawl limit,
 * user agent, storage location...) and provides helpers to persist it to /
 * reload it from the `config.json` file written next to the crawl data.
 */
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id
     */
    protected $id;

    /**
     * @var string page to ignore during the crawl (robots.txt syntax, see getVirtualRobots)
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var int microseconds to wait between two requests
     */
    protected $wait;

    /**
     * @var int cache strategy (presumably one of the Recorder::CACHE_* constants — confirm in Recorder)
     */
    protected $cacheMethod;

    /**
     * @var string data directory, stored without trailing slash
     */
    protected $dataDirectory;

    /**
     * @var string path part of the start url (always begins with `/`)
     */
    protected $startUrl;

    /** @var RobotsTxt|null lazily built in getVirtualRobots() */
    protected $virtualRobots;

    /** @var array|null memoized result of getIndexFromPreviousCrawl() */
    protected $index;

    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        // Default id: timestamp + host, eg. `2107011055-example.com`.
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Find the most recently modified crawl directory inside $dataDirectory.
     *
     * @throws \Exception when the directory is unreadable or contains no crawl
     *
     * @return string id
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        $dir = scandir($dataDirectory);
        // scandir() returns false on failure: fail loudly instead of iterating over false.
        if (false === $dir) {
            throw new \Exception('Data directory not readable ('.$dataDirectory.')');
        }

        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($dir as $file) {
            if ('.' != $file && '..' != $file
                && is_dir($dataDirectory.'/'.$file)
                && filemtime($dataDirectory.'/'.$file) > $lastRunAt) {
                $lastCrawl = $file;
                $lastRunAt = filemtime($dataDirectory.'/'.$file);
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously run');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild a CrawlerConfig from a previously recorded `config.json`.
     *
     * @param string $crawlId id of the crawl to load, or `last` for the most recent one
     *
     * @throws \Exception when the crawl id cannot be resolved
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }
        $config = json_decode(file_get_contents($configFilePath), true);

        // Re-run the constructor with the recorded values, then restore the
        // original id (the constructor would otherwise generate a fresh one).
        return (new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Persist the current configuration as JSON in the crawl's data folder,
     * so the crawl can later be reloaded via loadFrom().
     */
    public function recordConfig()
    {
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split $url into scheme+host ($base) and path+query ($startUrl).
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // Everything up to the first `/`, `?` or `#` after the scheme.
        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        $url = substr($url, strlen($this->base));

        // Ensure startUrl always begins with a slash.
        $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
    }

    /**
     * Build a crawl data folder path without needing an instance.
     *
     * @param string|null $path base data directory, defaults to the bundled `data/` folder
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    /** @return string folder where this crawl's files (config, csv...) live */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getBase()
    {
        return $this->base;
    }

    public function getStartUrl()
    {
        return $this->startUrl;
    }

    public function getWait()
    {
        return $this->wait;
    }

    public function getUserAgent()
    {
        return $this->userAgent;
    }

    public function getLimit()
    {
        return $this->limit;
    }

    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    public function getDataDirectory()
    {
        // Fixed: the statement previously lacked `return` and always yielded null.
        return $this->dataDirectory;
    }

    /**
     * Lazily parse the `ignore` rules as a virtual robots.txt.
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load `data.csv` from a previous crawl and rebuild the Url objects.
     *
     * @throws \Exception when the previous crawl's data.csv is missing
     *
     * @return array{urls: array, counter: int, currentClick: mixed}
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            // Fixed: message previously said index.csv while checking data.csv.
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        $r = null; // keep the last record to read the deepest click level reached
        foreach ($records as $r) {
            $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
            foreach ($r as $k => $v) {
                if ('can_be_crawled' == $k && !empty($v)) {
                    $v = (bool) $v;
                }
                $urls[$r['uri']]->$k = $v;
            }
            if (!empty($r['can_be_crawled'])) {
                ++$counter;
            }
        }

        // Safe even on an empty file now that $r is initialized above.
        $currentClick = $r['click'] ?? 0;

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }

    // could be add in an other class..
    /**
     * Load `index.csv` (id => uri mapping) from the previous crawl, memoized.
     *
     * @throws \Exception when the previous crawl's index.csv is missing
     *
     * @return array<mixed, Url> indexed by record id
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $this->index = [];

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $this->index[$r['id']] = new Url($this->base.$r['uri'], 0);
            $this->index[$r['id']]->id = $r['id'];
        }

        return $this->index;
    }

    /**
     * Resolve a previously crawled URL from its index id.
     *
     * @param bool $base whether to prepend the scheme+host to the uri
     *
     * @return string|null null when the id is unknown
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        return isset($index[$id]) ? ($base ? $this->base : '').$index[$id]->uri : null;
    }
}
289