Passed
Push — master ( 6700a9...62f4d8 )
by Dev
12:41
created

CrawlerConfig::getDataFolderFrom()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 1
eloc 1
c 1
b 0
f 1
nc 1
nop 2
dl 0
loc 3
rs 10
<?php

namespace PiedWeb\SeoPocketCrawler;

use League\Csv\Reader;
use PiedWeb\UrlHarvester\Harvest;
use Spatie\Robots\RobotsTxt;
/**
 * Holds everything a crawl run needs (start URL, limits, user agent, data folder)
 * and knows how to persist/reload itself as `config.json` inside the crawl folder.
 */
class CrawlerConfig
{
    /**
     * @var string user agent sent with every request during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id (`ymdHi` timestamp + host, built in the constructor)
     */
    protected $id;

    /**
     * @var string virtual robots.txt rules describing pages to ignore during the crawl
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contains `https://domain.tld` extracted from the start url
     */
    protected $base;

    /**
     * @var int microseconds to wait between two requests
     */
    protected $wait;

    /**
     * @var int one of the Recorder cache-method constants
     */
    protected $cacheMethod;

    /**
     * @var string root folder where crawl data is stored (no trailing slash)
     */
    protected $dataDirectory;

    /**
     * @var string path (+ query) part of the start url, always beginning with `/`
     */
    protected $startUrl;

    /** last request object cached via cacheRequest() — see getRequestCached() */
    protected $request;

    /** robots.txt cached via cacheRobotsTxt() — see getRobotsTxtCached() */
    protected $robotsTxt;

    /** @var Recorder lazily created in getRecorder() */
    protected $recorder;

    /**
     * @param string      $startUrl           full URL where the crawl starts (must be a valid URL)
     * @param string      $ignore             virtual robots.txt rules for URLs to skip
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent to send with each request
     * @param int         $cacheMethod        one of the Recorder cache-method constants
     * @param int         $waitInMicroSeconds pause between two requests
     * @param string|null $dataDirectory      where to store crawl data (default: `<src>/../data`)
     *
     * @throws \Exception when $startUrl is not a valid URL
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        // id combines run time and host so every crawl gets its own data folder
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        // normalize: never keep a trailing slash on the data directory
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Find the most recently modified crawl folder inside $dataDirectory.
     *
     * @return string id of the last crawl (the folder name)
     *
     * @throws \Exception when no crawl folder exists
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        $dir = scandir($dataDirectory);
        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($dir as $file) {
            if ('.' != $file && '..' != $file
                && is_dir($dataDirectory.'/'.$file)
                && filemtime($dataDirectory.'/'.$file) > $lastRunAt) {
                $lastCrawl = $file;
                $lastRunAt = filemtime($dataDirectory.'/'.$file);
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously runned');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild a CrawlerConfig from a previously recorded `config.json`.
     *
     * @param string      $crawlId       crawl id, or the special value `last`
     * @param string|null $dataDirectory root data folder (default resolved by getDataFolderFrom)
     *
     * @throws \Exception when the crawl folder or its config.json does not exist
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }
        $config = json_decode(file_get_contents($configFilePath), true);

        // keep the original id: the constructor would otherwise generate a new one
        return (new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Persist the current configuration as `config.json` in the crawl's data folder
     * so loadFrom() can rebuild it later.
     */
    public function recordConfig()
    {
        $this->getRecorder(); // permit to create folder
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split $url into $this->base (scheme + host) and $this->startUrl (path + query).
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // everything up to the first `/`, `?` or `#` after the optional scheme
        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        $url = substr($url, strlen($this->base));

        // guarantee the start url always begins with a slash
        $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
    }

    /**
     * Build the data folder path for crawl $id under $path (or the default data dir).
     *
     * @return string
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    /** @return string folder containing this crawl's data */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getBase()
    {
        return $this->base;
    }

    public function getStartUrl()
    {
        return $this->startUrl;
    }

    public function getWait()
    {
        return $this->wait;
    }

    public function getUserAgent()
    {
        return $this->userAgent;
    }

    public function getLimit()
    {
        return $this->limit;
    }

    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    public function getDataDirectory()
    {
        // bugfix: the statement previously had no `return` and always yielded null
        return $this->dataDirectory;
    }

    /** @var RobotsTxt */
    protected $virtualRobots;

    /**
     * Lazily build a RobotsTxt object from the `ignore` rules.
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load the previous crawl's `data.csv` and rebuild its Url objects.
     *
     * @return array{urls: array, counter: int, currentClick: int|string}
     *
     * @throws \Exception when data.csv is missing
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            // bugfix: the message previously named index.csv although data.csv was checked
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
            foreach ($r as $k => $v) {
                // CSV stores everything as strings; restore the boolean flag
                if ('can_be_crawled' == $k && !empty($v)) {
                    $v = (bool) $v;
                }
                $urls[$r['uri']]->$k = $v;
            }
            if (!empty($r['can_be_crawled'])) {
                ++$counter;
            }
        }

        // $r still holds the last record; defaults to 0 when the file had no rows
        $currentClick = $r['click'] ?? 0;

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }

    // could be add in an other class..
    /** @var array|null cache for getIndexFromPreviousCrawl() */
    protected $index;

    /**
     * Load (once) the previous crawl's `index.csv` as id => Url.
     *
     * @throws \Exception when index.csv is missing
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $this->index = [];

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $this->index[$r['id']] = new Url($this->base.$r['uri'], 0);
            $this->index[$r['id']]->id = $r['id'];
        }

        return $this->index;
    }

    /**
     * Resolve a crawl-index id to its URL.
     *
     * @param mixed $id   id from index.csv
     * @param bool  $base prepend the base (scheme + host) when true
     *
     * @return string|null null when the id is unknown
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        return isset($index[$id]) ? ($base ? $this->base : '').$index[$id]->uri : null;
    }

    /**
     * Cache the request object carried by a Harvest (silently ignored otherwise).
     */
    public function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    public function getRequestCached()
    {
        return $this->request;
    }

    /**
     * Cache the robots.txt from a Harvest — only the first one wins.
     */
    public function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    public function getRobotsTxtCached()
    {
        return $this->robotsTxt;
    }

    /**
     * Lazily build the Recorder for this crawl's data folder.
     *
     * @return Recorder
     */
    public function getRecorder()
    {
        return $this->recorder ?? $this->recorder = new Recorder($this->getDataFolder(), $this->getCacheMethod());
    }
}