CrawlerConfig::cacheRequest()   A
last analyzed

Complexity

Conditions 3
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 3
nc 2
nop 1
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use League\Csv\Reader;
6
use PiedWeb\UrlHarvester\Harvest;
7
use PiedWeb\UrlHarvester\Indexable;
8
use Spatie\Robots\RobotsTxt;
9
10
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id (generated from date + host, or overridden via setId())
     */
    protected $id;

    /**
     * @var string page to ignore during the crawl (virtual robots.txt rules)
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var int wait time between two requests, in microseconds
     */
    protected $wait;

    /**
     * @var int one of the Recorder::CACHE_* constants
     */
    protected $cacheMethod;

    /**
     * @var string root folder where crawl data is stored (no trailing slash)
     */
    protected $dataDirectory;

    /**
     * @var string path (starting with `/`) of the first URL to crawl
     */
    protected $startUrl;

    /** @var mixed cached request from the last harvest (see cacheRequest()) */
    protected $request;

    /** @var mixed cached robots.txt from the first harvest (see cacheRobotsTxt()) */
    protected $robotsTxt;

    /** @var Recorder lazy-created, shared recorder instance */
    protected $recorder;

    /**
     * @param string      $startUrl           full URL where the crawl starts (must pass FILTER_VALIDATE_URL)
     * @param string      $ignore             virtual robots.txt content listing pages to skip
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent sent with every request
     * @param int         $cacheMethod        one of the Recorder::CACHE_* constants
     * @param int         $waitInMicroSeconds pause between two requests
     * @param string|null $dataDirectory      storage root; defaults to `<src>/../data`
     *
     * @throws \Exception if $startUrl is not a valid URL (via setBaseAndStartUrl)
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        // id format: ymdHi-host, e.g. `2403011230-example.com`
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        // normalize: never keep a trailing slash on the storage root
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Return the id of the most recently modified crawl folder in $dataDirectory.
     *
     * @return string id
     *
     * @throws \Exception when no crawl folder exists
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        $dir = scandir($dataDirectory);
        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($dir as $file) {
            if ('.' != $file && '..' != $file
                && is_dir($dataDirectory.'/'.$file)
                && filemtime($dataDirectory.'/'.$file) > $lastRunAt) {
                $lastCrawl = $file;
                $lastRunAt = filemtime($dataDirectory.'/'.$file);
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously runned');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild a CrawlerConfig from a previously recorded config.json.
     *
     * @param string      $crawlId       crawl id, or the literal `last` for the most recent crawl
     * @param string|null $dataDirectory storage root; null means the default folder
     *
     * @throws \Exception when the crawl folder or its config.json does not exist
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (! file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }
        $config = json_decode(file_get_contents($configFilePath), true);

        // keep the original crawl id instead of the freshly generated one
        return (new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Persist the current configuration to `<dataFolder>/config.json`
     * so the crawl can later be reloaded with loadFrom().
     */
    public function recordConfig()
    {
        $this->getRecorder(); // permit to create folder
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split $url into $this->base (scheme + host) and $this->startUrl (path part,
     * always starting with `/`).
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (! filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // base = optional scheme + everything up to the first `/`, `?` or `#`
        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        $url = substr($url, strlen($this->base));

        // ensure the start path always begins with a slash (empty path => `/`)
        $this->startUrl = (! isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
    }

    /**
     * Build the data folder path for a crawl id without needing an instance.
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    /**
     * Folder where this crawl's files (config.json, data.csv, index.csv…) live.
     */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getBase()
    {
        return $this->base;
    }

    public function getStartUrl()
    {
        return $this->startUrl;
    }

    public function getWait()
    {
        return $this->wait;
    }

    public function getUserAgent()
    {
        return $this->userAgent;
    }

    public function getLimit()
    {
        return $this->limit;
    }

    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    public function getDataDirectory()
    {
        // bugfix: `return` was missing, so this getter always returned null
        return $this->dataDirectory;
    }

    /** @var RobotsTxt parsed from the `ignore` rules, lazily built */
    protected $virtualRobots;

    /**
     * Lazily build and return the virtual robots.txt made from the `ignore` rules.
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load URLs from a previous crawl's data.csv.
     *
     * Every row becomes a Url keyed by uri; rows already crawled (and not
     * failed on a network error) get their recorded properties restored and
     * are counted in `counter`.
     *
     * @return array{urls: array, counter: int, currentClick: mixed}
     *
     * @throws \Exception when data.csv is missing
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (! file_exists($dataFilePath)) {
            // bugfix: message previously said (index.csv) but the missing file is data.csv
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
            if (isset($r['can_be_crawled']) && ! empty($r['can_be_crawled'])
                && Indexable::NOT_INDEXABLE_NETWORK_ERROR != $r['indexable'] // we will retry network errror
            ) {
                // restore each recorded column via its setter (set + CamelCase column name)
                foreach ($r as $k => $v) {
                    $kFunction = 'set'.self::camelize($k);
                    $urls[$r['uri']]->$kFunction($v);
                }
                ++$counter;
            }
        }

        // `click` from the last row read (0 when the file had no rows)
        $currentClick = $r['click'] ?? 0;

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }

    /**
     * Convert snake_case to CamelCase (e.g. `can_be_crawled` => `CanBeCrawled`).
     */
    protected static function camelize($input)
    {
        return ucfirst(str_replace('_', '', ucwords($input, '_')));
    }

    // could be add in an other class..
    /** @var array|null map of id => Url, loaded once from index.csv */
    protected $index;

    /**
     * Load (once) the id => Url index from a previous crawl's index.csv.
     *
     * @throws \Exception when index.csv is missing
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $this->index = [];

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (! file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $this->index[$r['id']] = new Url($this->base.$r['uri'], 0);
            $this->index[$r['id']]->id = $r['id'];
        }

        return $this->index;
    }

    /**
     * Resolve a crawl id back to its URL (or null when unknown).
     *
     * @param mixed $id   id as recorded in index.csv
     * @param bool  $base prepend the base (scheme + host) when true
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        return isset($index[$id]) ? ($base ? $this->base : '').$index[$id]->uri : null;
    }

    /**
     * Cache the request object from a Harvest for reuse on the next URLs.
     */
    public function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    public function getRequestCached()
    {
        return $this->request;
    }

    /**
     * Cache the crawled site's robots.txt the first time a Harvest provides it.
     */
    public function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    public function getRobotsTxtCached()
    {
        return $this->robotsTxt;
    }

    /**
     * Lazily create the Recorder (creating the data folder as a side effect).
     */
    public function getRecorder()
    {
        if ($this->recorder) {
            return $this->recorder;
        }

        return $this->recorder = new Recorder($this->getDataFolder(), $this->getCacheMethod());
    }
}