1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\SeoPocketCrawler; |
4
|
|
|
|
5
|
|
|
use League\Csv\Reader; |
6
|
|
|
use PiedWeb\UrlHarvester\Harvest; |
7
|
|
|
use PiedWeb\UrlHarvester\Indexable; |
8
|
|
|
use Spatie\Robots\RobotsTxt; |
9
|
|
|
|
10
|
|
|
/**
 * Holds the settings of one crawl (start URL, limits, user agent, storage
 * location) and gives access to the data recorded by a previous crawl.
 *
 * A crawl is stored in `<dataDirectory>/<id>/`; its settings are persisted in
 * `config.json` inside that folder (see recordConfig() and loadFrom()).
 */
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id (by default `ymdHi-host`, see the constructor)
     */
    protected $id;

    /**
     * @var string rules describing the pages to ignore during the crawl;
     *             handed as-is to Spatie's RobotsTxt (see getVirtualRobots())
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var int time to wait, in microseconds
     */
    protected $wait;

    /**
     * @var int one of the cache methods understood by Recorder
     */
    protected $cacheMethod;

    /**
     * @var string directory containing one folder per crawl, without trailing slash
     */
    protected $dataDirectory;

    /**
     * @var string start url relative to $base, always starting with `/`
     */
    protected $startUrl;

    /** @var mixed|null request kept by cacheRequest() */
    protected $request;

    /** @var mixed|null robots.txt kept by cacheRobotsTxt() */
    protected $robotsTxt;

    /** @var Recorder|null lazily created by getRecorder() */
    protected $recorder;

    /** @var RobotsTxt|null lazily created by getVirtualRobots() */
    protected $virtualRobots;

    /** @var array|null Url objects indexed by id, lazily loaded from index.csv */
    protected $index;

    /**
     * @param string      $startUrl           absolute URL where the crawl starts
     * @param string      $ignore             rules describing what must not be crawled
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent used during the crawl
     * @param int         $cacheMethod        cache method given to the Recorder
     * @param int         $waitInMicroSeconds time to wait, in microseconds
     * @param string|null $dataDirectory      where crawls are stored (default: `../data` next to this file)
     *
     * @throws \Exception when $startUrl is not a valid URL
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Find the most recently modified crawl folder of a data directory.
     *
     * @param string $dataDirectory directory containing one folder per crawl
     *
     * @return string id
     *
     * @throws \Exception when the directory can't be read or contains no crawl folder
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        // scandir() emits a warning and returns false on a missing directory:
        // check first so the caller only gets the exception.
        $entries = is_dir($dataDirectory) ? scandir($dataDirectory) : false;
        if (false === $entries) {
            throw new \Exception('No crawl previously runned');
        }

        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($entries as $entry) {
            if ('.' === $entry || '..' === $entry) {
                continue;
            }

            $path = $dataDirectory.'/'.$entry;
            if (! is_dir($path)) {
                continue;
            }

            $modifiedAt = filemtime($path);
            if (false !== $modifiedAt && (null === $lastRunAt || $modifiedAt > $lastRunAt)) {
                $lastCrawl = $entry;
                $lastRunAt = $modifiedAt;
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously runned');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild the configuration of a recorded crawl from its `config.json`.
     *
     * @param string      $crawlId       crawl id, or `last` for the most recent crawl
     * @param string|null $dataDirectory where crawls are stored (default: `../data` next to this file)
     *
     * @throws \Exception when the crawl or a readable config can't be found
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (! file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }

        $config = json_decode((string) file_get_contents($configFilePath), true);
        if (! is_array($config)) {
            // Unreadable file or invalid JSON: fail here rather than later with
            // a misleading "not a valid URL" error.
            throw new \Exception('Crawl `'.$crawlId.'` has an invalid config.json.');
        }

        return (new self(
            ($config['base'] ?? '').($config['startUrl'] ?? ''),
            (string) ($config['ignore'] ?? ''),
            intval($config['limit'] ?? 0),
            (string) ($config['userAgent'] ?? ''),
            intval($config['cacheMethod'] ?? 0),
            intval($config['wait'] ?? 0),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Write the current settings to `config.json` in the crawl's data folder.
     */
    public function recordConfig()
    {
        $this->getRecorder(); // permit to create folder
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split an absolute URL into $base (scheme + host) and $startUrl (the rest,
     * always starting with `/`).
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (! filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        // Before PHP 8, substr() returns false when nothing is left after the base.
        $path = (string) substr($url, strlen($this->base));

        $this->startUrl = ('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Build the path of a crawl folder without needing an instance.
     *
     * A trailing slash in $path is ignored, like the constructor does for the
     * data directory, so both ways of building the path agree.
     *
     * @param string      $id   crawl id
     * @param string|null $path data directory (default: `../data` next to this file)
     *
     * @return string
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return rtrim($path ?? __DIR__.'/../data', '/').'/'.$id;
    }

    /**
     * @return string folder where this crawl is stored
     */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    /**
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @return string
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string
     */
    public function getStartUrl()
    {
        return $this->startUrl;
    }

    /**
     * @return int time to wait, in microseconds
     */
    public function getWait()
    {
        return $this->wait;
    }

    /**
     * @return string
     */
    public function getUserAgent()
    {
        return $this->userAgent;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @return int
     */
    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    /**
     * @return string directory containing the crawl folders, without trailing slash
     */
    public function getDataDirectory()
    {
        return $this->dataDirectory;
    }

    /**
     * Get the RobotsTxt built from the ignore rules (created on first call).
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load the urls recorded in `data.csv` by a previous run of this crawl.
     *
     * Every row becomes a Url; rows flagged `can_be_crawled` and not in network
     * error are also hydrated through their setters and counted.
     *
     * @return array with the keys `urls` (Url objects indexed by uri), `counter`
     *               (number of hydrated rows) and `currentClick` (`click` value
     *               of the last row, 0 when there is none)
     *
     * @throws \Exception when `data.csv` doesn't exist
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (! file_exists($dataFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;
        $currentClick = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        foreach ($csv->getRecords() as $r) {
            $url = new Url($this->base.$r['uri'], 0);
            $urls[$r['uri']] = $url;

            // Rows in network error stay un-hydrated: we will retry them.
            if (! empty($r['can_be_crawled'])
                && Indexable::NOT_INDEXABLE_NETWORK_ERROR != ($r['indexable'] ?? null)
            ) {
                foreach ($r as $k => $v) {
                    $kFunction = 'set'.self::camelize($k);
                    $url->$kFunction($v);
                }
                ++$counter;
            }

            // Overwritten on each row: only the last row's value is returned.
            $currentClick = $r['click'] ?? 0;
        }

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }

    /**
     * Turn a snake_case name into StudlyCaps (`can_be_crawled` => `CanBeCrawled`).
     *
     * @param string $input
     *
     * @return string
     */
    protected static function camelize($input)
    {
        return ucfirst(str_replace('_', '', ucwords($input, '_')));
    }

    /**
     * Load (once) the id => Url index recorded in `index.csv`.
     *
     * The result is cached only after the file was read, so a missing file
     * keeps throwing on later calls.
     *
     * @return array Url objects indexed by id
     *
     * @throws \Exception when `index.csv` doesn't exist
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (! file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        $index = [];
        foreach ($csv->getRecords() as $r) {
            $url = new Url($this->base.$r['uri'], 0);
            $url->id = $r['id'];
            $index[$r['id']] = $url;
        }

        return $this->index = $index;
    }

    /**
     * Resolve an id of the previous crawl's index to its url.
     *
     * @param mixed $id   id as recorded in `index.csv`
     * @param bool  $base prepend the base (scheme + host) to the uri
     *
     * @return string|null null when the id is not in the index
     *
     * @throws \Exception when `index.csv` doesn't exist
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        if (! isset($index[$id])) {
            return null;
        }

        return ($base ? $this->base : '').$index[$id]->uri;
    }

    /**
     * Keep the request of a harvest so it can be fetched with getRequestCached().
     *
     * Anything that is not a Harvest with a request is ignored.
     *
     * @param mixed $harvest
     *
     * @return self
     */
    public function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    /**
     * @return mixed|null request kept by cacheRequest(), null when none was cached
     */
    public function getRequestCached()
    {
        return $this->request;
    }

    /**
     * Keep the robots.txt of the first Harvest given; later calls don't replace it.
     *
     * @param mixed $harvest
     *
     * @return self
     */
    public function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    /**
     * @return mixed|null robots.txt kept by cacheRobotsTxt(), null when none was cached
     */
    public function getRobotsTxtCached()
    {
        return $this->robotsTxt;
    }

    /**
     * Get the Recorder bound to this crawl's data folder (created on first call).
     *
     * @return Recorder
     */
    public function getRecorder()
    {
        if (null === $this->recorder) {
            $this->recorder = new Recorder($this->getDataFolder(), $this->getCacheMethod());
        }

        return $this->recorder;
    }
}
340
|
|
|
|