1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\SeoPocketCrawler; |
4
|
|
|
|
5
|
|
|
use League\Csv\Reader; |
6
|
|
|
use PiedWeb\UrlHarvester\Harvest; |
7
|
|
|
use Spatie\Robots\RobotsTxt; |
8
|
|
|
|
9
|
|
|
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id (defaults to `ymdHi`-host, see __construct)
     */
    protected $id;

    /**
     * @var string robots.txt-style rules for pages to ignore during the crawl
     *             (parsed by getVirtualRobots())
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var int time to wait between two requests, in microseconds
     */
    protected $wait;

    /**
     * @var int cache strategy — presumably one of the Recorder::CACHE_* constants
     *          (default is Recorder::CACHE_ID); confirm against Recorder
     */
    protected $cacheMethod;

    /**
     * @var string directory where crawl data is stored (no trailing slash)
     */
    protected $dataDirectory;

    /**
     * @var string path (+ query) part of the start url, always beginning with `/`
     */
    protected $startUrl;

    /** @var mixed last request object cached via cacheRequest() */
    protected $request;

    /** @var mixed robots.txt cached via cacheRobotsTxt() */
    protected $robotsTxt;

    /** @var Recorder lazily created by getRecorder() */
    protected $recorder;

    /** @var RobotsTxt lazily built from $ignore by getVirtualRobots() */
    protected $virtualRobots;

    /** @var array|null previous crawl's index (id => Url), lazily loaded */
    protected $index;

    /**
     * @param string      $startUrl           absolute URL where the crawl starts
     * @param string      $ignore             robots.txt-style rules for URLs to skip
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent sent with each request
     * @param int         $cacheMethod        cache strategy (Recorder::CACHE_ID by default)
     * @param int         $waitInMicroSeconds pause between two requests
     * @param string|null $dataDirectory      where crawl data is stored (default: __DIR__/../data)
     *
     * @throws \Exception when $startUrl is not a valid URL (via setBaseAndStartUrl)
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        // id encodes the start time and the crawled host, eg. `2101011200-example.com`
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Return the id of the most recently modified crawl sub-directory.
     *
     * @param string $dataDirectory directory containing one sub-directory per crawl
     *
     * @return string id
     *
     * @throws \Exception when no crawl directory exists
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        $lastCrawl = null;
        $lastRunAt = 0; // explicit int baseline instead of comparing against null

        foreach (scandir($dataDirectory) as $file) {
            $filePath = $dataDirectory.'/'.$file;
            if ('.' != $file && '..' != $file
                && is_dir($filePath)
                && filemtime($filePath) > $lastRunAt) {
                $lastCrawl = $file;
                $lastRunAt = filemtime($filePath);
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously runned');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild a CrawlerConfig from a previously recorded config.json.
     *
     * @param string      $crawlId       crawl id, or the literal `last` for the most recent crawl
     * @param string|null $dataDirectory where crawl data is stored (default: __DIR__/../data)
     *
     * @throws \Exception when the crawl's config.json cannot be found
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }
        $config = json_decode(file_get_contents($configFilePath), true);

        // keep the original crawl id (otherwise the constructor would generate a new one)
        return (new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Persist the current configuration to `<dataFolder>/config.json`
     * so the crawl can be reloaded later via loadFrom().
     */
    public function recordConfig()
    {
        $this->getRecorder(); // permit to create folder
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split an absolute URL into $base (scheme + host) and $startUrl (path + query).
     *
     * @throws \Exception when $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // everything up to the first `/`, `?` or `#` after the scheme is the base
        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        $url = substr($url, strlen($this->base));

        // always store a path starting with `/` (empty path becomes `/`)
        $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
    }

    /**
     * Build the data-folder path for a crawl id without needing an instance.
     *
     * @param string      $id   crawl id (may be empty to get the data root with a trailing slash)
     * @param string|null $path data directory (default: __DIR__/../data)
     *
     * @return string
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    /**
     * @return string folder holding this crawl's data (config.json, data.csv, index.csv…)
     */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getBase()
    {
        return $this->base;
    }

    public function getStartUrl()
    {
        return $this->startUrl;
    }

    public function getWait()
    {
        return $this->wait;
    }

    public function getUserAgent()
    {
        return $this->userAgent;
    }

    public function getLimit()
    {
        return $this->limit;
    }

    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    /**
     * @return string data directory (without trailing slash)
     */
    public function getDataDirectory()
    {
        return $this->dataDirectory; // bug fix: `return` was missing, method returned null
    }

    /**
     * Build (once) a RobotsTxt instance from the $ignore rules.
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load urls recorded during the previous crawl from `<dataFolder>/data.csv`.
     *
     * @return array{urls: array, counter: int, currentClick: mixed}
     *
     * @throws \Exception when data.csv does not exist
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            // bug fix: message previously said `index.csv` while data.csv is checked
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $r = null; // bug fix: avoid undefined $r below when the file has no record
        foreach ($csv->getRecords() as $r) {
            $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
            foreach ($r as $k => $v) {
                if ('can_be_crawled' == $k && !empty($v)) {
                    $v = (bool) $v;
                }
                $urls[$r['uri']]->$k = $v;
            }
            if (!empty($r['can_be_crawled'])) {
                ++$counter;
            }
        }

        // `click` of the last record read (0 when the file was empty)
        $currentClick = $r['click'] ?? 0;

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }

    /**
     * Load (once) the previous crawl's index from `<dataFolder>/index.csv`.
     *
     * @return array id => Url
     *
     * @throws \Exception when index.csv does not exist
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $this->index = [];

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        foreach ($csv->getRecords() as $r) {
            $this->index[$r['id']] = new Url($this->base.$r['uri'], 0);
            $this->index[$r['id']]->id = $r['id'];
        }

        return $this->index;
    }

    /**
     * Resolve a previously-indexed url from its id.
     *
     * @param mixed $id   id as recorded in index.csv
     * @param bool  $base prepend the base (scheme + host) when true
     *
     * @return string|null null when the id is unknown
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        return isset($index[$id]) ? ($base ? $this->base : '').$index[$id]->uri : null;
    }

    /**
     * Cache the request object extracted from a Harvest (no-op for anything else).
     *
     * @param mixed $harvest
     *
     * @return self
     */
    public function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            $this->request = $harvest->getResponse()->getRequest();
        }

        return $this;
    }

    public function getRequestCached()
    {
        return $this->request;
    }

    /**
     * Cache the robots.txt from a Harvest — only the first one is kept.
     *
     * @param mixed $harvest
     *
     * @return self
     */
    public function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    public function getRobotsTxtCached()
    {
        return $this->robotsTxt;
    }

    /**
     * Lazily create the Recorder for this crawl's data folder.
     *
     * @return Recorder
     */
    public function getRecorder()
    {
        return $this->recorder ?? $this->recorder = new Recorder($this->getDataFolder(), $this->getCacheMethod());
    }
}
330
|
|
|
|