1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\SeoPocketCrawler; |
4
|
|
|
|
5
|
|
|
use League\Csv\Reader; |
6
|
|
|
use Spatie\Robots\RobotsTxt; |
7
|
|
|
|
8
|
|
|
/**
 * Holds the settings of one crawl (start URL, ignore rules, depth limit,
 * user agent, cache method, wait time, data directory) and gives access to
 * the files a previous crawl stored in its data folder.
 */
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id
     */
    protected $id;

    /**
     * @var string page to ignore during the crawl (given as-is to RobotsTxt, see getVirtualRobots())
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tdl from start url
     */
    protected $base;

    /**
     * @var int time to wait, in microseconds (see the constructor argument $waitInMicroSeconds)
     */
    protected $wait;

    /**
     * @var int cache method (defaults to Recorder::CACHE_ID)
     */
    protected $cacheMethod;

    /**
     * @var string directory containing one sub-folder per crawl, stored without trailing slash
     */
    protected $dataDirectory;

    /**
     * @var string start url without its base, always starting with a `/`
     */
    protected $startUrl;

    /**
     * @var RobotsTxt|null lazily created by getVirtualRobots()
     */
    protected $virtualRobots;

    /**
     * Could be moved to another class.
     *
     * @var array|null Url objects indexed by id, lazily loaded by getIndexFromPreviousCrawl()
     */
    protected $index;

    /**
     * @param string      $startUrl           absolute URL where the crawl starts
     * @param string      $ignore             rules describing the pages to ignore
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent used during the crawl
     * @param int         $cacheMethod        cache method
     * @param int         $waitInMicroSeconds time to wait, in microseconds
     * @param string|null $dataDirectory      where crawls are stored; defaults to `__DIR__.'/../data'`
     *
     * @throws \Exception if $startUrl is not a valid URL
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);

        // Default id: minute-precision timestamp followed by the crawled host.
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        $this->dataDirectory = rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }

    /**
     * Find the most recently modified crawl folder of a data directory.
     *
     * @param string $dataDirectory directory containing one sub-folder per crawl
     *
     * @return string id (the name of the sub-folder with the highest modification time)
     *
     * @throws \Exception if the directory contains no crawl folder or cannot be listed
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        // An unreadable or missing directory is handled like an empty one.
        $entries = is_dir($dataDirectory) ? scandir($dataDirectory) : false;
        if (false === $entries) {
            $entries = [];
        }

        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($entries as $entry) {
            if ('.' === $entry || '..' === $entry) {
                continue;
            }

            $path = $dataDirectory.'/'.$entry;
            if (!is_dir($path)) {
                continue;
            }

            $modifiedAt = filemtime($path);
            if (false === $modifiedAt) {
                continue;
            }

            // Strict comparison: on equal modification times the first folder listed wins.
            if (null === $lastRunAt || $modifiedAt > $lastRunAt) {
                $lastCrawl = $entry;
                $lastRunAt = $modifiedAt;
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously runned');
        }

        return $lastCrawl;
    }

    /**
     * Rebuild a config from the `config.json` stored by recordConfig().
     *
     * @param string      $crawlId       id of the crawl to load, or `last` for the most recent one
     * @param string|null $dataDirectory where crawls are stored; defaults to `__DIR__.'/../data'`
     *
     * @return self
     *
     * @throws \Exception if the crawl is not found or its config file cannot be decoded
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }

        $rawConfig = file_get_contents($configFilePath);
        $config = false === $rawConfig ? null : json_decode($rawConfig, true);
        if (!is_array($config)) {
            throw new \Exception('Crawl `'.$crawlId.'` has an unreadable config file.');
        }

        $instance = new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        );

        // The constructor generates a fresh id, restore the one of the loaded crawl.
        return $instance->setId($crawlId);
    }

    /**
     * Write the current settings as JSON in `config.json` inside the crawl's data folder.
     *
     * This method does not create the data folder.
     */
    public function recordConfig()
    {
        $settings = [
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ];

        file_put_contents($this->getDataFolder().'/config.json', json_encode($settings));
    }

    /**
     * Split an absolute URL in a base (scheme and host) and a start url (the rest).
     *
     * @param string $url
     *
     * @throws \Exception if $url is not a valid URL
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // Everything before the first `/`, `?` or `#` following the optional http(s) scheme.
        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        // substr() may return false on old PHP versions when nothing is left after the base.
        $remaining = (string) substr($url, strlen($this->base));

        $prefix = ('' === $remaining || '/' !== $remaining[0]) ? '/' : '';
        $this->startUrl = $prefix.$remaining;
    }

    /**
     * @param string      $id   crawl id
     * @param string|null $path where crawls are stored; defaults to `__DIR__.'/../data'`
     *
     * @return string path of the crawl's data folder
     */
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    /**
     * @return string path of the folder where this crawl's data is stored
     */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    /**
     * @return string crawl id
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @return string
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string
     */
    public function getStartUrl()
    {
        return $this->startUrl;
    }

    /**
     * @return int
     */
    public function getWait()
    {
        return $this->wait;
    }

    /**
     * @return string
     */
    public function getUserAgent()
    {
        return $this->userAgent;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @return int
     */
    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    /**
     * @return string directory containing the crawls, without trailing slash
     */
    public function getDataDirectory()
    {
        return $this->dataDirectory;
    }

    /**
     * Get the RobotsTxt built from the ignore rules, creating it on first call.
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    /**
     * @param string $id
     *
     * @return self
     */
    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Load the urls stored in `data.csv` by a previous crawl.
     *
     * @return array with the keys `urls` (Url objects indexed by uri), `counter` (number of
     *               records having a non-empty `can_be_crawled`) and `currentClick` (`click`
     *               value of the last record, 0 when there is none)
     *
     * @throws \Exception if `data.csv` does not exist in the data folder
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $urls = [];
        $counter = 0;
        $lastRecord = null;

        foreach ($csv->getRecords() as $record) {
            $uri = $record['uri'];
            $url = new Url($this->base.$uri, 0);

            // Copy every column on the Url object.
            foreach ($record as $column => $value) {
                if ('can_be_crawled' === $column && !empty($value)) {
                    $value = (bool) $value;
                }
                $url->$column = $value;
            }

            $urls[$uri] = $url;

            if (!empty($record['can_be_crawled'])) {
                ++$counter;
            }

            $lastRecord = $record;
        }

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $lastRecord['click'] ?? 0,
        ];
    }

    /**
     * Load (once) the Url objects listed in `index.csv` by a previous crawl.
     *
     * @return array Url objects indexed by id
     *
     * @throws \Exception if `index.csv` does not exist in the data folder
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        // Built locally and cached only once complete, so a failure never
        // leaves an empty index behind for the next call.
        $index = [];
        foreach ($csv->getRecords() as $record) {
            $url = new Url($this->base.$record['uri'], 0);
            $url->id = $record['id'];
            $index[$record['id']] = $url;
        }

        $this->index = $index;

        return $this->index;
    }

    /**
     * Find the url a previous crawl stored for an id.
     *
     * @param mixed $id   id to look for in the index
     * @param bool  $base prepend the base to the returned uri
     *
     * @return string|null null when the id is not in the index
     *
     * @throws \Exception if `index.csv` does not exist in the data folder
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        if (!isset($index[$id])) {
            return null;
        }

        return ($base ? $this->base : '').$index[$id]->uri;
    }
}
289
|
|
|
|