|
1
|
|
|
<?php |
|
2
|
|
|
declare(strict_types = 1); |
|
3
|
|
|
namespace hexydec\agentzero; |
|
4
|
|
|
|
|
5
|
|
|
/** |
|
6
|
|
|
* @phpstan-import-type props from config |
|
7
|
|
|
*/ |
|
8
|
|
|
class crawlers {

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on the first "/" into a name and a version. The version has any leading
	 * "v" characters removed and is then truncated to its leading run of digits and dots. Tokens
	 * that contain "://" or start with "Chrome/" are not processed and yield an empty array.
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge, its values take precedence over the generated ones
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array if the token was not processed
	 */
	public static function getApp(string $value, array $data = []) : array {

		// bot will be in the URL, and Chrome/ tokens are not handled here
		if (\str_contains($value, '://') || \str_starts_with($value, 'Chrome/')) {
			return [];
		}
		$parts = \explode('/', $value, 2);

		// process version: drop leading "v" characters, then keep only the leading digits and dots
		$version = \ltrim($parts[1] ?? '', 'v');
		$version = \substr($version, 0, \strspn($version, '0123456789.'));

		return \array_merge([
			'type' => 'robot',
			'app' => $parts[0],
			'appname' => $parts[0],

			// compare against '' explicitly: empty() would also discard a legitimate version of "0"
			'appversion' => $version === '' ? null : $version
		], $data);
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * Each key is the string to match, each value is a props object built from a match mode
	 * ('exact', 'start', 'any' or 'end') and either a callback that receives the matched token
	 * or a fixed array of output values. The entries are listed with the most specific strings
	 * first and the generic ones ("feed", "spider", "crawler", "bot") last.
	 *
	 * @return array<string,props> An array with keys representing the string to match, and a value of a props object containing parsing and output settings
	 */
	public static function get() : array {

		// reusable callbacks, keyed by the category they assign
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),

			// WhatsApp tokens get a fixed app name, whatever precedes "WhatsApp/" in the token
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),

			// generic crawlers, except for the listed spiders which are search engines
			'crawler' => function (string $value) : array {
				$parts = \explode('/', $value, 2);
				$map = [
					'baiduspider' => 'search',
					'haosouspider' => 'search',
					'yisouspider' => 'search',
					'360spider' => 'search',
					'sogou web spider' => 'search',
					'bytespider' => 'search',
				];
				return self::getApp($value, [
					'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
				]);
			},
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),

			// looks the token name up (case-sensitively) in a table of known bots to set the category and display name
			'map' => function (string $value) : ?array {
				if (!\str_contains($value, '://') && \strcasecmp('Cubot', $value) !== 0 && \strcasecmp('Power bot', $value) !== 0) { // bot will be in the URL
					$parts = \explode('/', $value, 2);
					$category = [
						'yacybot' => [
							'category' => 'search',
							'app' => 'YacyBot'
						],
						'Googlebot' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Mobile' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Image' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Video' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-News' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Storebot-Google' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'AdsBot-Google' => [
							'category' => 'ads',
							'app' => 'GoogleBot'
						],
						'AdsBot-Google-Mobile' => [
							'category' => 'ads',
							'app' => 'GoogleBot'
						],
						'Bingbot' => [
							'category' => 'search',
							'app' => 'BingBot'
						],
						'bingbot' => [
							'category' => 'search',
							'app' => 'BingBot'
						],
						'adidxbot' => [
							'category' => 'ads',
							'app' => 'AdidxBot'
						],
						'DuckDuckBot' => [
							'category' => 'search',
							'app' => 'DuckDuckBot'
						],
						'DuckDuckGo-Favicons-Bot' => [
							'category' => 'search',
							'app' => 'DuckDuckBot'
						],
						'coccocbot-image' => [
							'category' => 'search',
							'app' => 'CoccocBot'
						],
						'coccocbot-web' => [
							'category' => 'search',
							'app' => 'CoccocBot'
						],
						'Applebot' => [
							'category' => 'search',
							'app' => 'AppleBot'
						],
						'YandexBot' => [
							'category' => 'search'
						],
						'MJ12bot' => [
							'category' => 'search',
							'app' => 'MJ12 Bot'
						],
						'Mail.RU_Bot' => [
							'category' => 'search',
							'app' => 'Mail.ru Bot'
						],
						'Exabot' => [
							'category' => 'search',
							'app' => 'ExaBot'
						],
						'UptimeRobot' => [
							'category' => 'monitor'
						],
						'PetalBot' => [
							'category' => 'search'
						],
						'Twitterbot' => [
							'category' => 'feed',
							'app' => 'TwitterBot'
						],
						'Xbot' => [
							'category' => 'feed'
						],
						'Discordbot' => [
							'category' => 'feed',
							'app' => 'DiscordBot'
						],
						'SematextSyntheticsRobot' => [
							'category' => 'monitor',
							'app' => 'Sematext Synthetics Robot'
						],
						'LinkedInBot' => [
							'category' => 'feed'
						],
						'PaperLiBot' => [
							'category' => 'feed'
						],
						'bitlybot' => [
							'category' => 'feed',
							'app' => 'Bit.ly Bot'
						],
						'TinEye-bot' => [
							'category' => 'search',
							'app' => 'TinEye Bot'
						],
						'Pinterestbot' => [
							'category' => 'feed',
							'app' => 'PinterestBot'
						],
						'WebCrawler' => [
							'category' => 'crawler'
						],
						'webprosbot' => [
							'category' => 'crawler',
							'app' => 'WebprosBot'
						],
						'GuzzleHttp' => [
							'category' => 'scraper'
						],
						'TelegramBot' => [
							'category' => 'feed'
						],
						'Ruby' => [
							'category' => 'scraper'
						],
						'SEMrushBot' => [
							'category' => 'crawler'
						],
						'Mediatoolkitbot' => [
							'category' => 'crawler',
							'app' => 'MediaToolkitBot'
						],
						'IPLoggerBot' => [
							'category' => 'monitor'
						],
						'CFNetwork' => [
							'category' => 'feed',
							'app' => 'Apple Core Foundation Network'
						],

						// NOTE(review): "[email protected]" looks like an e-mail obfuscation placeholder - confirm this key against the real user-agent string
						'NCSC Web Check [email protected]' => [
							'category' => 'monitor',
							'app' => 'NCSC Web Check'
						],
						'Google-Site-Verification' => [
							'category' => 'validator',
							'app' => 'Google Site Verification'
						],
						'Google-InspectionTool' => [
							'category' => 'validator',
							'app' => 'Google Inspection Tool'
						],
						'PingdomTMS' => [
							'category' => 'monitor',
							'app' => 'Pingdom.com'
						],
						'facebookexternalhit' => [
							'category' => 'feed',
							'app' => 'Facebook URL Preview'
						]
					];

					// defaults first, so that a table entry for this name overrides the category and app
					return self::getApp($value, \array_merge([
						'category' => \mb_stripos($value, 'crawler') !== false ? 'crawler' : null,
						'app' => $parts[0],
						'appname' => $parts[0],
					], $category[$parts[0]] ?? []));
				}
				return null;
			}
		];
		return [
			'Yahoo! Slurp' => new props('exact', $fn['search']),
			'facebookexternalhit/' => new props('start', $fn['map']),
			'Google-Site-Verification/' => new props('start', $fn['map']),
			'Google-InspectionTool/' => new props('start', $fn['map']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'CFNetwork/' => new props('start', $fn['map']),
			'Siteimprove.com' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', $fn['search']),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'PingdomTMS/' => new props('start', $fn['map']),
			'DynGate' => new props('exact', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),

			// NOTE(review): "[email protected]" looks like an e-mail obfuscation placeholder - confirm this key against the real user-agent string
			'NCSC Web Check [email protected]' => new props('exact', $fn['map']),

			// the version is the last underscore separated segment of the token
			'Pingdom.com' => new props('start', function (string $value) : array {
				$name = \trim($value, '_');
				$version = \explode('_', $name);
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom.com',
					'appname' => $name,
					'appversion' => \end($version)
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['monitor']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'Apache-HttpClient/' => new props('start', $fn['scraper']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'Go-http-client/' => new props('start', $fn['scraper']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'PycURL/' => new props('start', $fn['scraper']),
			'lua-resty-http/' => new props('start', $fn['scraper']),
			'Snapchat/' => new props('start', $fn['feed']),
			'HTTPClient/' => new props('start', $fn['scraper']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'Snap URL Preview Service' => new props('exact', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('start', $fn['scraper']),
			'curl/' => new props('start', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'libcurl/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'libwww-perl/' => new props('start', $fn['scraper']),
			'GuzzleHttp/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'akka-http/' => new props('start', $fn['scraper']),

			// generic fallbacks, kept last
			'feed' => new props('any', $fn['feed']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map']),
		];
	}
}