1
|
|
|
<?php |
2
|
|
|
declare(strict_types = 1); |
3
|
|
|
namespace hexydec\agentzero; |
4
|
|
|
|
5
|
|
|
/**
 * Crawler / robot detection rules.
 *
 * getApp() turns a single user-agent token into a set of robot properties, and
 * get() returns the table of match strings that decide when getApp() (or a
 * bespoke closure) is applied.
 */
class crawlers {

	/**
	 * Known crawler names (lowercased token name, i.e. the part before the first "/")
	 * mapped to the properties that override whatever was derived from the token.
	 *
	 * NOTE(review): the key 'ncsc web check [email protected]' looks like an e-mail
	 * address that was obfuscated somewhere upstream of this file - confirm the
	 * literal against the real user-agent string. It must stay in sync with the
	 * matching key in get().
	 *
	 * @var array<string,array<string,string>>
	 */
	private const CATEGORIES = [
		'yacybot' => [
			'category' => 'search',
			'app' => 'YacyBot'
		],
		'googlebot' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'googlebot-mobile' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'googlebot-image' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'googlebot-video' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'googlebot-news' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'storebot-google' => [
			'category' => 'search',
			'app' => 'GoogleBot'
		],
		'adsbot-google' => [
			'category' => 'ads',
			'app' => 'GoogleBot'
		],
		'adsbot-google-mobile' => [
			'category' => 'ads',
			'app' => 'GoogleBot'
		],
		'mediapartners-google' => [
			'category' => 'ads',
			'app' => 'GoogleBot'
		],
		'bingbot' => [
			'category' => 'search',
			'app' => 'BingBot'
		],
		'adidxbot' => [
			'category' => 'ads',
			'app' => 'AdidxBot'
		],
		'duckduckbot' => [
			'category' => 'search',
			'app' => 'DuckDuckBot'
		],
		'duckduckgo-favicons-bot' => [
			'category' => 'search',
			'app' => 'DuckDuckBot'
		],
		'coccocbot-image' => [
			'category' => 'search',
			'app' => 'CoccocBot'
		],
		'coccocbot-web' => [
			'category' => 'search',
			'app' => 'CoccocBot'
		],
		'applebot' => [
			'category' => 'search',
			'app' => 'AppleBot'
		],
		'yandexbot' => [
			'category' => 'search'
		],
		'mj12bot' => [
			'category' => 'search',
			'app' => 'Majestic 12 Bot'
		],
		'mail.ru_bot' => [
			'category' => 'search',
			'app' => 'Mail.ru Bot'
		],
		'exabot' => [
			'category' => 'search',
			'app' => 'ExaBot'
		],
		'uptimerobot' => [
			'category' => 'monitor'
		],
		'petalbot' => [
			'category' => 'search'
		],
		'twitterbot' => [
			'category' => 'feed',
			'app' => 'TwitterBot'
		],
		'xbot' => [
			'category' => 'feed'
		],
		'discordbot' => [
			'category' => 'feed',
			'app' => 'DiscordBot'
		],
		'sematextsyntheticsrobot' => [
			'category' => 'monitor',
			'app' => 'Sematext Synthetics Robot'
		],
		'linkedinbot' => [
			'category' => 'feed'
		],
		'paperlibot' => [
			'category' => 'feed'
		],
		'bitlybot' => [
			'category' => 'feed',
			'app' => 'Bit.ly Bot'
		],
		'tineye-bot' => [
			'category' => 'search',
			'app' => 'TinEye Bot'
		],
		'pinterestbot' => [
			'category' => 'feed',
			'app' => 'PinterestBot'
		],
		'webcrawler' => [
			'category' => 'crawler'
		],
		'webprosbot' => [
			'category' => 'crawler',
			'app' => 'WebprosBot'
		],
		'guzzlehttp' => [
			'category' => 'scraper'
		],
		'telegrambot' => [
			'category' => 'feed'
		],
		'semrushbot' => [
			'category' => 'crawler'
		],
		'mediatoolkitbot' => [
			'category' => 'crawler',
			'app' => 'MediaToolkitBot'
		],
		'iploggerbot' => [
			'category' => 'monitor'
		],
		'cfnetwork' => [
			'category' => 'feed',
			'app' => 'Apple Core Foundation Network'
		],
		'ncsc web check [email protected]' => [
			'category' => 'monitor',
			'app' => 'NCSC Web Check'
		],
		'google-site-verification' => [
			'category' => 'validator',
			'app' => 'Google Site Verification'
		],
		'google-inspectiontool' => [
			'category' => 'validator',
			'app' => 'Google Inspection Tool'
		],
		'pingdomtms' => [
			'category' => 'monitor',
			'app' => 'Pingdom.com'
		],
		'facebookexternalhit' => [
			'category' => 'feed',
			'app' => 'Facebook URL Preview'
		],
		'phxbot' => [
			'app' => 'ProtonMail Bot'
		]
	];

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on the first "/" into a name and a version. The version has
	 * any leading "v" characters removed and is then cut at the first character that
	 * is not a digit or a dot. The result is built from three layers, later layers
	 * winning: values derived from the token, then $data, then the entry for the
	 * lowercased name in self::CATEGORIES (if there is one).
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array when the token is skipped
	 */
	public static function getApp(string $value, array $data = []) : array {

		// tokens containing a URL are skipped (the bot will be in the URL), as are Chrome/ tokens
		// and the literal names "Cubot" and "Power bot", which are explicitly excluded here
		if (\str_contains($value, '://') || \str_starts_with($value, 'Chrome/') || \strcasecmp('Cubot', $value) === 0 || \strcasecmp('Power bot', $value) === 0) {
			return [];
		}
		$parts = \explode('/', $value, 2);

		// process version - compared against '' rather than using empty(), so that a version of "0" is kept
		$version = null;
		if (isset($parts[1]) && $parts[1] !== '') {
			$trimmed = \ltrim($parts[1], 'v');
			$trimmed = \substr($trimmed, 0, \strspn($trimmed, '0123456789.'));
			if ($trimmed !== '') {
				$version = $trimmed;
			}
		}

		// derived values first, then the caller's data, then the known crawler overrides
		return \array_merge([
			'type' => 'robot',
			'category' => \mb_stripos($value, 'crawler') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper',
			'app' => $parts[0],
			'appname' => $parts[0],
			'appversion' => $version
		], $data, self::CATEGORIES[\mb_strtolower($parts[0])] ?? []);
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * Most entries delegate to getApp() through one of the closures in $fn, which only
	 * differ in the category they pass as the base data.
	 *
	 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
	 */
	public static function get() : array {
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => function (string $value) : array {
				$parts = \explode('/', $value, 2);

				// spiders that are search engines rather than generic crawlers
				$map = [
					'baiduspider' => 'search',
					'haosouspider' => 'search',
					'yisouspider' => 'search',
					'360spider' => 'search',
					'sogou web spider' => 'search',
					'bytespider' => 'search',
				];
				return self::getApp($value, [
					'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
				]);
			},
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),

			// no base category: getApp() derives it from the token or the known crawler table
			'map' => fn (string $value) : array => self::getApp($value)
		];
		return [
			'Yahoo! Slurp' => new props('exact', $fn['search']),
			'facebookexternalhit/' => new props('start', $fn['map']),
			'Google-Site-Verification/' => new props('start', $fn['map']),
			'Google-InspectionTool/' => new props('start', $fn['map']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'CFNetwork/' => new props('start', $fn['map']),
			'Siteimprove.com' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Qwant Web Crawler',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'Microsoft Profiling/' => new props('any', $fn['validator']),
			'Bidtellect' => new props('start', $fn['crawler']),
			'magpie-crawler/' => new props('start', $fn['crawler']),
			'PingdomTMS/' => new props('start', $fn['map']),
			'DynGate' => new props('exact', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),

			// NOTE(review): "[email protected]" looks like an obfuscated e-mail address - confirm the literal
			'NCSC Web Check [email protected]' => new props('exact', $fn['map']),
			'Pingdom.com' => new props('start', function (string $value) : array {

				// the version is the last underscore separated segment
				$version = \explode('_', \trim($value, '_'));
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom.com',
					'appname' => \trim($value, '_'),
					'appversion' => \end($version)
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['monitor']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'YahooMailProxy' => new props('exact', $fn['feed']),
			'PhxBot/' => new props('start', $fn['feed']), // proton mail
			'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Mastodon',
				'appname' => 'Pleroma',
				'appversion' => \mb_substr($value, 8) // skips "Pleroma" plus the one separator character after it
			]),
			'Outlook-Android/' => new props('start', fn (string $value) : array => [ // outlook for android
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-Android',
				'platform' => 'Android',
				'appversion' => \mb_substr($value, 16) // 16 = length of "Outlook-Android/"
			]),
			'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [ // outlook for ios
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-iOS',
				'platform' => 'iOS',

				// prefers the token that follows this one, falling back to the text after "Outlook-iOS/"
				'appversion' => $tokens[$i+1] ?? \mb_substr($value, 12)
			]),
			'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [ // outlook autodetect service
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'OutlookMobileCloudService-Autodetect',
				'appversion' => \mb_substr($value, 37) // 37 = length of "OutlookMobileCloudService-Autodetect/"
			]),
			'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
			'ViberUrlDownloader' => new props('exact', $fn['feed']),
			'Google-Lens' => new props('exact', $fn['feed']),
			'ManicTime/' => new props('start', $fn['feed']),
			'Yik Yak/' => new props('start', $fn['feed']),
			'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
			'W3C-checklink/' => new props('start', $fn['validator']),
			'CSSCheck/' => new props('start', $fn['validator']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),

			// matched with 'start' like every other key ending in "/", so that a version after the slash
			// is accepted (assumes 'start' is a prefix match and 'exact' a whole token match - TODO confirm in props)
			'SEO-Macroscope/' => new props('start', $fn['validator']),
			'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'Snapchat/' => new props('start', $fn['feed']),
			'HTTPClient/' => new props('start', $fn['scraper']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'Snap URL Preview Service' => new props('exact', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('start', $fn['scraper']),
			'curl/' => new props('any', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'libwww-perl/' => new props('start', $fn['scraper']),
			'http/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'http-client/' => new props('any', $fn['scraper']),
			'HttpClient/' => new props('any', $fn['scraper']),

			// generic patterns come last
			'Validator' => new props('any', $fn['validator']),
			'feed' => new props('any', $fn['feed']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map'])
		];
	}
}