1
|
|
|
<?php |
2
|
|
|
declare(strict_types = 1); |
3
|
|
|
namespace hexydec\agentzero; |
4
|
|
|
|
5
|
|
|
class crawlers { |
6
|
|
|
|
7
|
|
|
/**
 * Builds the application, version and category properties for a crawler token
 *
 * The token is split on the first "/" into a name and a version. The version has any leading "v"
 * characters removed and is then cut at the first character that is not a digit or a dot.
 * An empty array is returned when the token contains "://", starts with "Chrome/" (case-insensitive),
 * or is exactly "Cubot" or "Power bot" (case-insensitive).
 *
 * @param string $value The token to be processed
 * @param array<string|null> $data Existing properties to merge over the generated ones, a 'category' here is only used when the application has no category of its own
 * @return array<string|int|float|null> The $data array with type, app, appname, appversion and category added, or an empty array if the token was rejected
 */
public static function getApp(string $value, array $data = []) : array {

	// tokens holding a URL carry the bot name elsewhere, and the other exclusions are not crawlers
	if (\str_contains($value, '://') || \mb_stripos($value, 'Chrome/') === 0 || \strcasecmp('Cubot', $value) === 0 || \strcasecmp('Power bot', $value) === 0) {
		return [];
	}

	// separate the name from the version
	$pieces = \explode('/', $value, 2);
	$appname = $pieces[0];
	$version = $pieces[1] ?? '';

	// keep only the leading numeric part of the version
	if (!empty($version)) {
		$version = \ltrim($version, 'v');
		$version = \substr($version, 0, \strspn($version, '0123456789.'));
	}

	// categories for known applications, these take priority over any category passed in $data
	$categories = [
		'yacybot' => 'search',
		'googlebot' => 'search',
		'googlebot-mobile' => 'search',
		'googlebot-image' => 'search',
		'googlebot-video' => 'search',
		'googlebot-news' => 'search',
		'storebot-google' => 'search',
		'adsbot-google' => 'ads',
		'adsbot-google-mobile' => 'ads',
		'mediapartners-google' => 'ads',
		'bingbot' => 'search',
		'adidxbot' => 'ads',
		'duckduckbot' => 'search',
		'duckduckgo-favicons-bot' => 'search',
		'coccocbot-image' => 'search',
		'coccocbot-web' => 'search',
		'yandexbot' => 'search',
		'mj12bot' => 'search',
		'mail.ru_bot' => 'search',
		'exabot' => 'search',
		'uptimerobot' => 'monitor',
		'petalbot' => 'search',
		'twitterbot' => 'feed',
		'xbot' => 'feed',
		'discordbot' => 'feed',
		'sematextsyntheticsrobot' => 'monitor',
		'linkedinbot' => 'feed',
		'paperlibot' => 'feed',
		'bitlybot' => 'feed',
		'tineye-bot' => 'search',
		'pinterestbot' => 'feed',
		'webcrawler' => 'crawler',
		'webprosbot' => 'crawler',
		'guzzlehttp' => 'scraper',
		'telegrambot' => 'feed',
		'semrushbot' => 'crawler',
		'mediatoolkitbot' => 'crawler',
		'iploggerbot' => 'monitor',
		'baiduspider' => 'search',
		'baiduspider+' => 'search',
		'baiduspider-image+' => 'search',
		'baiduspider-ads' => 'ads',
		'haosouspider' => 'search',
		'yisouspider' => 'search',
		'360spider' => 'search',
		'sogou web spider' => 'search',
		'bytespider' => 'ai',
		'claudebot' => 'ai',
		'gptbot' => 'ai',
		'diffbot' => 'ai',
		'amazonbot' => 'ai',
		'applebot' => 'ai',
		'perplexitybot' => 'ai',
		'youbot' => 'ai',
		'iaskbot' => 'ai',
		'ccbot' => 'crawler',
		'wpbot' => 'ai',
		'imagesiftbot' => 'ai'
	];

	// display names for known applications, anything not listed is run through normaliseAppname()
	// NOTE(review): the keys containing '[email protected]' look like e-mail addresses that were redacted
	// when this file was extracted - confirm the real values against the canonical source
	$labels = [
		'googlebot' => 'Google Bot',
		'googlebot-mobile' => 'Google Bot',
		'googlebot-image' => 'Google Bot',
		'googlebot-video' => 'Google Bot',
		'googlebot-news' => 'Google Bot',
		'storebot-google' => 'Google Bot',
		'adsbot-google' => 'Google Bot',
		'google-adwords-instant' => 'Google Bot',
		'adsbot-google-mobile' => 'Google Bot',
		'mediapartners-google' => 'Google Bot',
		'google-safety' => 'Google Safety',
		'duckduckbot' => 'DuckDuck Bot',
		'duckduckbot-https' => 'DuckDuck Bot',
		'duckduckgo-favicons-bot' => 'DuckDuck Bot',
		'coccocbot-image' => 'Coccoc Bot',
		'coccocbot-web' => 'Coccoc Bot',
		'mj12bot' => 'Majestic 12 Bot',
		'exabot' => 'ExaBot',
		'twitterbot' => 'TwitterBot',
		'discordbot' => 'DiscordBot',
		'sematextsyntheticsrobot' => 'Sematext Synthetics Robot',
		'bitlybot' => 'Bit.ly Bot',
		'webprosbot' => 'WebprosBot',
		'mediatoolkitbot' => 'MediaToolkit Bot',
		'cfnetwork' => 'Apple Core Foundation Network',
		'ncsc web check [email protected]' => 'NCSC Web Check',
		'enhanced webcheck [email protected]' => 'NCSC Enhanced Web Check',
		'the national archives uk government web archive:' => 'UK Government National Archives',
		'google-inspectiontool' => 'Google Inspection Tool',
		'google-pagerenderer google' => 'Google Page Renderer',
		'pingdomtms' => 'Pingdom Bot',
		'facebookexternalhit' => 'Facebook URL Preview',
		'facebookcatalog' => 'Facebook',
		'meta-externalagent' => 'Meta External Agent',
		'meta-externalfetcher' => 'Meta External Fetcher',
		'phxbot' => 'ProtonMail Bot',
		'monitoring360bot' => 'Monitoring360 Bot',
		'cloudflare-healthchecks' => 'Cloudflare Health Checks',
		'cloudflare-alwaysonline' => 'Cloudflare Always Online',
		'cloudflare-traffic-manager' => 'Cloudflare-Traffic-Manager',
		'cloudflare-prefetch' => 'Cloudflare Prefetch',
		'cloudflare-ssldetector' => 'Cloudflare SSL Detector',
		'cloudflare-diagnostics' => 'Cloudflare Diagnostics',
		'ptst' => 'Cloudflare Speed Test',
		'citoid' => 'Wikimedia Citoid',
		'user-agent: seolyt' => 'SEOlyt',
		'bytespider' => 'ByteDance Spider',
		'[email protected]' => 'ByteDance Spider',
		'oai-searchbot' => 'OpenAI SearchBot',
		'semrushbot' => 'Semrush Bot',
		'semrushbot-si' => 'Semrush Bot',
		'semrushbot-ocob' => 'Semrush Bot',
		'semrushbot-swa' => 'Semrush Bot',
		'semrushbot-ba' => 'Semrush Bot',
		'siteauditbot' => 'Semrush Bot',
		'splitsignalbot' => 'Semrush Bot',
		'linkcheck by siteimprove.com' => 'SiteImprove Crawler',
		'sitecheck-sitecrawl by siteimprove.com' => 'SiteImprove Crawler',
		'image size by siteimprove.com' => 'SiteImprove Crawler',
		'probe by siteimprove.com' => 'SiteImprove Crawler',
		'by siteimprove.com' => 'SiteImprove Crawler',
		'magpie-crawler' => 'Brandwatch Magpie Crawler',
		'linkedinbot' => 'LinkedIn Bot',
		'dotbot' => 'Moz DotBot',
		'dataforseobot' => 'DataForSeo Bot',
		'wordpress' => 'WordPress',
		'prtg network monitor' => 'Paessler PRTG Bot',
		'prtgcloudbot' => 'Paessler PRTG Bot',
		'powershell' => 'PowerShell',
		'ccbot' => 'CommonCrawl Bot',
		'oncrawl' => 'OnCrawl Bot',
		'pycurl' => 'PycURL',
		'chatgpt-user' => 'ChatGPT User',
		'mail.ru_bot' => 'Mail.ru Bot',
		'wpbot' => 'Wpbot',
		'dnbcrawler-analytics' => 'DnB Crawler Analytics',
		'baiduspider-image+' => 'Baidu Spider',
		'baiduspider-render' => 'Baidu Spider',
		'baiduspider-ads' => 'Baidu Spider',
		'amazon-qbusiness' => 'Amazon Bot',
		'amazon cloudfront' => 'Amazon Bot',
		'amazonbot-video' => 'Amazon Bot',
		'hubspot crawler' => 'HubSpot Crawler',
		'wordpress.com mshots' => 'WordPress.com mShots',
		'wordpress.com' => 'WordPress',
		'p3p validator' => 'P3P Validator',
		'w3c-checklink' => 'W3C Checklink',
		'w3c_validator' => 'W3C Validator',
		'omgili' => 'Webz.io',
		'bluesky cardyb' => 'Bluesky'
	];

	// both tables are keyed by the lowercased name
	$key = \mb_strtolower($appname);

	// resolve the category: known application first, then the caller's value, then a guess from the token text
	$resolved = $categories[$key] ?? $data['category'] ?? null;
	if ($resolved === null) {
		$looksLikeCrawler = \mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false;
		$resolved = $looksLikeCrawler ? 'crawler' : 'scraper';
	}

	// $data overrides the generated properties, the resolved category overrides both
	$generated = [
		'type' => 'robot',
		'app' => $labels[$key] ?? self::normaliseAppname($appname),
		'appname' => $appname,
		'appversion' => empty($version) ? null : $version
	];
	return \array_merge($generated, $data, ['category' => $resolved]);
}
188
|
|
|
|
189
|
16 |
|
/**
 * Converts a raw application token into a readable application name
 *
 * Underscores and hyphens become spaces, plus signs are removed and a space is inserted before every
 * capital letter A-Z. Consecutive single letters are then joined back together so acronyms survive, and
 * a leading single letter is attached to the word following it. Finally "bot", "crawler" and "spider"
 * are split off as capitalised words, with "ro bot" stitched back into "Robot".
 *
 * @param string $name The application name as it appears in the user agent token
 * @return string The normalised application name
 */
public static function normaliseAppname(string $name) : string {

	// turn separators into spaces and put a space in front of each capital letter
	$capitals = \range('A', 'Z');
	$find = \array_merge(['_', '-', '+'], $capitals);
	$replace = \array_merge([' ', ' ', ''], \array_map(fn (string $letter) : string => ' '.$letter, $capitals));
	$spaced = \trim(\str_replace($find, $replace, $name));

	// rebuild the name word by word
	$output = '';
	$prevSingle = true;
	foreach (\explode(' ', $spaced) AS $index => $word) {
		if ($word === '') {
			continue;
		}
		$isSingle = \mb_strlen($word) === 1;

		// no separator between the letters of an acronym, or between a leading single letter and the second item
		$joined = $prevSingle && ($isSingle || $index === 1);
		$output .= ($joined ? '' : ' ').($isSingle ? $word : \ucfirst($word));
		$prevSingle = $isSingle;
	}

	// replace afterward for where it is preceded by an ACRONYM. The replacements run in order, so the
	// double spaces created when ' Bot' etc. follows an existing space are collapsed before 'ro bot' is rejoined
	return \trim(\str_ireplace(
		['bot', 'crawler', 'spider', '  ', 'ro bot'],
		[' Bot', ' Crawler', ' Spider', ' ', 'Robot'],
		$output
	));
}
204
|
|
|
|
205
|
|
|
/**
 * Generates a configuration array for matching crawlers
 *
 * Each key is the string to look for in a user agent token. Each value is a props object built from a
 * match mode ('start', 'exact', 'any' or 'end') and either a fixed array of properties or a callback
 * that returns the properties for the matched token. Callbacks that need neighbouring tokens accept
 * ($value, $i, $tokens); the code reads $tokens[$i+1] as the token after the current one.
 * NOTE(review): the entries are presumably evaluated in array order, so the generic 'bot' / 'spider' /
 * 'crawler' rules are kept last - confirm against the code that consumes this array.
 *
 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
 */
public static function get() : array {

	// shared callbacks that pass the token to getApp() with a default category
	$fn = [
		'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
		'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
		'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
		'ai' => fn (string $value) : array => self::getApp($value, ['category' => 'ai']),
		'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
			\str_contains($value, 'WhatsApp/') ? [
				'app' => 'WhatsApp'
			] : [],
			[
				'category' => 'feed'
			]
		)),
		'crawler' => fn (string $value) : array => self::getApp($value, ['category' => 'crawler']),
		'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
		'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
		'map' => fn (string $value) : array => self::getApp($value) // category comes from getApp() alone
	];
	return [
		'Mozlila/' => new props('start', [ // misspelling of "Mozilla/"
			'type' => 'robot',
			'category' => 'scraper'
		]),
		'Moblie' => new props('exact', [ // some samsung devices mispelt it
			'type' => 'robot',
			'category' => 'scraper'
		]),
		'HeadlessChrome/' => new props('start', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'crawler',
			'browser' => 'HeadlessChrome',
			'browserversion' => \mb_substr($value, 15)
		]),
		'Yahoo! Slurp' => new props('start', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'search',
			'app' => 'Yahoo! Slurp',
			'appname' => $value
		]),
		'Google-Site-Verification/' => new props('start', $fn['validator']),
		'Google-InspectionTool/' => new props('start', $fn['validator']),
		'Google-Safety' => new props('exact', $fn['validator']),
		'Google-Read-Aloud' => new props('exact', $fn['feed']),
		'Google AppsViewer' => new props('exact', $fn['feed']),
		'Mediapartners-Google' => new props('start', $fn['search']),
		'FeedFetcher-Google' => new props('exact', $fn['feed']),
		'Google-PageRenderer' => new props('start', $fn['crawler']),
		'GoogleProducer' => new props('exact', $fn['feed']),
		'Google-adstxt' => new props('exact', $fn['ads']),
		'Google-Adwords-Instant' => new props('exact', $fn['ads']),
		'CFNetwork/' => new props('start', $fn['feed']),
		'Siteimprove.com' => new props('any', fn (string $value) : array => \array_merge([
			'url' => 'https://siteimprove.com'
		], $fn['crawler']($value))),
		'SEOlyt/' => new props('any', $fn['crawler']),
		'CyotekWebCopy' => new props('start', $fn['scraper']),
		'Yandex' => new props('start', function (string $value) : array {
			$parts = \explode('/', $value, 3);
			return [
				'type' => 'robot',
				'category' => 'search',
				'app' => 'Yandex Bot',
				'appname' => $parts[0],
				'appversion' => $parts[1] ?? null
			];
		}),
		'Google Page Speed Insights' => new props('exact', $fn['validator']),
		'Qwantify' => new props('start', function (string $value) : array {
			$parts = \explode('/', $value, 3);
			return [
				'type' => 'robot',
				'category' => 'search',
				'app' => 'Qwant Web Crawler',
				'appname' => $parts[0],
				'appversion' => $parts[1] ?? null
			];
		}),
		'amazon-kendra' => new props('start', fn () : array => [
			'type' => 'robot',
			'category' => 'crawler',
			'app' => 'Amazon Bot',
			'appname' => 'Amazon Kendra'
		]),
		'amazon-QBusiness' => new props('exact', $fn['ai']),
		'amazon CloudFront' => new props('exact', $fn['validator']),
		'Amazonbot-Video/' => new props('start', $fn['crawler']),
		'okhttp' => new props('start', $fn['scraper']),
		'python' => new props('start', $fn['scraper']),
		'grpc-python/' => new props('start', $fn['scraper']),
		'LWP::Simple/' => new props('start', $fn['scraper']),
		'jsdom/' => new props('start', $fn['scraper']),
		'Nessus' => new props('start', $fn['monitor']),
		'monitoring360bot' => new props('start', $fn['monitor']),
		'Cloudflare' => new props('start', $fn['validator']),
		'PTST/' => new props('start', $fn['validator']),
		'+https://developers.cloudflare.com/security-center/' => new props('exact', $fn['monitor']),
		'AppSignalBot/' => new props('start', $fn['monitor']),
		'Better Uptime Bot' => new props('start', [
			'type' => 'robot',
			'category' => 'monitor',
			'app' => 'Better Uptime Bot',
			'appname' => 'Better Uptime Bot'
		]),
		'Chrome-Lighthouse' => new props('start', $fn['validator']),
		'Siege/' => new props('start', $fn['validator']),
		'Microsoft Profiling/' => new props('any', $fn['validator']),
		'Bidtellect' => new props('start', $fn['crawler']),
		'magpie-crawler/' => new props('start', $fn['crawler']),
		'Web Measure/' => new props('start', $fn['crawler']),
		'Bluesky Cardyb/' => new props('start', $fn['crawler']),
		'PingdomTMS/' => new props('start', $fn['monitor']),
		'DynGate' => new props('exact', $fn['monitor']),
		'CensysInspect/' => new props('start', $fn['monitor']),
		'Datadog/Synthetics' => new props('exact', [
			'type' => 'robot',
			'category' => 'monitor',
			'app' => 'Datadog/Synthetics'
		]),
		'RuxitSynthetic/' => new props('start', $fn['monitor']),
		'Checkly/' => new props('start', $fn['monitor']),
		'Uptime/' => new props('start', $fn['monitor']),
		'HostTracker/' => new props('start', $fn['monitor']),
		// NOTE(review): '[email protected]' in the next two keys looks like a redacted e-mail address - confirm against the canonical source
		'NCSC Web Check [email protected]' => new props('exact', $fn['monitor']),
		'Enhanced WebCheck [email protected]' => new props('exact', $fn['monitor']),
		'Pingdom.com' => new props('start', function (string $value) : array {
			$version = \explode('_', \trim($value, '_')); // the version is the last underscore separated segment
			return [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Pingdom Bot',
				'appname' => \trim($value, '_'),
				'appversion' => \end($version)
			];
		}),
		'proximic' => new props('exact', $fn['ads']),
		'WordPress' => new props('start', $fn['feed']),
		'PRTG Network Monitor' => new props('exact', $fn['monitor']),
		'PRTGCloudBot/' => new props('start', $fn['monitor']),
		'Site24x7' => new props('exact', $fn['monitor']),
		'StatusCake' => new props('exact', $fn['monitor']),
		'AWS Network Health' => new props('start', $fn['monitor']),
		'adbeat.com' => new props('start', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'ads',
			'app' => 'Adbeat',
			'appname' => 'Adbeat',
			'url' => 'https://'.$value
		]),
		'MicrosoftPreview/' => new props('start', $fn['feed']),
		'YahooMailProxy' => new props('exact', $fn['feed']),
		'PhxBot/' => new props('start', $fn['feed']), // proton mail
		'Embedly/' => new props('start', $fn['feed']),
		'PayPal IPN' => new props('exact', $fn['feed']),
		'DropboxPreviewBot/' => new props('start', $fn['feed']),
		'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
			'type' => 'robot',
			'category' => 'feed',
			'app' => 'Mastodon',
			'appname' => 'Pleroma',
			'appversion' => \mb_substr($value, 8)
		]),
		'Outlook-Android/' => new props('start', fn (string $value) : array => [ // outlook on android
			'type' => 'robot',
			'category' => 'feed',
			'app' => 'Outlook',
			'appname' => 'Outlook-Android',
			'platform' => 'Android',
			'appversion' => \mb_substr($value, 16)
		]),
		'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [ // outlook on ios
			'type' => 'robot',
			'category' => 'feed',
			'app' => 'Outlook',
			'appname' => 'Outlook-iOS',
			'platform' => 'iOS',
			'appversion' => $tokens[$i+1] ?? \mb_substr($value, 12) // prefer the following token when there is one
		]),
		'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'feed',
			'app' => 'Outlook',
			'appname' => 'OutlookMobileCloudService-Autodetect',
			'appversion' => \mb_substr($value, 37)
		]),
		'HubSpot Connect ' => new props('start', function (string $value, int $i, array $tokens) : array {
			$app = 'HubSpot Connect';
			$count = \count($tokens);

			// look ahead for a "namespace: " token and use it, plus the token after it, as the app name
			for ($n = $i; $n < $count; $n++) {
				if (\str_starts_with($tokens[$n], 'namespace: ')) {
					$app = \mb_substr($tokens[$n], 11).' - '.($tokens[$n+1] ?? ''); // guard against the namespace being the last token
					break;
				}
			}
			return [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'HubSpot Connect',
				'appname' => $app,
				'appversion' => \mb_substr($value, 16)
			];
		}),
		'Pro-Sitemaps/' => new props('start', $fn['crawler']),
		'Pandalytics/' => new props('start', $fn['crawler']),
		'omgili/' => new props('start', $fn['crawler']),
		// 'CCBot/' => new props('start', $fn['crawler']),
		'The National Archives UK Government Web Archive' => new props('start', $fn['crawler']),
		'Citoid' => new props('exact', $fn['crawler']),
		'trendictionbot' => new props('start', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'crawler',
			'app' => 'Trendicion Bot', // NOTE(review): possibly a misspelling of "Trendiction" - left unchanged as it is a runtime value
			'appname' => 'trendictionbot',
			'appversion' => \mb_substr($value, 14)
		]),
		'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
		'ViberUrlDownloader' => new props('exact', $fn['feed']),
		'GoogleDocs' => new props('exact', fn (string $value, int $i, array $tokens) : array => [
			'type' => 'robot',
			'category' => 'feed',
			'app' => 'Google Docs',
			'appname' => $value.'; '.($tokens[$i+1] ?? '') // guard against GoogleDocs being the last token
		]),
		'Google-Lens' => new props('exact', $fn['feed']),
		'ManicTime/' => new props('start', $fn['feed']),
		'Yik Yak/' => new props('start', $fn['feed']),
		'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
		'AppleExchangeWebServices/' => new props('start', $fn['feed']),
		'The Lounge IRC Client' => new props('exact', $fn['feed']),
		'W3C-checklink/' => new props('start', $fn['validator']),
		'CSSCheck/' => new props('start', $fn['validator']),
		'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
		'SEO-Macroscope/' => new props('start', $fn['validator']),
		'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
		'Barracuda Sentinel' => new props('start', $fn['validator']),
		'Expanse' => new props('start', $fn['crawler']),
		'eCairn-Grabber/' => new props('start', $fn['scraper']),
		'SEOkicks' => new props('exact', $fn['crawler']),
		'PostmanRuntime/' => new props('start', $fn['scraper']),
		'axios/' => new props('start', $fn['scraper']),
		'Rogerbot/' => new props('start', $fn['crawler']),
		'DashLinkPreviews/' => new props('start', $fn['feed']),
		'Snapchat/' => new props('start', $fn['feed']),
		'HTTPClient/' => new props('start', $fn['scraper']),
		'WhatsApp/' => new props('any', $fn['feed']),
		'Hootsuite-Authoring/' => new props('start', $fn['feed']),
		'URL Preview' => new props('any', $fn['feed']),
		'Link Preview' => new props('any', $fn['feed']),
		'ApacheBench/' => new props('start', $fn['validator']),
		'Wheregoes.com Redirect Checker/' => new props('start', $fn['validator']),
		'Asana/' => new props('start', $fn['feed']),
		'Java/' => new props('any', fn (string $value) : array => [
			'type' => 'robot',
			'category' => 'scraper',
			'app' => 'Java',
			'appname' => $value,
			'appversion' => \explode('/', $value, 3)[1] ?? null
		]),
		'curl/' => new props('any', $fn['scraper']),
		'Wget/' => new props('start', $fn['scraper']),
		'rest-client/' => new props('start', $fn['scraper']),
		'ruby/' => new props('start', $fn['scraper']),
		'Bun/' => new props('start', $fn['scraper']),
		'CakePHP' => new props('start', $fn['scraper']),
		'cpp-httplib/' => new props('start', $fn['scraper']),
		'Dart/' => new props('start', $fn['scraper']),
		'Deno/' => new props('start', $fn['scraper']),
		'Datadog' => new props('start', $fn['scraper']),
		// 'libwww-perl/' => new props('start', $fn['scraper']),
		'http/' => new props('start', $fn['scraper']),
		'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
		'http-client/' => new props('any', $fn['scraper']),
		'HttpClient/' => new props('any', $fn['scraper']),
		'PowerShell/' => new props('start', $fn['scraper']),
		'OAI-SearchBot/' => new props('start', $fn['search']),
		'Google-Extended' => new props('start', $fn['ai']),
		'ChatGPT-User/' => new props('start', $fn['ai']),
		'Cohere' => new props('start', $fn['ai']),
		'facebookexternalhit/' => new props('start', $fn['feed']),
		'facebookcatalog/' => new props('start', $fn['crawler']),
		'meta-externalagent' => new props('start', $fn['ai']),
		'meta-externalfetcher' => new props('start', $fn['feed']),
		'Validator' => new props('any', $fn['validator']),
		'feed' => new props('any', $fn['feed']),
		'bot/' => new props('any', $fn['map']),
		'bot-' => new props('any', $fn['map']),
		' bot ' => new props('any', $fn['map']),
		'bot' => new props('end', $fn['map']),
		'spider' => new props('any', $fn['crawler']),
		'crawler' => new props('any', $fn['map']),
	];
}
504
|
|
|
} |