|
1
|
|
|
<?php |
|
2
|
|
|
declare(strict_types = 1); |
|
3
|
|
|
namespace hexydec\agentzero; |
|
4
|
|
|
|
|
5
|
|
|
class crawlers {

	/**
	 * Extracts application and version information from a token
	 *
	 * Tokens that contain a URL ("://"), start with "Chrome/", or are exactly "Cubot" / "Power bot"
	 * (compared case-insensitively) are not treated as crawlers and produce an empty array.
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge, its values override the extracted ones (Except 'category', see below)
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array if the token was rejected
	 */
	public static function getApp(string $value, array $data = []) : array {
		if (!\str_contains($value, '://') && \mb_stripos($value, 'Chrome/') !== 0 && \strcasecmp('Cubot', $value) !== 0 && \strcasecmp('Power bot', $value) !== 0) { // bot will be in the URL
			$parts = \explode('/', $value, 2);

			// process version: strip a leading "v" and keep only the leading run of digits and dots
			if (!empty($parts[1])) {
				$parts[1] = \ltrim($parts[1], 'v');
				$parts[1] = \substr($parts[1], 0, \strspn($parts[1], '0123456789.'));
			}

			// category by application name - keys MUST be lowercase as they are looked up with the lowercased name
			$category = [
				'yacybot' => 'search',
				'googlebot' => 'search',
				'googlebot-mobile' => 'search',
				'googlebot-image' => 'search',
				'googlebot-video' => 'search',
				'googlebot-news' => 'search',
				'storebot-google' => 'search',
				'adsbot-google' => 'ads',
				'adsbot-google-mobile' => 'ads',
				'mediapartners-google' => 'ads',
				'bingbot' => 'search',
				'adidxbot' => 'ads',
				'duckduckbot' => 'search',
				'duckduckgo-favicons-bot' => 'search',
				'coccocbot-image' => 'search',
				'coccocbot-web' => 'search',
				'yandexbot' => 'search',
				'mj12bot' => 'search',
				'mail.ru_bot' => 'search',
				'exabot' => 'search',
				'uptimerobot' => 'monitor',
				'petalbot' => 'search',
				'twitterbot' => 'feed',
				'xbot' => 'feed',
				'discordbot' => 'feed',
				'sematextsyntheticsrobot' => 'monitor',
				'linkedinbot' => 'feed',
				'paperlibot' => 'feed',
				'bitlybot' => 'feed',
				'tineye-bot' => 'search',
				'pinterestbot' => 'feed',
				'webcrawler' => 'crawler',
				'webprosbot' => 'crawler',
				'guzzlehttp' => 'scraper',
				'telegrambot' => 'feed',
				'semrushbot' => 'crawler',
				'mediatoolkitbot' => 'crawler',
				'iploggerbot' => 'monitor',
				'baiduspider' => 'search',
				'baiduspider+' => 'search',
				'baiduspider-image+' => 'search',
				'baiduspider-ads' => 'ads',
				'haosouspider' => 'search',
				'yisouspider' => 'search',
				'360spider' => 'search',
				'sogou web spider' => 'search',
				'bytespider' => 'ai',
				'claudebot' => 'ai',
				'gptbot' => 'ai',
				'diffbot' => 'ai',
				'amazonbot' => 'ai',
				'applebot' => 'ai',
				'perplexitybot' => 'ai',
				'youbot' => 'ai',
				'iaskbot' => 'ai',
				'ccbot' => 'crawler',
				'wpbot' => 'ai',
				'imagesiftbot' => 'ai',
				'aihitbot' => 'ai',
				'andibot' => 'ai',
				'bedrockbot' => 'ai',
				'addsearchbot' => 'ai',
				'ai2bot' => 'ai',
				'google-cloudvertexbot' => 'ai',
				'duckassistbot' => 'ai',
				'echobot bot' => 'ai',
				'echoboxbot' => 'ai',
				'factset_spyderbot' => 'ai',
				'kangaroo bot' => 'ai',
				'linerbot' => 'ai',
				'mycentralaiscraperbot' => 'ai',
				'omgilibot' => 'crawler', // webz.io
				'webzio' => 'crawler',
				'pangubot' => 'ai', // huawei
				'phindbot' => 'ai',
				'qualifiedbot' => 'ai',
				'quillbot' => 'ai',
				'sbintuitionsbot' => 'ai',
				'sidetradebot' => 'ai',
				'thinkbot' => 'ai',
				'timpibot' => 'ai',
				'wardbot' => 'monitor'
			];

			// display name by application name - keys are lowercase
			// NOTE(review): the "[email protected]" fragments look like obfuscated e-mail addresses - confirm the intended literal values
			$apps = [
				'googlebot' => 'Google Bot',
				'googlebot-mobile' => 'Google Bot',
				'googlebot-image' => 'Google Bot',
				'googlebot-video' => 'Google Bot',
				'googlebot-news' => 'Google Bot',
				'storebot-google' => 'Google Bot',
				'adsbot-google' => 'Google Bot',
				'google-adwords-instant' => 'Google Bot',
				'adsbot-google-mobile' => 'Google Bot',
				'mediapartners-google' => 'Google Bot',
				'google-safety' => 'Google Safety',
				'duckduckbot' => 'DuckDuck Bot',
				'duckduckbot-https' => 'DuckDuck Bot',
				'duckduckgo-favicons-bot' => 'DuckDuck Bot',
				'coccocbot-image' => 'Coccoc Bot',
				'coccocbot-web' => 'Coccoc Bot',
				'mj12bot' => 'Majestic 12 Bot',
				'exabot' => 'ExaBot',
				'twitterbot' => 'TwitterBot',
				'discordbot' => 'DiscordBot',
				'sematextsyntheticsrobot' => 'Sematext Synthetics Robot',
				'bitlybot' => 'Bit.ly Bot',
				'webprosbot' => 'WebprosBot',
				'mediatoolkitbot' => 'MediaToolkit Bot',
				'cfnetwork' => 'Apple Core Foundation Network',
				'ncsc web check [email protected]' => 'NCSC Web Check',
				'enhanced webcheck [email protected]' => 'NCSC Enhanced Web Check',
				'the national archives uk government web archive:' => 'UK Government National Archives',
				'google-inspectiontool' => 'Google Inspection Tool',
				'google-pagerenderer google' => 'Google Page Renderer',
				'pingdomtms' => 'Pingdom Bot',
				'facebookexternalhit' => 'Facebook URL Preview',
				'facebookcatalog' => 'Facebook',
				'meta-externalagent' => 'Meta External Agent',
				'meta-externalfetcher' => 'Meta External Fetcher',
				'phxbot' => 'ProtonMail Bot',
				'monitoring360bot' => 'Monitoring360 Bot',
				'cloudflare-healthchecks' => 'Cloudflare Health Checks',
				'cloudflare-alwaysonline' => 'Cloudflare Always Online',
				'cloudflare-traffic-manager' => 'Cloudflare-Traffic-Manager',
				'cloudflare-prefetch' => 'Cloudflare Prefetch',
				'cloudflare-ssldetector' => 'Cloudflare SSL Detector',
				'cloudflare-diagnostics' => 'Cloudflare Diagnostics',
				'ptst' => 'Cloudflare Speed Test',
				'citoid' => 'Wikimedia Citoid',
				'user-agent: seolyt' => 'SEOlyt',
				'bytespider' => 'ByteDance Spider',
				'[email protected]' => 'ByteDance Spider',
				'oai-searchbot' => 'OpenAI SearchBot',
				'semrushbot' => 'Semrush Bot',
				'semrushbot-si' => 'Semrush Bot',
				'semrushbot-ocob' => 'Semrush Bot',
				'semrushbot-swa' => 'Semrush Bot',
				'semrushbot-ba' => 'Semrush Bot',
				'siteauditbot' => 'Semrush Bot',
				'splitsignalbot' => 'Semrush Bot',
				'linkcheck by siteimprove.com' => 'SiteImprove Crawler',
				'sitecheck-sitecrawl by siteimprove.com' => 'SiteImprove Crawler',
				'image size by siteimprove.com' => 'SiteImprove Crawler',
				'probe by siteimprove.com' => 'SiteImprove Crawler',
				'by siteimprove.com' => 'SiteImprove Crawler',
				'magpie-crawler' => 'Brandwatch Magpie Crawler',
				'linkedinbot' => 'LinkedIn Bot',
				'dotbot' => 'Moz DotBot',
				'dataforseobot' => 'DataForSeo Bot',
				'wordpress' => 'WordPress',
				'prtg network monitor' => 'Paessler PRTG Bot',
				'prtgcloudbot' => 'Paessler PRTG Bot',
				'powershell' => 'PowerShell',
				'ccbot' => 'CommonCrawl Bot',
				'oncrawl' => 'OnCrawl Bot',
				'pycurl' => 'PycURL',
				'chatgpt-user' => 'ChatGPT User',
				'mail.ru_bot' => 'Mail.ru Bot',
				'wpbot' => 'Wpbot',
				'dnbcrawler-analytics' => 'DnB Crawler Analytics',
				'baiduspider-image+' => 'Baidu Spider',
				'baiduspider-render' => 'Baidu Spider',
				'baiduspider-ads' => 'Baidu Spider',
				'amazon-qbusiness' => 'Amazon Bot',
				'amazon cloudfront' => 'Amazon Bot',
				'amazonbot-video' => 'Amazon Bot',
				'hubspot crawler' => 'HubSpot Crawler',
				'wordpress.com mshots' => 'WordPress.com mShots',
				'wordpress.com' => 'WordPress',
				'p3p validator' => 'P3P Validator',
				'w3c-checklink' => 'W3C Checklink',
				'w3c_validator' => 'W3C Validator',
				'omgili' => 'Webz.io',
				'bluesky cardyb' => 'Bluesky'
			];

			$lower = \mb_strtolower($parts[0]);

			// merge order: extracted defaults, then caller data, then the resolved category (known name > caller category > heuristic on the token)
			return \array_merge([
				'type' => 'robot',
				'app' => $apps[$lower] ?? self::normaliseAppname($parts[0]),
				'appname' => $parts[0],
				'appversion' => empty($parts[1]) ? null : $parts[1]
			], $data, [
				'category' => $category[$lower] ?? $data['category'] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper')
			]);
		}
		return [];
	}

	/**
	 * Converts a raw application name into a spaced, capitalised display name
	 *
	 * Underscores and hyphens become spaces, plus signs are removed, a space is inserted before every
	 * ASCII capital, consecutive single letters are joined back together (acronyms), and the words
	 * "bot", "crawler" and "spider" are split off into their own capitalised word.
	 *
	 * @param string $name The application name to normalise, e.g. "GPTBot"
	 * @return string The normalised name, e.g. "GPT Bot"
	 */
	public static function normaliseAppname(string $name) : string {
		$find = ['_', '-', '+', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
		$replace = [' ', ' ', '', ' A', ' B', ' C', ' D', ' E', ' F', ' G', ' H', ' I', ' J', ' K', ' L', ' M', ' N', ' O', ' P', ' Q', ' R', ' S', ' T', ' U', ' V', ' W', ' X', ' Y', ' Z'];
		$name = \trim(\str_replace($find, $replace, $name));
		$output = '';
		$single = true;
		foreach (\explode(' ', $name) AS $key => $item) {
			if ($item !== '') {
				$currsingle = \mb_strlen($item) === 1;

				// no separator while in a run of single letters, or when the second item follows a single letter
				$output .= ($single && ($currsingle || $key === 1) ? '' : ' ').(!$currsingle ? \ucfirst($item) : $item);
				$single = $currsingle;
			}
		}

		// replace afterward for where it is preceded by an acronym
		$output = \str_ireplace(['bot', 'crawler', 'spider'], [' Bot', ' Crawler', ' Spider'], $output);

		// the replacements above add a leading space even where one already existed, so collapse runs of spaces
		$output = \preg_replace('/ {2,}/', ' ', $output) ?? $output;

		// "Robot" will have been split into "Ro Bot" by the 'bot' replacement, rejoin it
		return \trim(\str_ireplace('ro bot', 'Robot', $output));
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
	 */
	public static function get() : array {

		// reusable callbacks that run the token through getApp() with a preset category
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'ai' => fn (string $value) : array => self::getApp($value, ['category' => 'ai']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => fn (string $value) : array => self::getApp($value, ['category' => 'crawler']),
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
			'map' => fn (string $value) : array => self::getApp($value) // category decided by getApp()
		];
		return [
			'Mozlila/' => new props('start', [ // misspelt "Mozilla/" token
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'Moblie' => new props('exact', [ // some samsung devices mispelt it
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'HeadlessChrome/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'browser' => 'HeadlessChrome',
				'browserversion' => \mb_substr($value, 15)
			]),
			'Yahoo! Slurp' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'search',
				'app' => 'Yahoo! Slurp',
				'appname' => $value
			]),
			'Google-Site-Verification/' => new props('start', $fn['validator']),
			'Google-InspectionTool/' => new props('start', $fn['validator']),
			'Google-Safety' => new props('exact', $fn['validator']),
			'Google-Read-Aloud' => new props('exact', $fn['feed']),
			'Google AppsViewer' => new props('exact', $fn['feed']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'Google-PageRenderer' => new props('start', $fn['crawler']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'Google-Adwords-Instant' => new props('exact', $fn['ads']),
			'Gemini-Deep-Research' => new props('exact', $fn['ai']),
			'GoogleAgent-Mariner' => new props('exact', $fn['ai']),
			'CFNetwork/' => new props('start', $fn['feed']),
			'Siteimprove.com' => new props('any', fn (string $value) : array => \array_merge([
				'url' => 'https://siteimprove.com'
			], $fn['crawler']($value))),
			'SEOlyt/' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'scrapy' => new props('start', $fn['scraper']),
			'Yandex' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Yandex Bot',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Qwant Web Crawler',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'amazon-kendra' => new props('start', fn () : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Amazon Bot',
				'appname' => 'Amazon Kendra'
			]),
			'amazon-QBusiness' => new props('exact', $fn['ai']),
			'amazon CloudFront' => new props('exact', $fn['validator']),
			'Amazonbot-Video/' => new props('start', $fn['crawler']),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'grpc-python/' => new props('start', $fn['scraper']),
			'LWP::Simple/' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'monitoring360bot' => new props('start', $fn['monitor']),
			'Cloudflare' => new props('start', $fn['validator']),
			'PTST/' => new props('start', $fn['validator']),
			'+https://developers.cloudflare.com/security-center/' => new props('exact', $fn['monitor']),
			'AppSignalBot/' => new props('start', $fn['monitor']),
			'Better Uptime Bot' => new props('start', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Better Uptime Bot',
				'appname' => 'Better Uptime Bot'
			]),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'Microsoft Profiling/' => new props('any', $fn['validator']),
			'Bidtellect' => new props('start', $fn['crawler']),
			'magpie-crawler/' => new props('start', $fn['crawler']),
			'Web Measure/' => new props('start', $fn['crawler']),
			'Bluesky Cardyb/' => new props('start', $fn['crawler']),
			'PingdomTMS/' => new props('start', $fn['monitor']),
			'DynGate' => new props('exact', $fn['monitor']),
			'CensysInspect/' => new props('start', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),
			// NOTE(review): the "[email protected]" fragments look like obfuscated e-mail addresses - confirm the intended literal values
			'NCSC Web Check [email protected]' => new props('exact', $fn['monitor']),
			'Enhanced WebCheck [email protected]' => new props('exact', $fn['monitor']),
			'Pingdom.com' => new props('start', function (string $value) : array {
				$version = \explode('_', \trim($value, '_'));
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom Bot',
					'appname' => \trim($value, '_'),
					'appversion' => \end($version) // the last underscore separated segment
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['feed']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'AWS Network Health' => new props('start', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'YahooMailProxy' => new props('exact', $fn['feed']),
			'PhxBot/' => new props('start', $fn['feed']), // proton mail
			'Embedly/' => new props('start', $fn['feed']),
			'PayPal IPN' => new props('exact', $fn['feed']),
			'DropboxPreviewBot/' => new props('start', $fn['feed']),
			'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Mastodon',
				'appname' => 'Pleroma',
				'appversion' => \mb_substr($value, 8)
			]),
			'Outlook-Android/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-Android',
				'platform' => 'Android',
				'appversion' => \mb_substr($value, 16)
			]),
			'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-iOS',
				'platform' => 'iOS',
				'appversion' => $tokens[$i + 1] ?? \mb_substr($value, 12) // prefer the following token, else the text after the slash
			]),
			'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'OutlookMobileCloudService-Autodetect',
				'appversion' => \mb_substr($value, 37)
			]),
			'HubSpot Connect ' => new props('start', function (string $value, int $i, array $tokens) : array {
				$app = 'HubSpot Connect';
				$count = \count($tokens);

				// look from the current token onward for a "namespace: " token to build a more specific name
				for ($n = $i; $n < $count; $n++) {
					if (\str_starts_with($tokens[$n], 'namespace: ')) {
						$app = \mb_substr($tokens[$n], 11).' - '.($tokens[$n + 1] ?? ''); // guard: the namespace token may be the last one
						break;
					}
				}
				return [
					'type' => 'robot',
					'category' => 'feed',
					'app' => 'HubSpot Connect',
					'appname' => $app,
					'appversion' => \mb_substr($value, 16) ?: null
				];
			}),
			'TikTokSpider' => new props('start', $fn['feed']),
			'Pro-Sitemaps/' => new props('start', $fn['crawler']),
			'Pandalytics/' => new props('start', $fn['crawler']),
			'omgili/' => new props('start', $fn['crawler']),
			'AwarioBot/' => new props('start', $fn['crawler']),
			'AwarioSmartBot/' => new props('start', $fn['crawler']),
			'AwarioRssBot/' => new props('start', $fn['crawler']),
			'ICC-Crawler/' => new props('start', $fn['crawler']),
			'The National Archives UK Government Web Archive' => new props('start', $fn['crawler']),
			'Citoid' => new props('exact', $fn['crawler']),
			'trendictionbot' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Trendicion Bot',
				'appname' => 'trendictionbot',
				'appversion' => \mb_substr($value, 14) ?: null
			]),
			'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
			'ViberUrlDownloader' => new props('exact', $fn['feed']),
			'GoogleDocs' => new props('exact', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Google Docs',
				'appname' => $value.'; '.($tokens[$i + 1] ?? '') // guard: there may be no following token
			]),
			'Google-Lens' => new props('exact', $fn['feed']),
			'ManicTime/' => new props('start', $fn['feed']),
			'Yik Yak/' => new props('start', $fn['feed']),
			'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
			'AppleExchangeWebServices/' => new props('start', $fn['feed']),
			'The Lounge IRC Client' => new props('exact', $fn['feed']),
			'W3C-checklink/' => new props('start', $fn['validator']),
			'CSSCheck/' => new props('start', $fn['validator']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			'SEO-Macroscope/' => new props('start', $fn['validator']),
			'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
			'Barracuda Sentinel' => new props('start', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'Snapchat/' => new props('start', $fn['feed']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'URL Preview' => new props('any', $fn['feed']),
			'Link Preview' => new props('any', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Wheregoes.com Redirect Checker/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('any', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'scraper',
				'app' => 'Java',
				'appname' => $value,
				'appversion' => \explode('/', $value, 3)[1]
			]),
			'curl/' => new props('any', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'Datadog' => new props('start', $fn['scraper']),
			// 'libwww-perl/' => new props('start', $fn['scraper']),
			'http/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'http-client/' => new props('any', $fn['scraper']),
			'HttpClient/' => new props('any', $fn['scraper']),
			'PowerShell/' => new props('start', $fn['scraper']),
			'node-fetch' => new props('exact', $fn['scraper']),
			'OAI-SearchBot/' => new props('start', $fn['search']),
			'iaskspider/' => new props('start', $fn['search']),
			'MeltwaterNews' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Meltwater News',
				'appname' => 'MeltwaterNews',
				'url' => \mb_substr($value, 14) ?: null
			]),
			'Google-Extended' => new props('start', $fn['ai']),
			'ChatGPT-User/' => new props('start', $fn['feed']),
			'Cohere' => new props('start', $fn['ai']),
			'facebookexternalhit/' => new props('start', $fn['feed']),
			'facebookcatalog/' => new props('start', $fn['crawler']),
			'meta-externalagent' => new props('start', $fn['ai']),
			'meta-externalfetcher' => new props('start', $fn['feed']),
			'BrightBot ' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ai',
				'app' => 'Bright Bot',
				'appname' => 'BrightBot',
				'appversion' => \mb_substr($value, 10) ?: null
			]),
			'anthropic-ai' => new props('start', $fn['ai']),
			'bigsur.ai' => new props('start', $fn['ai']),
			'Claude User' => new props('start', $fn['ai']),
			'Claude Web' => new props('start', $fn['ai']),
			'cohere-ai' => new props('start', $fn['ai']),
			'cohere-training-data-crawler' => new props('start', $fn['ai']),
			'Cotoyogi' => new props('start', $fn['ai']),
			'Crawlspace' => new props('start', $fn['ai']),
			'Datenbank Crawler' => new props('start', $fn['ai']),
			'Devin' => new props('start', $fn['ai']),
			'FirecrawlAgent' => new props('start', $fn['ai']),
			'FriendlyCrawler' => new props('start', $fn['ai']),
			'MistralAI-User' => new props('start', $fn['ai']),
			'NovaAct' => new props('start', $fn['ai']), // amazon
			'Panscient' => new props('start', $fn['ai']),
			'pantest' => new props('start', $fn['ai']),
			'Perplexity' => new props('start', $fn['ai']),
			'VelenPublicWebCrawler' => new props('start', $fn['ai']),

			// generic catch-all patterns
			'Validator' => new props('any', $fn['validator']),
			'feed' => new props('any', $fn['feed']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map'])
		];
	}
}