1 | <?php |
||
2 | declare(strict_types = 1); |
||
3 | namespace hexydec\agentzero; |
||
4 | |||
class crawlers {

	/**
	 * Maps a lower-cased application name to the crawler category it belongs to.
	 *
	 * Keys MUST be lower case: getApp() looks names up after passing them through mb_strtolower().
	 *
	 * @var array<string,string>
	 */
	private const CATEGORIES = [
		'yacybot' => 'search',
		'googlebot' => 'search',
		'googlebot-mobile' => 'search',
		'googlebot-image' => 'search',
		'googlebot-video' => 'search',
		'googlebot-news' => 'search',
		'storebot-google' => 'search',
		'adsbot-google' => 'ads',
		'adsbot-google-mobile' => 'ads',
		'mediapartners-google' => 'ads',
		'bingbot' => 'search',
		'adidxbot' => 'ads',
		'duckduckbot' => 'search',
		'duckduckgo-favicons-bot' => 'search',
		'coccocbot-image' => 'search',
		'coccocbot-web' => 'search',
		'yandexbot' => 'search',
		'mj12bot' => 'search',
		'mail.ru_bot' => 'search',
		'exabot' => 'search',
		'uptimerobot' => 'monitor',
		'petalbot' => 'search',
		'twitterbot' => 'feed',
		'xbot' => 'feed',
		'discordbot' => 'feed',
		'sematextsyntheticsrobot' => 'monitor',
		'linkedinbot' => 'feed',
		'paperlibot' => 'feed',
		'bitlybot' => 'feed',
		'tineye-bot' => 'search',
		'pinterestbot' => 'feed',
		'webcrawler' => 'crawler',
		'webprosbot' => 'crawler',
		'guzzlehttp' => 'scraper',
		'telegrambot' => 'feed',
		'semrushbot' => 'crawler',
		'mediatoolkitbot' => 'crawler',
		'iploggerbot' => 'monitor',
		'baiduspider' => 'search',
		'baiduspider+' => 'search',
		'baiduspider-image+' => 'search',
		'baiduspider-ads' => 'ads',
		'haosouspider' => 'search',
		'yisouspider' => 'search',
		'360spider' => 'search',
		'sogou web spider' => 'search',
		'bytespider' => 'ai',
		'claudebot' => 'ai',
		'gptbot' => 'ai',
		'diffbot' => 'ai',
		'amazonbot' => 'ai',
		'applebot' => 'ai',
		'perplexitybot' => 'ai',
		'youbot' => 'ai',
		'iaskbot' => 'ai',
		'ccbot' => 'crawler',
		'wpbot' => 'ai',
		'imagesiftbot' => 'ai',
		'aihitbot' => 'ai',
		'andibot' => 'ai',
		'bedrockbot' => 'ai',
		'addsearchbot' => 'ai',
		'ai2bot' => 'ai',
		'google-cloudvertexbot' => 'ai',
		'duckassistbot' => 'ai',
		'echobot bot' => 'ai',
		'echoboxbot' => 'ai',
		'factset_spyderbot' => 'ai',
		'kangaroo bot' => 'ai',
		'linerbot' => 'ai',
		'mycentralaiscraperbot' => 'ai',
		'omgilibot' => 'crawler', // webz.io
		'webzio' => 'crawler',
		'pangubot' => 'ai', // huawei
		'phindbot' => 'ai',
		'qualifiedbot' => 'ai',
		'quillbot' => 'ai',
		'sbintuitionsbot' => 'ai',
		'sidetradebot' => 'ai',
		'thinkbot' => 'ai',
		'timpibot' => 'ai',
		'wardbot' => 'monitor'
	];

	/**
	 * Maps a lower-cased application name to its display name, for names that normaliseAppname() cannot derive.
	 *
	 * Keys MUST be lower case: getApp() looks names up after passing them through mb_strtolower().
	 *
	 * NOTE(review): the keys containing "[email protected]" look like e-mail addresses redacted by whatever
	 * produced this copy of the file - confirm against the canonical source before relying on them.
	 *
	 * @var array<string,string>
	 */
	private const APPS = [
		'googlebot' => 'Google Bot',
		'googlebot-mobile' => 'Google Bot',
		'googlebot-image' => 'Google Bot',
		'googlebot-video' => 'Google Bot',
		'googlebot-news' => 'Google Bot',
		'storebot-google' => 'Google Bot',
		'adsbot-google' => 'Google Bot',
		'google-adwords-instant' => 'Google Bot',
		'adsbot-google-mobile' => 'Google Bot',
		'mediapartners-google' => 'Google Bot',
		'google-safety' => 'Google Safety',
		'duckduckbot' => 'DuckDuck Bot',
		'duckduckbot-https' => 'DuckDuck Bot',
		'duckduckgo-favicons-bot' => 'DuckDuck Bot',
		'coccocbot-image' => 'Coccoc Bot',
		'coccocbot-web' => 'Coccoc Bot',
		'mj12bot' => 'Majestic 12 Bot',
		'exabot' => 'ExaBot',
		'twitterbot' => 'TwitterBot',
		'discordbot' => 'DiscordBot',
		'sematextsyntheticsrobot' => 'Sematext Synthetics Robot',
		'bitlybot' => 'Bit.ly Bot',
		'webprosbot' => 'WebprosBot',
		'mediatoolkitbot' => 'MediaToolkit Bot',
		'cfnetwork' => 'Apple Core Foundation Network',
		'ncsc web check [email protected]' => 'NCSC Web Check',
		'enhanced webcheck [email protected]' => 'NCSC Enhanced Web Check',
		'the national archives uk government web archive:' => 'UK Government National Archives',
		'google-inspectiontool' => 'Google Inspection Tool',
		'google-pagerenderer google' => 'Google Page Renderer',
		'pingdomtms' => 'Pingdom Bot',
		'facebookexternalhit' => 'Facebook URL Preview',
		'facebookcatalog' => 'Facebook',
		'meta-externalagent' => 'Meta External Agent',
		'meta-externalfetcher' => 'Meta External Fetcher',
		'phxbot' => 'ProtonMail Bot',
		'monitoring360bot' => 'Monitoring360 Bot',
		'cloudflare-healthchecks' => 'Cloudflare Health Checks',
		'cloudflare-alwaysonline' => 'Cloudflare Always Online',
		'cloudflare-traffic-manager' => 'Cloudflare-Traffic-Manager',
		'cloudflare-prefetch' => 'Cloudflare Prefetch',
		'cloudflare-ssldetector' => 'Cloudflare SSL Detector',
		'cloudflare-diagnostics' => 'Cloudflare Diagnostics',
		'ptst' => 'Cloudflare Speed Test',
		'citoid' => 'Wikimedia Citoid',
		'user-agent: seolyt' => 'SEOlyt',
		'bytespider' => 'ByteDance Spider',
		'[email protected]' => 'ByteDance Spider',
		'oai-searchbot' => 'OpenAI SearchBot',
		'semrushbot' => 'Semrush Bot',
		'semrushbot-si' => 'Semrush Bot',
		'semrushbot-ocob' => 'Semrush Bot',
		'semrushbot-swa' => 'Semrush Bot',
		'semrushbot-ba' => 'Semrush Bot',
		'siteauditbot' => 'Semrush Bot',
		'splitsignalbot' => 'Semrush Bot',
		'linkcheck by siteimprove.com' => 'SiteImprove Crawler',
		'sitecheck-sitecrawl by siteimprove.com' => 'SiteImprove Crawler',
		'image size by siteimprove.com' => 'SiteImprove Crawler',
		'probe by siteimprove.com' => 'SiteImprove Crawler',
		'by siteimprove.com' => 'SiteImprove Crawler',
		'magpie-crawler' => 'Brandwatch Magpie Crawler',
		'linkedinbot' => 'LinkedIn Bot',
		'dotbot' => 'Moz DotBot',
		'dataforseobot' => 'DataForSeo Bot',
		'wordpress' => 'WordPress',
		'prtg network monitor' => 'Paessler PRTG Bot',
		'prtgcloudbot' => 'Paessler PRTG Bot',
		'powershell' => 'PowerShell',
		'ccbot' => 'CommonCrawl Bot',
		'oncrawl' => 'OnCrawl Bot',
		'pycurl' => 'PycURL',
		'chatgpt-user' => 'ChatGPT User',
		'mail.ru_bot' => 'Mail.ru Bot',
		'wpbot' => 'Wpbot',
		'dnbcrawler-analytics' => 'DnB Crawler Analytics',
		'baiduspider-image+' => 'Baidu Spider',
		'baiduspider-render' => 'Baidu Spider',
		'baiduspider-ads' => 'Baidu Spider',
		'amazon-qbusiness' => 'Amazon Bot',
		'amazon cloudfront' => 'Amazon Bot',
		'amazonbot-video' => 'Amazon Bot',
		'hubspot crawler' => 'HubSpot Crawler',
		'wordpress.com mshots' => 'WordPress.com mShots',
		'wordpress.com' => 'WordPress',
		'p3p validator' => 'P3P Validator',
		'w3c-checklink' => 'W3C Checklink',
		'w3c_validator' => 'W3C Validator',
		'omgili' => 'Webz.io',
		'bluesky cardyb' => 'Bluesky'
	];

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on the first "/" into a name and a version. The version keeps only its leading
	 * run of digits and dots (after stripping any leading "v"). The display name and category come from
	 * the lookup tables where the lower-cased name is listed; otherwise the display name is derived with
	 * normaliseAppname() and the category falls back to $data['category'], then to a guess based on
	 * whether the token contains "crawl" or "bot".
	 *
	 * @param string $value The token to be processed
	 * @param array<string,string|null> $data An array containing existing data to merge, values here override the extracted type/app/appname/appversion
	 * @return array<string,string|null> The $data array with the processed application and version added, or an empty array when the token is not treated as an application
	 */
	public static function getApp(string $value, array $data = []) : array {

		// skip URLs (the bot will be in the URL), Chrome version tokens, and the literal tokens "Cubot" and "Power bot"
		if (\str_contains($value, '://') || \mb_stripos($value, 'Chrome/') === 0 || \strcasecmp('Cubot', $value) === 0 || \strcasecmp('Power bot', $value) === 0) {
			return [];
		}
		$parts = \explode('/', $value, 2);
		$name = $parts[0];

		// process version: drop a leading "v" and keep only the numeric prefix
		$version = null;
		if (!empty($parts[1])) {
			$digits = \ltrim($parts[1], 'v');
			$digits = \substr($digits, 0, \strspn($digits, '0123456789.'));
			$version = empty($digits) ? null : $digits;
		}

		$lower = \mb_strtolower($name);
		return \array_merge([
			'type' => 'robot',
			'app' => self::APPS[$lower] ?? self::normaliseAppname($name),
			'appname' => $name,
			'appversion' => $version
		], $data, [

			// a category from the lookup table wins over the one supplied by the caller
			'category' => self::CATEGORIES[$lower] ?? $data['category'] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper')
		]);
	}

	/**
	 * Converts a raw application name into a human readable display name
	 *
	 * Underscores and hyphens become spaces, plus signs are removed, camel-cased names are split into
	 * words, runs of single capital letters are kept together as an acronym, and the words "bot",
	 * "crawler" and "spider" are separated from whatever precedes them and capitalised.
	 *
	 * @param string $name The application name as it appeared in the token
	 * @return string The normalised display name
	 */
	public static function normaliseAppname(string $name) : string {

		// separators become spaces, and every capital letter gets a space in front so camel-case splits into words
		$upper = \range('A', 'Z');
		$find = \array_merge(['_', '-', '+'], $upper);
		$replace = \array_merge([' ', ' ', ''], \array_map(fn (string $letter) : string => ' '.$letter, $upper));
		$name = \trim(\str_replace($find, $replace, $name));

		// rebuild the name, gluing consecutive single letters back together so acronyms survive the split
		$output = '';
		$single = true;
		foreach (\explode(' ', $name) AS $key => $item) {
			if ($item !== '') {
				$currsingle = \mb_strlen($item) === 1;
				$output .= ($single && ($currsingle || $key === 1) ? '' : ' ').(!$currsingle ? \ucfirst($item) : $item);
				$single = $currsingle;
			}
		}

		// replace afterward for where it is preceded by an acronym; the replacements run in order, so the
		// double space that ' Bot' etc. can create is collapsed before "Ro Bot" is stitched back into "Robot"
		return \trim(\str_ireplace(['bot', 'crawler', 'spider', '  ', 'ro bot'], [' Bot', ' Crawler', ' Spider', ' ', 'Robot'], $output));
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * The order of the entries is preserved from the original list; the generic "bot"/"spider"/"crawler"
	 * patterns are deliberately last.
	 *
	 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
	 */
	public static function get() : array {

		// shared callbacks that run the token through getApp() with a default category
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'ai' => fn (string $value) : array => self::getApp($value, ['category' => 'ai']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => fn (string $value) : array => self::getApp($value, ['category' => 'crawler']),
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
			'map' => fn (string $value) : array => self::getApp($value)
		];
		return [
			'Mozlila/' => new props('start', [
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'Moblie' => new props('exact', [ // some samsung devices mispelt it
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'HeadlessChrome/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'browser' => 'HeadlessChrome',
				'browserversion' => \mb_substr($value, 15)
			]),
			'Yahoo! Slurp' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'search',
				'app' => 'Yahoo! Slurp',
				'appname' => $value
			]),
			'Google-Site-Verification/' => new props('start', $fn['validator']),
			'Google-InspectionTool/' => new props('start', $fn['validator']),
			'Google-Safety' => new props('exact', $fn['validator']),
			'Google-Read-Aloud' => new props('exact', $fn['feed']),
			'Google AppsViewer' => new props('exact', $fn['feed']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'Google-PageRenderer' => new props('start', $fn['crawler']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'Google-Adwords-Instant' => new props('exact', $fn['ads']),
			'Gemini-Deep-Research' => new props('exact', $fn['ai']),
			'GoogleAgent-Mariner' => new props('exact', $fn['ai']),
			'CFNetwork/' => new props('start', $fn['feed']),
			'Siteimprove.com' => new props('any', fn (string $value) : array => \array_merge([
				'url' => 'https://siteimprove.com'
			], $fn['crawler']($value))),
			'SEOlyt/' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'scrapy' => new props('start', $fn['scraper']),
			'Yandex' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Yandex Bot',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Qwant Web Crawler',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'amazon-kendra' => new props('start', fn () : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Amazon Bot',
				'appname' => 'Amazon Kendra'
			]),
			'amazon-QBusiness' => new props('exact', $fn['ai']),
			'amazon CloudFront' => new props('exact', $fn['validator']),
			'Amazonbot-Video/' => new props('start', $fn['crawler']),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'grpc-python/' => new props('start', $fn['scraper']),
			'LWP::Simple/' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'monitoring360bot' => new props('start', $fn['monitor']),
			'Cloudflare' => new props('start', $fn['validator']),
			'PTST/' => new props('start', $fn['validator']),
			'+https://developers.cloudflare.com/security-center/' => new props('exact', $fn['monitor']),
			'AppSignalBot/' => new props('start', $fn['monitor']),
			'Better Uptime Bot' => new props('start', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Better Uptime Bot',
				'appname' => 'Better Uptime Bot'
			]),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'Microsoft Profiling/' => new props('any', $fn['validator']),
			'Bidtellect' => new props('start', $fn['crawler']),
			'magpie-crawler/' => new props('start', $fn['crawler']),
			'Web Measure/' => new props('start', $fn['crawler']),
			'Bluesky Cardyb/' => new props('start', $fn['crawler']),
			'PingdomTMS/' => new props('start', $fn['monitor']),
			'DynGate' => new props('exact', $fn['monitor']),
			'CensysInspect/' => new props('start', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),

			// NOTE(review): "[email protected]" looks like a redacted e-mail address - confirm against the canonical source
			'NCSC Web Check [email protected]' => new props('exact', $fn['monitor']),
			'Enhanced WebCheck [email protected]' => new props('exact', $fn['monitor']),
			'Pingdom.com' => new props('start', function (string $value) : array {
				$name = \trim($value, '_');
				$version = \explode('_', $name);
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom Bot',
					'appname' => $name,
					'appversion' => \end($version) // the version is the last underscore-separated segment
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['feed']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'AWS Network Health' => new props('start', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'YahooMailProxy' => new props('exact', $fn['feed']),
			'PhxBot/' => new props('start', $fn['feed']), // proton mail
			'Embedly/' => new props('start', $fn['feed']),
			'PayPal IPN' => new props('exact', $fn['feed']),
			'DropboxPreviewBot/' => new props('start', $fn['feed']),
			'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Mastodon',
				'appname' => 'Pleroma',
				'appversion' => \mb_substr($value, 8)
			]),
			'Outlook-Android/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-Android',
				'platform' => 'Android',
				'appversion' => \mb_substr($value, 16)
			]),
			'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-iOS',
				'platform' => 'iOS',
				'appversion' => $tokens[$i + 1] ?? \mb_substr($value, 12)
			]),
			'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'OutlookMobileCloudService-Autodetect',
				'appversion' => \mb_substr($value, 37)
			]),
			'HubSpot Connect ' => new props('start', function (string $value, int $i, array $tokens) : array {
				$app = 'HubSpot Connect';
				$count = \count($tokens);
				for ($n = $i; $n < $count; $n++) {
					if (\str_starts_with($tokens[$n], 'namespace: ')) {

						// the token after the namespace is appended when there is one, it may be the last token
						$app = \mb_substr($tokens[$n], 11).(isset($tokens[$n + 1]) ? ' - '.$tokens[$n + 1] : '');
						break;
					}
				}
				return [
					'type' => 'robot',
					'category' => 'feed',
					'app' => 'HubSpot Connect',
					'appname' => $app,
					'appversion' => \mb_substr($value, 16) ?: null
				];
			}),
			'TikTokSpider' => new props('start', $fn['feed']),
			'Pro-Sitemaps/' => new props('start', $fn['crawler']),
			'Pandalytics/' => new props('start', $fn['crawler']),
			'omgili/' => new props('start', $fn['crawler']),
			'AwarioBot/' => new props('start', $fn['crawler']),
			'AwarioSmartBot/' => new props('start', $fn['crawler']),
			'AwarioRssBot/' => new props('start', $fn['crawler']),
			'ICC-Crawler/' => new props('start', $fn['crawler']),
			'The National Archives UK Government Web Archive' => new props('start', $fn['crawler']),
			'Citoid' => new props('exact', $fn['crawler']),
			'trendictionbot' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Trendicion Bot', // NOTE(review): possibly a typo for "Trendiction Bot" - left unchanged as callers may depend on it
				'appname' => 'trendictionbot',
				'appversion' => \mb_substr($value, 14) ?: null
			]),
			'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
			'ViberUrlDownloader' => new props('exact', $fn['feed']),
			'GoogleDocs' => new props('exact', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Google Docs',
				'appname' => $value.(isset($tokens[$i + 1]) ? '; '.$tokens[$i + 1] : '') // the following token is optional
			]),
			'Google-Lens' => new props('exact', $fn['feed']),
			'ManicTime/' => new props('start', $fn['feed']),
			'Yik Yak/' => new props('start', $fn['feed']),
			'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
			'AppleExchangeWebServices/' => new props('start', $fn['feed']),
			'The Lounge IRC Client' => new props('exact', $fn['feed']),
			'W3C-checklink/' => new props('start', $fn['validator']),
			'CSSCheck/' => new props('start', $fn['validator']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			'SEO-Macroscope/' => new props('start', $fn['validator']),
			'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
			'Barracuda Sentinel' => new props('start', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'Snapchat/' => new props('start', $fn['feed']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'URL Preview' => new props('any', $fn['feed']),
			'Link Preview' => new props('any', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Wheregoes.com Redirect Checker/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('any', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'scraper',
				'app' => 'Java',
				'appname' => $value,
				'appversion' => \explode('/', $value, 3)[1] ?? null
			]),
			'curl/' => new props('any', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'Datadog' => new props('start', $fn['scraper']),
			// 'libwww-perl/' => new props('start', $fn['scraper']),
			'http/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'http-client/' => new props('any', $fn['scraper']),
			'HttpClient/' => new props('any', $fn['scraper']),
			'PowerShell/' => new props('start', $fn['scraper']),
			'node-fetch' => new props('exact', $fn['scraper']),
			'OAI-SearchBot/' => new props('start', $fn['search']),
			'iaskspider/' => new props('start', $fn['search']),
			'MeltwaterNews' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Meltwater News',
				'appname' => 'MeltwaterNews',
				'url' => \mb_substr($value, 14) ?: null
			]),
			'Google-Extended' => new props('start', $fn['ai']),
			'ChatGPT-User/' => new props('start', $fn['feed']),
			'Cohere' => new props('start', $fn['ai']),
			'facebookexternalhit/' => new props('start', $fn['feed']),
			'facebookcatalog/' => new props('start', $fn['crawler']),
			'meta-externalagent' => new props('start', $fn['ai']),
			'meta-externalfetcher' => new props('start', $fn['feed']),
			'BrightBot ' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ai',
				'app' => 'Bright Bot',
				'appname' => 'BrightBot',
				'appversion' => \mb_substr($value, 10) ?: null
			]),
			'anthropic-ai' => new props('start', $fn['ai']),
			'bigsur.ai' => new props('start', $fn['ai']),
			'Claude User' => new props('start', $fn['ai']),
			'Claude Web' => new props('start', $fn['ai']),
			'cohere-ai' => new props('start', $fn['ai']),
			'cohere-training-data-crawler' => new props('start', $fn['ai']),
			'Cotoyogi' => new props('start', $fn['ai']),
			'Crawlspace' => new props('start', $fn['ai']),
			'Datenbank Crawler' => new props('start', $fn['ai']),
			'Devin' => new props('start', $fn['ai']),
			'FirecrawlAgent' => new props('start', $fn['ai']),
			'FriendlyCrawler' => new props('start', $fn['ai']),
			'MistralAI-User' => new props('start', $fn['ai']),
			'NovaAct' => new props('start', $fn['ai']), // amazon
			'Panscient' => new props('start', $fn['ai']),
			'pantest' => new props('start', $fn['ai']),
			'Perplexity' => new props('start', $fn['ai']),
			'VelenPublicWebCrawler' => new props('start', $fn['ai']),

			// generic catch-all patterns
			'Validator' => new props('any', $fn['validator']),
			'feed' => new props('any', $fn['feed']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map'])
		];
	}
}