hexydec /
agentzero
| 1 | <?php |
||
| 2 | declare(strict_types = 1); |
||
| 3 | namespace hexydec\agentzero; |
||
| 4 | |||
/**
 * Crawler / robot detection rules.
 *
 * Provides the helpers that turn a single user-agent token into application
 * metadata (getApp(), normaliseAppname()), and the rule table (get()) that maps
 * match strings to `props` objects describing which properties to set.
 */
class crawlers {

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on the first "/" into a name and a version. The version has any leading
	 * "v" characters removed and is then cut at the first character that is not a digit or a dot.
	 * Tokens that contain "://", start with "Chrome/", or are exactly "Cubot" or "Power bot"
	 * (case-insensitive) are not treated as robots and produce an empty array.
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge, its values override the generated type/app/appname/appversion
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array if the token is skipped
	 */
	public static function getApp(string $value, array $data = []) : array {

		// a token containing a URL carries the bot name inside the URL, and the other exclusions are not robots
		if (\str_contains($value, '://') || \mb_stripos($value, 'Chrome/') === 0 || \strcasecmp('Cubot', $value) === 0 || \strcasecmp('Power bot', $value) === 0) {
			return [];
		}
		$parts = \explode('/', $value, 2);

		// process version
		if (!empty($parts[1])) {
			$parts[1] = \ltrim($parts[1], 'v');
			$parts[1] = \substr($parts[1], 0, \strspn($parts[1], '0123456789.'));
		}

		// lowercased app name => category, keys MUST be lowercase as they are looked up with mb_strtolower()
		$category = [
			'yacybot' => 'search',
			'googlebot' => 'search',
			'googlebot-mobile' => 'search',
			'googlebot-image' => 'search',
			'googlebot-video' => 'search',
			'googlebot-news' => 'search',
			'storebot-google' => 'search',
			'adsbot-google' => 'ads',
			'adsbot-google-mobile' => 'ads',
			'mediapartners-google' => 'ads',
			'bingbot' => 'search',
			'adidxbot' => 'ads',
			'duckduckbot' => 'search',
			'duckduckgo-favicons-bot' => 'search',
			'coccocbot-image' => 'search',
			'coccocbot-web' => 'search',
			'yandexbot' => 'search',
			'mj12bot' => 'search',
			'mail.ru_bot' => 'search',
			'exabot' => 'search',
			'uptimerobot' => 'monitor',
			'petalbot' => 'search',
			'twitterbot' => 'feed',
			'xbot' => 'feed',
			'discordbot' => 'feed',
			'sematextsyntheticsrobot' => 'monitor',
			'linkedinbot' => 'feed',
			'paperlibot' => 'feed',
			'bitlybot' => 'feed',
			'tineye-bot' => 'search',
			'pinterestbot' => 'feed',
			'webcrawler' => 'crawler',
			'webprosbot' => 'crawler',
			'guzzlehttp' => 'scraper',
			'telegrambot' => 'feed',
			'semrushbot' => 'crawler',
			'mediatoolkitbot' => 'crawler',
			'iploggerbot' => 'monitor',
			'baiduspider' => 'search',
			'baiduspider+' => 'search',
			'baiduspider-image+' => 'search',
			'baiduspider-ads' => 'ads',
			'haosouspider' => 'search',
			'yisouspider' => 'search',
			'360spider' => 'search',
			'sogou web spider' => 'search',
			'bytespider' => 'ai',
			'claudebot' => 'ai',
			'gptbot' => 'ai',
			'diffbot' => 'ai',
			'amazonbot' => 'ai',
			'applebot' => 'ai',
			'perplexitybot' => 'ai',
			'youbot' => 'ai',
			'iaskbot' => 'ai',
			'ccbot' => 'crawler',
			'wpbot' => 'ai',
			'imagesiftbot' => 'ai',
			'aihitbot' => 'ai',
			'andibot' => 'ai',
			'bedrockbot' => 'ai',
			'addsearchbot' => 'ai',
			'ai2bot' => 'ai',
			'google-cloudvertexbot' => 'ai',
			'duckassistbot' => 'ai',
			'echobot bot' => 'ai',
			'echoboxbot' => 'ai',
			'factset_spyderbot' => 'ai',
			'kangaroo bot' => 'ai',
			'linerbot' => 'ai',
			'mycentralaiscraperbot' => 'ai',
			'omgilibot' => 'crawler', // webz.io
			'webzio' => 'crawler',
			'pangubot' => 'ai', // huawei
			'phindbot' => 'ai',
			'qualifiedbot' => 'ai',
			'quillbot' => 'ai',
			'sbintuitionsbot' => 'ai',
			'sidetradebot' => 'ai',
			'thinkbot' => 'ai',
			'timpibot' => 'ai',
			'wardbot' => 'monitor'
		];

		// lowercased app name => display name, anything not listed is generated by normaliseAppname()
		// NOTE(review): the keys containing "[email protected]" look like e-mail obfuscation residue - confirm against the upstream file
		$apps = [
			'googlebot' => 'Google Bot',
			'googlebot-mobile' => 'Google Bot',
			'googlebot-image' => 'Google Bot',
			'googlebot-video' => 'Google Bot',
			'googlebot-news' => 'Google Bot',
			'storebot-google' => 'Google Bot',
			'adsbot-google' => 'Google Bot',
			'google-adwords-instant' => 'Google Bot',
			'adsbot-google-mobile' => 'Google Bot',
			'mediapartners-google' => 'Google Bot',
			'google-safety' => 'Google Safety',
			'duckduckbot' => 'DuckDuck Bot',
			'duckduckbot-https' => 'DuckDuck Bot',
			'duckduckgo-favicons-bot' => 'DuckDuck Bot',
			'coccocbot-image' => 'Coccoc Bot',
			'coccocbot-web' => 'Coccoc Bot',
			'mj12bot' => 'Majestic 12 Bot',
			'exabot' => 'ExaBot',
			'twitterbot' => 'TwitterBot',
			'discordbot' => 'DiscordBot',
			'sematextsyntheticsrobot' => 'Sematext Synthetics Robot',
			'bitlybot' => 'Bit.ly Bot',
			'webprosbot' => 'WebprosBot',
			'mediatoolkitbot' => 'MediaToolkit Bot',
			'cfnetwork' => 'Apple Core Foundation Network',
			'ncsc web check [email protected]' => 'NCSC Web Check',
			'enhanced webcheck [email protected]' => 'NCSC Enhanced Web Check',
			'the national archives uk government web archive:' => 'UK Government National Archives',
			'google-inspectiontool' => 'Google Inspection Tool',
			'google-pagerenderer google' => 'Google Page Renderer',
			'pingdomtms' => 'Pingdom Bot',
			'facebookexternalhit' => 'Facebook URL Preview',
			'facebookcatalog' => 'Facebook',
			'meta-externalagent' => 'Meta External Agent',
			'meta-externalfetcher' => 'Meta External Fetcher',
			'phxbot' => 'ProtonMail Bot',
			'monitoring360bot' => 'Monitoring360 Bot',
			'cloudflare-healthchecks' => 'Cloudflare Health Checks',
			'cloudflare-alwaysonline' => 'Cloudflare Always Online',
			'cloudflare-traffic-manager' => 'Cloudflare-Traffic-Manager',
			'cloudflare-prefetch' => 'Cloudflare Prefetch',
			'cloudflare-ssldetector' => 'Cloudflare SSL Detector',
			'cloudflare-diagnostics' => 'Cloudflare Diagnostics',
			'ptst' => 'Cloudflare Speed Test',
			'citoid' => 'Wikimedia Citoid',
			'user-agent: seolyt' => 'SEOlyt',
			'bytespider' => 'ByteDance Spider',
			'[email protected]' => 'ByteDance Spider',
			'oai-searchbot' => 'OpenAI SearchBot',
			'semrushbot' => 'Semrush Bot',
			'semrushbot-si' => 'Semrush Bot',
			'semrushbot-ocob' => 'Semrush Bot',
			'semrushbot-swa' => 'Semrush Bot',
			'semrushbot-ba' => 'Semrush Bot',
			'siteauditbot' => 'Semrush Bot',
			'splitsignalbot' => 'Semrush Bot',
			'linkcheck by siteimprove.com' => 'SiteImprove Crawler',
			'sitecheck-sitecrawl by siteimprove.com' => 'SiteImprove Crawler',
			'image size by siteimprove.com' => 'SiteImprove Crawler',
			'probe by siteimprove.com' => 'SiteImprove Crawler',
			'by siteimprove.com' => 'SiteImprove Crawler',
			'magpie-crawler' => 'Brandwatch Magpie Crawler',
			'linkedinbot' => 'LinkedIn Bot',
			'dotbot' => 'Moz DotBot',
			'dataforseobot' => 'DataForSeo Bot',
			'wordpress' => 'WordPress',
			'prtg network monitor' => 'Paessler PRTG Bot',
			'prtgcloudbot' => 'Paessler PRTG Bot',
			'powershell' => 'PowerShell',
			'ccbot' => 'CommonCrawl Bot',
			'oncrawl' => 'OnCrawl Bot',
			'pycurl' => 'PycURL',
			'chatgpt-user' => 'ChatGPT User',
			'mail.ru_bot' => 'Mail.ru Bot',
			'wpbot' => 'Wpbot',
			'dnbcrawler-analytics' => 'DnB Crawler Analytics',
			'baiduspider-image+' => 'Baidu Spider',
			'baiduspider-render' => 'Baidu Spider',
			'baiduspider-ads' => 'Baidu Spider',
			'amazon-qbusiness' => 'Amazon Bot',
			'amazon cloudfront' => 'Amazon Bot',
			'amazonbot-video' => 'Amazon Bot',
			'hubspot crawler' => 'HubSpot Crawler',
			'wordpress.com mshots' => 'WordPress.com mShots',
			'wordpress.com' => 'WordPress',
			'p3p validator' => 'P3P Validator',
			'w3c-checklink' => 'W3C Checklink',
			'w3c_validator' => 'W3C Validator',
			'omgili' => 'Webz.io',
			'bluesky cardyb' => 'Bluesky'
		];

		$lower = \mb_strtolower($parts[0]);

		// merge order: generated defaults, then caller data, then the resolved category which always wins
		return \array_merge([
			'type' => 'robot',
			'app' => $apps[$lower] ?? self::normaliseAppname($parts[0]),
			'appname' => $parts[0],
			'appversion' => empty($parts[1]) ? null : $parts[1]
		], $data, [
			// known bots use the table, otherwise the caller's category, otherwise guess from the token text
			'category' => $category[$lower] ?? $data['category'] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper')
		]);
	}

	/**
	 * Generates a human readable application name from a raw token name
	 *
	 * Underscores and hyphens become spaces, "+" is removed, and a space is inserted before every
	 * capital letter to split camel case. Runs of single letters are joined back together so that
	 * acronyms stay intact, and "bot", "crawler" and "spider" are split off as separate capitalised words.
	 *
	 * @param string $name The raw application name, e.g. "GPTBot" or "my-crawler"
	 * @return string The normalised name, e.g. "GPT Bot" or "My Crawler"
	 */
	public static function normaliseAppname(string $name) : string {
		$upper = \range('A', 'Z');
		$find = \array_merge(['_', '-', '+'], $upper);
		$replace = \array_merge([' ', ' ', ''], \array_map(fn (string $char) : string => ' '.$char, $upper));
		$name = \trim(\str_replace($find, $replace, $name));

		// rebuild the name word by word, gluing consecutive single characters together
		$output = '';
		$single = true; // whether the previous word was a single character, starts true so leading letters join up
		foreach (\explode(' ', $name) AS $key => $item) {
			if ($item !== '') {
				$currsingle = \mb_strlen($item) === 1;

				// no separator after a single character when this is also a single character, or when this is the second chunk (e.g. "e Cairn" => "eCairn")
				$output .= ($single && ($currsingle || $key === 1) ? '' : ' ').(!$currsingle ? \ucfirst($item) : $item);
				$single = $currsingle;
			}
		}

		// replace afterward for where it is preceded by an ACRONYM, the replacements run in order:
		// split off the keywords, collapse the double space that creates, then repair "Ro Bot" back to "Robot"
		return \trim(\str_ireplace(['bot', 'crawler', 'spider', '  ', 'ro bot'], [' Bot', ' Crawler', ' Spider', ' ', 'Robot'], $output));
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * The order of the rules is preserved as written: the specific rules come first, and the
	 * generic "bot" / "spider" / "crawler" catch-alls come last.
	 * NOTE(review): the meaning of the 'start' / 'exact' / 'any' / 'end' modes and the ($value, $i, $tokens)
	 * callback arguments is defined by the props class, which is not visible here - $i looks like the index
	 * of the matched token and $tokens the full token list.
	 *
	 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
	 */
	public static function get() : array {

		// reusable callbacks that run getApp() with a default category, 'map' supplies no default
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'ai' => fn (string $value) : array => self::getApp($value, ['category' => 'ai']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => fn (string $value) : array => self::getApp($value, ['category' => 'crawler']),
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
			'map' => fn (string $value) : array => self::getApp($value)
		];
		return [
			'Mozlila/' => new props('start', [ // misspelling of "Mozilla/"
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'Moblie' => new props('exact', [ // some samsung devices misspelt it
				'type' => 'robot',
				'category' => 'scraper'
			]),
			'HeadlessChrome/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'browser' => 'HeadlessChrome',
				'browserversion' => \mb_substr($value, 15)
			]),
			'Yahoo! Slurp' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'search',
				'app' => 'Yahoo! Slurp',
				'appname' => $value
			]),
			'Google-Site-Verification/' => new props('start', $fn['validator']),
			'Google-InspectionTool/' => new props('start', $fn['validator']),
			'Google-Safety' => new props('exact', $fn['validator']),
			'Google-Read-Aloud' => new props('exact', $fn['feed']),
			'Google AppsViewer' => new props('exact', $fn['feed']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'Google-PageRenderer' => new props('start', $fn['crawler']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'Google-Adwords-Instant' => new props('exact', $fn['ads']),
			'Gemini-Deep-Research' => new props('exact', $fn['ai']),
			'GoogleAgent-Mariner' => new props('exact', $fn['ai']),
			'CFNetwork/' => new props('start', $fn['feed']),
			'Siteimprove.com' => new props('any', fn (string $value) : array => \array_merge([
				'url' => 'https://siteimprove.com'
			], $fn['crawler']($value))),
			'SEOlyt/' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'scrapy' => new props('start', $fn['scraper']),
			'Yandex' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Yandex Bot',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Qwant Web Crawler',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'amazon-kendra' => new props('start', fn () : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Amazon Bot',
				'appname' => 'Amazon Kendra'
			]),
			'amazon-QBusiness' => new props('exact', $fn['ai']),
			'amazon CloudFront' => new props('exact', $fn['validator']),
			'Amazonbot-Video/' => new props('start', $fn['crawler']),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'grpc-python/' => new props('start', $fn['scraper']),
			'LWP::Simple/' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'monitoring360bot' => new props('start', $fn['monitor']),
			'Cloudflare' => new props('start', $fn['validator']),
			'PTST/' => new props('start', $fn['validator']),
			'+https://developers.cloudflare.com/security-center/' => new props('exact', $fn['monitor']),
			'AppSignalBot/' => new props('start', $fn['monitor']),
			'Better Uptime Bot' => new props('start', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Better Uptime Bot',
				'appname' => 'Better Uptime Bot'
			]),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'Microsoft Profiling/' => new props('any', $fn['validator']),
			'Bidtellect' => new props('start', $fn['crawler']),
			'magpie-crawler/' => new props('start', $fn['crawler']),
			'Web Measure/' => new props('start', $fn['crawler']),
			'Bluesky Cardyb/' => new props('start', $fn['crawler']),
			'PingdomTMS/' => new props('start', $fn['monitor']),
			'DynGate' => new props('exact', $fn['monitor']),
			'CensysInspect/' => new props('start', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),
			// NOTE(review): "[email protected]" looks like e-mail obfuscation residue - confirm against the upstream file
			'NCSC Web Check [email protected]' => new props('exact', $fn['monitor']),
			'Enhanced WebCheck [email protected]' => new props('exact', $fn['monitor']),
			'Pingdom.com' => new props('start', function (string $value) : array {
				$name = \trim($value, '_');
				$version = \explode('_', $name);
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom Bot',
					'appname' => $name,
					'appversion' => \end($version) // the version is the last underscore separated segment
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['feed']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'AWS Network Health' => new props('start', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'YahooMailProxy' => new props('exact', $fn['feed']),
			'PhxBot/' => new props('start', $fn['feed']), // proton mail
			'Embedly/' => new props('start', $fn['feed']),
			'PayPal IPN' => new props('exact', $fn['feed']),
			'DropboxPreviewBot/' => new props('start', $fn['feed']),
			'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Mastodon',
				'appname' => 'Pleroma',
				'appversion' => \mb_substr($value, 8)
			]),
			'Outlook-Android/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-Android',
				'platform' => 'Android',
				'appversion' => \mb_substr($value, 16)
			]),
			'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-iOS',
				'platform' => 'iOS',
				'appversion' => $tokens[$i + 1] ?? \mb_substr($value, 12)
			]),
			'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'OutlookMobileCloudService-Autodetect',
				'appversion' => \mb_substr($value, 37)
			]),
			'HubSpot Connect ' => new props('start', function (string $value, int $i, array $tokens) : array {
				$app = 'HubSpot Connect';
				$count = \count($tokens);

				// look from the matched token onward for a "namespace: xxx" token to build a more specific name
				for ($n = $i; $n < $count; $n++) {
					if (\str_starts_with($tokens[$n], 'namespace: ')) {

						// the token after the namespace is appended when there is one, it may be the last token
						$app = \mb_substr($tokens[$n], 11).(isset($tokens[$n + 1]) ? ' - '.$tokens[$n + 1] : '');
						break;
					}
				}
				return [
					'type' => 'robot',
					'category' => 'feed',
					'app' => 'HubSpot Connect',
					'appname' => $app,
					'appversion' => \mb_substr($value, 16) ?: null
				];
			}),
			'TikTokSpider' => new props('start', $fn['feed']),
			'Pro-Sitemaps/' => new props('start', $fn['crawler']),
			'Pandalytics/' => new props('start', $fn['crawler']),
			'omgili/' => new props('start', $fn['crawler']),
			'AwarioBot/' => new props('start', $fn['crawler']),
			'AwarioSmartBot/' => new props('start', $fn['crawler']),
			'AwarioRssBot/' => new props('start', $fn['crawler']),
			'ICC-Crawler/' => new props('start', $fn['crawler']),
			'The National Archives UK Government Web Archive' => new props('start', $fn['crawler']),
			'Citoid' => new props('exact', $fn['crawler']),
			'trendictionbot' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Trendicion Bot', // NOTE(review): possibly meant to be "Trendiction Bot" - confirm before changing the output
				'appname' => 'trendictionbot',
				'appversion' => \mb_substr($value, 14) ?: null
			]),
			'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
			'ViberUrlDownloader' => new props('exact', $fn['feed']),
			'GoogleDocs' => new props('exact', fn (string $value, int $i, array $tokens) : array => [
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Google Docs',
				'appname' => isset($tokens[$i + 1]) ? $value.'; '.$tokens[$i + 1] : $value // the next token is appended when there is one
			]),
			'Google-Lens' => new props('exact', $fn['feed']),
			'ManicTime/' => new props('start', $fn['feed']),
			'Yik Yak/' => new props('start', $fn['feed']),
			'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
			'AppleExchangeWebServices/' => new props('start', $fn['feed']),
			'The Lounge IRC Client' => new props('exact', $fn['feed']),
			'W3C-checklink/' => new props('start', $fn['validator']),
			'CSSCheck/' => new props('start', $fn['validator']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			'SEO-Macroscope/' => new props('start', $fn['validator']),
			'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
			'Barracuda Sentinel' => new props('start', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'Snapchat/' => new props('start', $fn['feed']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'URL Preview' => new props('any', $fn['feed']),
			'Link Preview' => new props('any', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Wheregoes.com Redirect Checker/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('any', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'scraper',
				'app' => 'Java',
				'appname' => $value,
				'appversion' => \explode('/', $value, 3)[1] ?? null
			]),
			'curl/' => new props('any', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'Datadog' => new props('start', $fn['scraper']),
			// 'libwww-perl/' => new props('start', $fn['scraper']),
			'http/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'http-client/' => new props('any', $fn['scraper']),
			'HttpClient/' => new props('any', $fn['scraper']),
			'PowerShell/' => new props('start', $fn['scraper']),
			'node-fetch' => new props('exact', $fn['scraper']),
			'OAI-SearchBot/' => new props('start', $fn['search']),
			'iaskspider/' => new props('start', $fn['search']),
			'MeltwaterNews' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'crawler',
				'app' => 'Meltwater News',
				'appname' => 'MeltwaterNews',
				'url' => \mb_substr($value, 14) ?: null
			]),
			'Google-Extended' => new props('start', $fn['ai']),
			'ChatGPT-User/' => new props('start', $fn['feed']),
			'Cohere' => new props('start', $fn['ai']),
			'facebookexternalhit/' => new props('start', $fn['feed']),
			'facebookcatalog/' => new props('start', $fn['crawler']),
			'meta-externalagent' => new props('start', $fn['ai']),
			'meta-externalfetcher' => new props('start', $fn['feed']),
			'BrightBot ' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ai',
				'app' => 'Bright Bot',
				'appname' => 'BrightBot',
				'appversion' => \mb_substr($value, 10) ?: null
			]),
			'anthropic-ai' => new props('start', $fn['ai']),
			'bigsur.ai' => new props('start', $fn['ai']),
			'Claude User' => new props('start', $fn['ai']),
			'Claude Web' => new props('start', $fn['ai']),
			'cohere-ai' => new props('start', $fn['ai']),
			'cohere-training-data-crawler' => new props('start', $fn['ai']),
			'Cotoyogi' => new props('start', $fn['ai']),
			'Crawlspace' => new props('start', $fn['ai']),
			'Datenbank Crawler' => new props('start', $fn['ai']),
			'Devin' => new props('start', $fn['ai']),
			'FirecrawlAgent' => new props('start', $fn['ai']),
			'FriendlyCrawler' => new props('start', $fn['ai']),
			'MistralAI-User' => new props('start', $fn['ai']),
			'NovaAct' => new props('start', $fn['ai']), // amazon
			'Panscient' => new props('start', $fn['ai']),
			'pantest' => new props('start', $fn['ai']),
			'Perplexity' => new props('start', $fn['ai']),
			'VelenPublicWebCrawler' => new props('start', $fn['ai']),

			// generic catch-all rules, kept last so the specific rules above come first
			'Validator' => new props('any', $fn['validator']),
			'feed' => new props('any', $fn['feed']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map']),
		];
	}
}