fisharebest /
webtrees
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * webtrees: online genealogy |
||
| 5 | * Copyright (C) 2025 webtrees development team |
||
| 6 | * This program is free software: you can redistribute it and/or modify |
||
| 7 | * it under the terms of the GNU General Public License as published by |
||
| 8 | * the Free Software Foundation, either version 3 of the License, or |
||
| 9 | * (at your option) any later version. |
||
| 10 | * This program is distributed in the hope that it will be useful, |
||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 13 | * GNU General Public License for more details. |
||
| 14 | * You should have received a copy of the GNU General Public License |
||
| 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
||
| 16 | */ |
||
| 17 | |||
| 18 | declare(strict_types=1); |
||
| 19 | |||
| 20 | namespace Fisharebest\Webtrees\Http\Middleware; |
||
| 21 | |||
| 22 | use Fig\Http\Message\StatusCodeInterface; |
||
| 23 | use Fisharebest\Webtrees\Registry; |
||
| 24 | use Fisharebest\Webtrees\Services\NetworkService; |
||
| 25 | use Fisharebest\Webtrees\Validator; |
||
| 26 | use IPLib\Address\AddressInterface; |
||
| 27 | use IPLib\Factory; |
||
| 28 | use IPLib\Range\RangeInterface; |
||
| 29 | use Psr\Http\Message\ResponseInterface; |
||
| 30 | use Psr\Http\Message\ServerRequestInterface; |
||
| 31 | use Psr\Http\Server\MiddlewareInterface; |
||
| 32 | use Psr\Http\Server\RequestHandlerInterface; |
||
| 33 | |||
| 34 | use function array_filter; |
||
| 35 | use function array_map; |
||
| 36 | use function assert; |
||
| 37 | use function count; |
||
| 38 | use function gethostbyaddr; |
||
| 39 | use function gethostbyname; |
||
| 40 | use function preg_match_all; |
||
| 41 | use function random_int; |
||
| 42 | use function response; |
||
| 43 | use function str_contains; |
||
| 44 | use function str_ends_with; |
||
| 45 | |||
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is answered with HTTP 406 (and never reaches the next handler) when:
 *  - it has no user-agent, or the user-agent contains one of BAD_ROBOTS;
 *  - it claims to be a well-known crawler, but its address fails the DNS/ASN check for that crawler;
 *  - its address belongs to an autonomous system listed in the "block_asn" request attribute;
 *  - it looks like a browser but sent no cookies (it gets a cookie + refresh challenge);
 *  - its path looks like a WordPress probe.
 *
 * Verified crawlers, and clients that merely look like robots, are passed on with the
 * ROBOT_ATTRIBUTE_NAME request attribute set to true.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Name of the request attribute that is set to true for verified or suspected robots.
    public const string ROBOT_ATTRIBUTE_NAME = 'is-a-robot';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const int WHOIS_TTL_MIN = 28 * 86400;
    private const int WHOIS_TTL_MAX = 35 * 86400;

    // An opinionated list of "bad" robots.  Typically, these are AI and SEO crawlers.
    // Entries are matched as case-sensitive substrings of the user-agent.
    public const array BAD_ROBOTS = [
        'ADmantX',
        'AI2Bot',
        'Adsbot',
        'AISearchBot',
        'AhrefsBot',
        'Ai2Bot-Dolma',
        'AliyunSecBot',
        'Amazonbot',
        'Andibot',
        'AntBot',
        'Applebot',
        'AspiegelBot',
        'Awario',
        'BLEXBot',
        'Barkrowler',
        'Brightbot',
        'Bytespider',
        'CCBot',
        'CensysInspect',
        'ChatGPT-User',
        'Claude-SearchBot',
        'Claude-User',
        'Claude-Web',
        'ClaudeBot',
        'Cotoyogi',
        'Crawlspace',
        'DataForSeoBot',
        'Datenbank Crawler',
        'Devin',
        'Diffbot',
        'DotBot',
        'DuckAssistBot',
        'Echobot Bot',
        'EchoboxBot',
        'Expanse',
        'FacebookBot',
        'Factset_spyderbot',
        'FirecrawlAgent',
        'Foregenix',
        'FriendlyCrawler',
        'GPTBot',
        'Gemini-Deep-Research',
        'Go-http-client',
        'Google-CloudVertexBot',
        'Google-Extended',
        'GoogleAgent-Mariner',
        'GoogleOther',
        'Grapeshot',
        'Honolulu-bot',
        'ICC-Crawler',
        'ISSCyberRiskCrawler',
        'ImagesiftBot',
        'IonCrawl',
        'Java',
        'Kangaroo Bot',
        'Linguee',
        'MJ12bot',
        'MegaIndex.ru',
        'Meta-ExternalAgent',
        'Meta-ExternalFetcher',
        'MistralAI-User',
        'MyCentralAIScraperBot',
        'NovaAct',
        'OAI-SearchBot',
        'Operator',
        'PanguBot',
        'Panscient',
        'Perplexity-User',
        'PerplexityBot',
        'PetalBot',
        'PhindBot',
        'Poseidon Research Crawler',
        'QualifiedBot',
        'QuillBot',
        'SBIntuitionsBot',
        'SEOkicks',
        'Scrapy',
        'SeekportBot',
        'SemrushBot',
        'Sidetrade indexer bot',
        'SiteKiosk',
        'SummalyBot',
        'Thinkbot',
        'TikTokSpider',
        'Timpibot',
        'TinyTestBot',
        'Turnitin',
        'VelenPublicWebCrawler',
        'WARDBot',
        'Webzio-Extended',
        'XoviBot',
        'YandexAdditional',
        'YisouSpider',
        'YouBot',
        'ZoominfoBot',
        'aiHitBot',
        'aiohttp',
        'anthropic-ai',
        'bedrockbot',
        'cohere-ai',
        'cohere-training-data-crawler',
        'facebookexternalhit',
        'fidget-spinner-bot',
        'iaskspider',
        'img2dataset',
        'internet-measurement',
        'linabot',
        'meta-externalagent',
        'meta-externalfetcher',
        'netEstate',
        'omgili',
        'panscient',
        'phxbot',
        'proximic',
        'python-requests',
        'quillbot.com',
        'wpbot',
        'serpstatbot',
        'test-bot',
        'wp_is_mobile',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent substrings; values are the host-name suffixes that the
     * reverse lookup of the client address must end with.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     */
    private const array ROBOT_REV_FWD_DNS = [
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Same layout as ROBOT_REV_FWD_DNS, but no forward lookup is performed.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const array ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     * TODO: fetch current lists of IPs, rather than use hard-coded values.
     * See https://merj.com/blog/dont-block-what-you-want-duckduckgo-and-common-crawl-to-provide-ip-address-api-endpoints
     */

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are user-agent substrings; values are the ASNs the client address may belong to.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const array ROBOT_ASNS = [
        'facebook' => ['AS32934'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * @param NetworkService $network_service Used to look up the IP ranges announced by an ASN.
     */
    public function __construct(private readonly NetworkService $network_service)
    {
    }

    /**
     * Reject bad robots, flag tolerated ones, and hand everything else to the next handler.
     *
     * Reads the "HTTP_USER_AGENT" server parameter and the "client-ip" and (optional)
     * "block_asn" request attributes.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface A 406 response when the request is blocked or challenged,
     *                           otherwise whatever the next handler returns.
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = Factory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        if ($ua === '') {
            return $this->response('Not acceptable: no-ua');
        }

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response('Not acceptable: bad-ua');
            }
        }

        $validated_bot = false;

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot)) {
                if (!$this->checkRobotDNS($ip, $valid_domains, false)) {
                    return $this->response('Not acceptable: bad-dns');
                }

                $validated_bot = true;
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot)) {
                if (!$this->checkRobotDNS($ip, $valid_domains, true)) {
                    return $this->response('Not acceptable: bad-dns');
                }

                $validated_bot = true;
            }
        }

        // TODO: fetch current lists of IPs, rather than use hard-coded values.

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            // Belonging to ANY ONE of the robot's ASNs is sufficient.  (Checking each ASN
            // in turn and rejecting on the first miss would demand membership of all of them.)
            if (!$this->addressInAnyAsn($address, $asns)) {
                return $this->response('Not acceptable: bad-dns');
            }

            $validated_bot = true;
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAnyAsn($address, $matches[1])) {
            return $this->response('Not acceptable: bad-asn');
        }

        // No Cookies?  Few headers?  Probably a robot.
        $has_cookies     = $request->getCookieParams() !== [];
        $has_few_headers = count($request->getHeaders()) <= 11;
        $suspected_bot   = !$has_cookies && $has_few_headers;

        // Robots often claim to be a browser.
        $claims_to_be_human =
            str_contains($ua, 'Chrome/') ||
            str_contains($ua, 'Firefox/') ||
            str_contains($ua, 'Opera/') ||
            str_contains($ua, 'Safari/')
        ;

        // Validated bots (such as google and bing) use headless browsers.  This is OK.
        // Anyone else claiming to be a browser needs to prove it by setting a cookie.
        if (!$validated_bot && $claims_to_be_human && !$has_cookies) {
            // The page sets a cookie and immediately refreshes itself; a client that
            // stores cookies will pass this check on the repeated request.
            $content =
                '<!DOCTYPE html>' .
                '<html lang="en">' .
                '<head>' .
                '<meta charset="utf-8">' .
                '<title>Cookie check</title>' .
                '<meta http-equiv="refresh" content="0">' .
                '</head>' .
                '<body>Cookie check</body>' .
                '</html>';

            return $this->response($content)
                ->withHeader('set-cookie', 'x=y; HttpOnly; SameSite=Strict');
        }

        // Bots get restricted access
        if ($validated_bot || $suspected_bot) {
            $request = $request->withAttribute(self::ROBOT_ATTRIBUTE_NAME, true);
        }

        // Scans for WordPress vulnerabilities?
        // Block these before wasting resources on DB connections, sessions, etc.
        $path = $request->getUri()->getPath();

        if (str_starts_with($path, '/xmlrpc.php') || str_starts_with($path, '/wp-')) {
            return $this->response('Not acceptable: not-wp');
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup of $ip must end with one of $valid_domains.  Unless $reverse_only
     * is set, the resulting host name must also resolve back to exactly $ip.
     *
     * NOTE(review): gethostbyname() is documented as returning an IPv4 address, so the
     * forward confirmation cannot succeed for an IPv6 client - confirm whether crawlers
     * connecting over IPv6 need to be supported here.
     *
     * @param string       $ip            The client address, as text.
     * @param list<string> $valid_domains Accepted host-name suffixes, e.g. ".googlebot.com".
     * @param bool         $reverse_only  Skip the forward (name to address) confirmation.
     *
     * @return bool True when the address is verified as belonging to the operator.
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // Yields false for malformed input.  When the lookup itself fails, PHP hands back
        // the address unchanged, which will not end with any of the domain suffixes.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Is an address inside any IP range of any of the given autonomous systems?
     *
     * ASNs are examined in order and the scan stops at the first matching range, so
     * later ASNs are not fetched once a match has been found.
     *
     * @param AddressInterface $address The client address.
     * @param array<string>    $asns    Autonomous system numbers, e.g. "AS32934".
     *
     * @return bool True on the first matching range; false when nothing matches or $asns is empty.
     */
    private function addressInAnyAsn(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Fetch the IP ranges for an autonomous system, using the file cache.
     *
     * Results are stored under the key "whois-asn-<ASN>".  The TTL is picked at random
     * between WHOIS_TTL_MIN and WHOIS_TTL_MAX, so that entries do not all expire together.
     * Range strings that cannot be parsed are dropped; array keys are not re-indexed.
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, function () use ($asn): array {
            $ranges = $this->network_service->findIpRangesForAsn($asn);
            $mapper = static fn (string $range): RangeInterface|null => Factory::parseRangeString($range);
            $ranges = array_map($mapper, $ranges);

            // The parser's null results (see the mapper's return type) are falsy and removed here.
            return array_filter($ranges);
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Create the "406 Not Acceptable" response used for every blocked or challenged request.
     *
     * @param string $content The response body.
     */
    private function response(string $content): ResponseInterface
    {
        return response($content, StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
||
| 395 |