1 | <?php |
||
2 | |||
3 | /** |
||
4 | * webtrees: online genealogy |
||
5 | * Copyright (C) 2025 webtrees development team |
||
6 | * This program is free software: you can redistribute it and/or modify |
||
7 | * it under the terms of the GNU General Public License as published by |
||
8 | * the Free Software Foundation, either version 3 of the License, or |
||
9 | * (at your option) any later version. |
||
10 | * This program is distributed in the hope that it will be useful, |
||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
13 | * GNU General Public License for more details. |
||
14 | * You should have received a copy of the GNU General Public License |
||
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
||
16 | */ |
||
17 | |||
18 | declare(strict_types=1); |
||
19 | |||
20 | namespace Fisharebest\Webtrees\Http\Middleware; |
||
21 | |||
22 | use Fig\Http\Message\StatusCodeInterface; |
||
23 | use Fisharebest\Webtrees\Registry; |
||
24 | use Fisharebest\Webtrees\Services\NetworkService; |
||
25 | use Fisharebest\Webtrees\Validator; |
||
26 | use IPLib\Address\AddressInterface; |
||
27 | use IPLib\Factory; |
||
28 | use IPLib\Range\RangeInterface; |
||
29 | use Psr\Http\Message\ResponseInterface; |
||
30 | use Psr\Http\Message\ServerRequestInterface; |
||
31 | use Psr\Http\Server\MiddlewareInterface; |
||
32 | use Psr\Http\Server\RequestHandlerInterface; |
||
33 | |||
use function array_filter;
use function array_map;
use function assert;
use function count;
use function dns_get_record;
use function gethostbyaddr;
use function gethostbyname;
use function gethostbynamel;
use function in_array;
use function inet_pton;
use function preg_match_all;
use function random_int;
use function response;
use function restore_error_handler;
use function set_error_handler;
use function str_contains;
use function str_ends_with;
use function str_starts_with;

use const DNS_AAAA;
||
45 | |||
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Each request is checked, in order, against:
 *  - an empty user-agent,
 *  - a list of unwanted user-agents,
 *  - DNS / autonomous-system verification for user-agents claiming to be a known search engine,
 *  - site-configured blocked autonomous systems,
 *  - a cookie challenge for unverified clients that claim to be a browser,
 *  - well-known WordPress probe URLs.
 *
 * Rejected requests receive an HTTP 406 response.  Requests from validated or
 * suspected robots are passed on with the ROBOT_ATTRIBUTE_NAME attribute set.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Name of the request attribute that is set to true for validated/suspected robots.
    public const string ROBOT_ATTRIBUTE_NAME = 'is-a-robot';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    // (28 and 35 days - presumably the cache TTL is in seconds; confirm against the cache API.)
    private const int WHOIS_TTL_MIN = 28 * 86400;
    private const int WHOIS_TTL_MAX = 35 * 86400;

    // An opinionated list of "bad" robots.  Typically, these are AI and SEO crawlers.
    // Matching is a case-sensitive substring test against the user-agent.
    public const array BAD_ROBOTS = [
        'ADmantX',
        'AI2Bot',
        'Adsbot',
        'AISearchBot',
        'AhrefsBot',
        'Ai2Bot-Dolma',
        'AliyunSecBot',
        'Amazonbot',
        'Andibot',
        'AntBot',
        'Applebot',
        'AspiegelBot',
        'Awario',
        'BLEXBot',
        'Barkrowler',
        'Brightbot',
        'Bytespider',
        'CCBot',
        'CensysInspect',
        'ChatGPT-User',
        'Claude-SearchBot',
        'Claude-User',
        'Claude-Web',
        'ClaudeBot',
        'Cotoyogi',
        'Crawlspace',
        'DataForSeoBot',
        'Datenbank Crawler',
        'Devin',
        'Diffbot',
        'DotBot',
        'DuckAssistBot',
        'Echobot Bot',
        'EchoboxBot',
        'Expanse',
        'FacebookBot',
        'Factset_spyderbot',
        'FirecrawlAgent',
        'Foregenix',
        'FriendlyCrawler',
        'GPTBot',
        'Gemini-Deep-Research',
        'Go-http-client',
        'Google-CloudVertexBot',
        'Google-Extended',
        'GoogleAgent-Mariner',
        'GoogleOther',
        'Grapeshot',
        'Honolulu-bot',
        'ICC-Crawler',
        'ISSCyberRiskCrawler',
        'ImagesiftBot',
        'IonCrawl',
        'Java',
        'Kangaroo Bot',
        'Linguee',
        'MJ12bot',
        'MegaIndex.ru',
        'Meta-ExternalAgent',
        'Meta-ExternalFetcher',
        'MistralAI-User',
        'MyCentralAIScraperBot',
        'NovaAct',
        'OAI-SearchBot',
        'Operator',
        'PanguBot',
        'Panscient',
        'Perplexity-User',
        'PerplexityBot',
        'PetalBot',
        'PhindBot',
        'Poseidon Research Crawler',
        'QualifiedBot',
        'QuillBot',
        'SBIntuitionsBot',
        'SEOkicks',
        'Scrapy',
        'SeekportBot',
        'SemrushBot',
        'Sidetrade indexer bot',
        'SiteKiosk',
        'SummalyBot',
        'Thinkbot',
        'TikTokSpider',
        'Timpibot',
        'TinyTestBot',
        'Turnitin',
        'VelenPublicWebCrawler',
        'WARDBot',
        'Webzio-Extended',
        'XoviBot',
        'YandexAdditional',
        'YisouSpider',
        'YouBot',
        'ZoominfoBot',
        'aiHitBot',
        'aiohttp',
        'anthropic-ai',
        'bedrockbot',
        'cohere-ai',
        'cohere-training-data-crawler',
        'facebookexternalhit',
        'fidget-spinner-bot',
        'iaskspider',
        'img2dataset',
        'internet-measurement',
        'linabot',
        'meta-externalagent',
        'meta-externalfetcher',
        'netEstate',
        'omgili',
        'panscient',
        'phxbot',
        'proximic',
        'python-requests',
        'quillbot.com',
        'wpbot',
        'serpstatbot',
        'test-bot',
        'wp_is_mobile',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent substrings; values are the permitted hostname suffixes.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     */
    private const array ROBOT_REV_FWD_DNS = [
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Keys are user-agent substrings; values are the permitted hostname suffixes.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const array ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    // Some search engines operate from designated IP addresses.
    // TODO: fetch current lists of IPs, rather than use hard-coded values.
    // See https://merj.com/blog/dont-block-what-you-want-duckduckgo-and-common-crawl-to-provide-ip-address-api-endpoints

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are user-agent substrings; values are the autonomous systems the robot may use.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const array ROBOT_ASNS = [
        'facebook' => ['AS32934'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * @param NetworkService $network_service Used to look up the IP ranges of an autonomous system.
     */
    public function __construct(private readonly NetworkService $network_service)
    {
    }

    /**
     * Reject requests from unwanted or unverifiable robots; flag the remaining robots.
     *
     * @param ServerRequestInterface  $request Must carry a "client-ip" attribute.
     * @param RequestHandlerInterface $handler Next handler, called only for accepted requests.
     *
     * @return ResponseInterface A 406 response for rejected requests, otherwise the handler's response.
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = Factory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        if ($ua === '') {
            return $this->response('Not acceptable: no-ua');
        }

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response('Not acceptable: bad-ua');
            }
        }

        $validated_bot = false;

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot)) {
                if ($this->checkRobotDNS($ip, $valid_domains, false)) {
                    $validated_bot = true;
                } else {
                    return $this->response('Not acceptable: bad-dns');
                }
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot)) {
                if ($this->checkRobotDNS($ip, $valid_domains, true)) {
                    $validated_bot = true;
                } else {
                    return $this->response('Not acceptable: bad-dns');
                }
            }
        }

        // TODO: fetch current lists of IPs, rather than use hard-coded values.

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if ($asns === [] || !str_contains($ua, $robot)) {
                continue;
            }

            // The robot is genuine if its address is in ANY of the operator's autonomous systems.
            // (The response text says "bad-dns" although this is an ASN check - kept for compatibility.)
            if (!$this->addressIsInAsns($address, $asns)) {
                return $this->response('Not acceptable: bad-dns');
            }

            $validated_bot = true;
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressIsInAsns($address, $matches[1])) {
            return $this->response('Not acceptable: bad-asn');
        }

        // No Cookies?  Few headers?  Probably a robot.
        $has_cookies     = $request->getCookieParams() !== [];
        $has_few_headers = count($request->getHeaders()) <= 11;
        $suspected_bot   = !$has_cookies && $has_few_headers;

        // Robots often claim to be a browser.
        $claims_to_be_human =
            str_contains($ua, 'Chrome/') ||
            str_contains($ua, 'Firefox/') ||
            str_contains($ua, 'Opera/') ||
            str_contains($ua, 'Safari/')
        ;

        // Validated bots (such as google and bing) use headless browsers.  This is OK.
        // Anyone else claiming to be a browser needs to prove it by setting a cookie.
        if (!$validated_bot && $claims_to_be_human && !$has_cookies) {
            // The page sets a cookie and immediately reloads itself.
            $content =
                '<!DOCTYPE html>' .
                '<html lang="en">' .
                '<head>' .
                '<meta charset="utf-8">' .
                '<title>Cookie check</title>' .
                '<meta http-equiv="refresh" content="0">' .
                '</head>' .
                '<body>Cookie check</body>' .
                '</html>';

            return $this->response($content)
                ->withHeader('set-cookie', 'x=y; HttpOnly; SameSite=Strict');
        }

        // Bots get restricted access
        if ($validated_bot || $suspected_bot) {
            $request = $request->withAttribute(self::ROBOT_ATTRIBUTE_NAME, true);
        }

        // Scans for WordPress vulnerabilities?
        // Block these before wasting resources on DB connections, sessions, etc.
        $path = $request->getUri()->getPath();

        if (str_starts_with($path, '/xmlrpc.php') || str_starts_with($path, '/wp-')) {
            return $this->response('Not acceptable: not-wp');
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string       $ip            The client's IP address (IPv4 or IPv6).
     * @param list<string> $valid_domains Permitted hostname suffixes, e.g. ".googlebot.com".
     * @param bool         $reverse_only  If true, skip the forward confirmation of the hostname.
     *
     * @return bool True if the reverse lookup (and, unless $reverse_only, the forward lookup) succeeds.
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        // gethostbyaddr() gives false for malformed input and the unmodified address on failure.
        if ($host === false || $host === $ip) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $this->forwardDnsMatches($host, $ip);
            }
        }

        return false;
    }

    /**
     * Does a forward DNS lookup of a hostname include the given IP address?
     *
     * gethostbyname() can only return a single IPv4 address, so it can never confirm an
     * IPv6 client.  Check all A records for IPv4 clients and all AAAA records for IPv6 clients.
     */
    private function forwardDnsMatches(string $host, string $ip): bool
    {
        if (!str_contains($ip, ':')) {
            $ipv4_addresses = gethostbynamel($host);

            return $ipv4_addresses !== false && in_array($ip, $ipv4_addresses, true);
        }

        // IPv6 addresses have many textual forms, so compare the packed binary form.
        $packed_ip = inet_pton($ip);

        // dns_get_record() raises a warning when the lookup fails.  A failed lookup simply
        // means "not verified", so do not let it reach the application's error handler.
        set_error_handler(static fn (): bool => true);

        try {
            $records = dns_get_record($host, DNS_AAAA);
        } finally {
            restore_error_handler();
        }

        if ($packed_ip === false || $records === false) {
            return false;
        }

        foreach ($records as $record) {
            if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $packed_ip) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address within any of the given autonomous systems?
     *
     * @param list<string> $asns Autonomous system numbers, e.g. "AS32934".
     */
    private function addressIsInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Find the IP ranges announced by an autonomous system, using the file cache.
     *
     * Ranges that cannot be parsed are discarded.
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, function () use ($asn): array {
            $ranges = $this->network_service->findIpRangesForAsn($asn);
            $mapper = static fn (string $range): RangeInterface|null => Factory::parseRangeString($range);
            $ranges = array_map($mapper, $ranges);

            // Remove the nulls produced by unparseable ranges.
            return array_filter($ranges);
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Create the "406 Not Acceptable" response used for every rejected request.
     */
    private function response(string $content): ResponseInterface
    {
        return response($content, StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
||
395 |