Issues (2511)

app/Http/Middleware/BadBotBlocker.php (1 issue)

Labels
Severity
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2025 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Http\Middleware;
21
22
use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Services\NetworkService;
use Fisharebest\Webtrees\Validator;
use IPLib\Address\AddressInterface;
use IPLib\Factory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;

use function array_filter;
use function array_map;
use function assert;
use function count;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;
use function str_starts_with;
/**
47
 * Middleware to block bad robots before they waste our valuable CPU cycles.
48
 */
49
class BadBotBlocker implements MiddlewareInterface
50
{
51
    public const string ROBOT_ATTRIBUTE_NAME = 'is-a-robot';
0 ignored issues
show
A parse error occurred: Syntax error, unexpected T_STRING, expecting '=' on line 51 at column 24
Loading history...
52
53
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
54
    private const int WHOIS_TTL_MIN = 28 * 86400;
55
    private const int WHOIS_TTL_MAX = 35 * 86400;
56
57
    // An opinionated list of "bad" robots. Typically, these are AI and SEO crawlers.
58
    public const array BAD_ROBOTS = [
59
        'ADmantX',
60
        'AI2Bot',
61
        'Adsbot',
62
        'AISearchBot',
63
        'AhrefsBot',
64
        'Ai2Bot-Dolma',
65
        'AliyunSecBot',
66
        'Amazonbot',
67
        'Andibot',
68
        'AntBot',
69
        'Applebot',
70
        'AspiegelBot',
71
        'Awario',
72
        'BLEXBot',
73
        'Barkrowler',
74
        'Brightbot',
75
        'Bytespider',
76
        'CCBot',
77
        'CensysInspect',
78
        'ChatGPT-User',
79
        'Claude-SearchBot',
80
        'Claude-User',
81
        'Claude-Web',
82
        'ClaudeBot',
83
        'Cotoyogi',
84
        'Crawlspace',
85
        'DataForSeoBot',
86
        'Datenbank Crawler',
87
        'Devin',
88
        'Diffbot',
89
        'DotBot',
90
        'DuckAssistBot',
91
        'Echobot Bot',
92
        'EchoboxBot',
93
        'Expanse',
94
        'FacebookBot',
95
        'Factset_spyderbot',
96
        'FirecrawlAgent',
97
        'Foregenix',
98
        'FriendlyCrawler',
99
        'GPTBot',
100
        'Gemini-Deep-Research',
101
        'Go-http-client',
102
        'Google-CloudVertexBot',
103
        'Google-Extended',
104
        'GoogleAgent-Mariner',
105
        'GoogleOther',
106
        'Grapeshot',
107
        'Honolulu-bot',
108
        'ICC-Crawler',
109
        'ISSCyberRiskCrawler',
110
        'ImagesiftBot',
111
        'IonCrawl',
112
        'Java',
113
        'Kangaroo Bot',
114
        'Linguee',
115
        'MJ12bot',
116
        'MegaIndex.ru',
117
        'Meta-ExternalAgent',
118
        'Meta-ExternalFetcher',
119
        'MistralAI-User',
120
        'MyCentralAIScraperBot',
121
        'NovaAct',
122
        'OAI-SearchBot',
123
        'Operator',
124
        'PanguBot',
125
        'Panscient',
126
        'Perplexity-User',
127
        'PerplexityBot',
128
        'PetalBot',
129
        'PhindBot',
130
        'Poseidon Research Crawler',
131
        'QualifiedBot',
132
        'QuillBot',
133
        'SBIntuitionsBot',
134
        'SEOkicks',
135
        'Scrapy',
136
        'SeekportBot',
137
        'SemrushBot',
138
        'Sidetrade indexer bot',
139
        'SiteKiosk',
140
        'SummalyBot',
141
        'Thinkbot',
142
        'TikTokSpider',
143
        'Timpibot',
144
        'TinyTestBot',
145
        'Turnitin',
146
        'VelenPublicWebCrawler',
147
        'WARDBot',
148
        'Webzio-Extended',
149
        'XoviBot',
150
        'YandexAdditional',
151
        'YisouSpider',
152
        'YouBot',
153
        'ZoominfoBot',
154
        'aiHitBot',
155
        'aiohttp',
156
        'anthropic-ai',
157
        'bedrockbot',
158
        'cohere-ai',
159
        'cohere-training-data-crawler',
160
        'facebookexternalhit',
161
        'fidget-spinner-bot',
162
        'iaskspider',
163
        'img2dataset',
164
        'internet-measurement',
165
        'linabot',
166
        'meta-externalagent',
167
        'meta-externalfetcher',
168
        'netEstate',
169
        'omgili',
170
        'panscient',
171
        'phxbot',
172
        'proximic',
173
        'python-requests',
174
        'quillbot.com',
175
        'wpbot',
176
        'serpstatbot',
177
        'test-bot',
178
        'wp_is_mobile',
179
    ];
180
181
    /**
182
     * Some search engines use reverse/forward DNS to verify the IP address.
183
     *
184
     * @see https://support.google.com/webmasters/answer/80553?hl=en
185
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
186
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
187
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
188
     * @see https://www.mojeek.com/bot.html
189
     */
190
    private const array ROBOT_REV_FWD_DNS = [
191
        'BingPreview'      => ['.search.msn.com'],
192
        'Google'           => ['.google.com', '.googlebot.com'],
193
        'Mail.RU_Bot'      => ['.mail.ru'],
194
        'MicrosoftPreview' => ['.search.msn.com'],
195
        'MojeekBot'        => ['.mojeek.com'],
196
        'Qwantify'         => ['.qwant.com'],
197
        'Sogou'            => ['.crawl.sogou.com'],
198
        'Yahoo'            => ['.crawl.yahoo.net'],
199
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
200
        'bingbot'          => ['.search.msn.com'],
201
        'msnbot'           => ['.search.msn.com'],
202
    ];
203
204
    /**
205
     * Some search engines only use reverse DNS to verify the IP address.
206
     *
207
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
208
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
209
     * @see https://www.ionos.de/terms-gtc/faq-crawler
210
     */
211
    private const array ROBOT_REV_ONLY_DNS = [
212
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
213
        'FreshBot'    => ['.seznam.cz'],
214
        'Neevabot'    => ['.neeva.com'],
215
        'SeznamBot'   => ['.seznam.cz'],
216
    ];
217
218
    /**
219
     * Some search engines operate from designated IP addresses.
220
     * TODO: fetch current lists of IPs, rather than use hard-coded values.
221
     * See https://merj.com/blog/dont-block-what-you-want-duckduckgo-and-common-crawl-to-provide-ip-address-api-endpoints
222
     */
223
224
    /**
225
     * Some search engines operate from within a designated autonomous system.
226
     *
227
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
228
     * @see https://www.facebook.com/peering/
229
     */
230
    private const array ROBOT_ASNS = [
231
        'facebook' => ['AS32934'],
232
        'twitter'  => ['AS13414'],
233
    ];
234
235
    public function __construct(private readonly NetworkService $network_service)
236
    {
237
    }
238
239
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
240
    {
241
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
242
        $ip      = Validator::attributes($request)->string('client-ip');
243
        $address = Factory::parseAddressString($ip);
244
        assert($address instanceof AddressInterface);
245
246
        if ($ua === '') {
247
            return $this->response('Not acceptable: no-ua');
248
        }
249
250
        foreach (self::BAD_ROBOTS as $robot) {
251
            if (str_contains($ua, $robot)) {
252
                return $this->response('Not acceptable: bad-ua');
253
            }
254
        }
255
256
        $validated_bot =  false;
257
258
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
259
            if (str_contains($ua, $robot)) {
260
                if ($this->checkRobotDNS($ip, $valid_domains, false)) {
261
                    $validated_bot = true;
262
                } else {
263
                    return $this->response('Not acceptable: bad-dns');
264
                }
265
            }
266
        }
267
268
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
269
            if (str_contains($ua, $robot)) {
270
                if ($this->checkRobotDNS($ip, $valid_domains, true)) {
271
                    $validated_bot = true;
272
                } else {
273
                    return $this->response('Not acceptable: bad-dns');
274
                }
275
            }
276
        }
277
278
        // TODO: fetch current lists of IPs, rather than use hard-coded values.
279
280
        foreach (self::ROBOT_ASNS as $robot => $asns) {
281
            foreach ($asns as $asn) {
282
                if (str_contains($ua, $robot)) {
283
                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
284
                        if ($range->contains($address)) {
285
                            $validated_bot = true;
286
                            continue 2;
287
                        }
288
                    }
289
290
                    return $this->response('Not acceptable: bad-dns');
291
                }
292
            }
293
        }
294
295
        // Allow sites to block access from entire networks.
296
        $block_asn = Validator::attributes($request)->string('block_asn', '');
297
        preg_match_all('/(AS\d+)/', $block_asn, $matches);
298
299
        foreach ($matches[1] as $asn) {
300
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
301
                if ($range->contains($address)) {
302
                    return $this->response('Not acceptable: bad-asn');
303
                }
304
            }
305
        }
306
307
        // No Cookies?  Few headers?  Probably a robot.
308
        $has_cookies     = $request->getCookieParams() !== [];
309
        $has_few_headers = count($request->getHeaders()) <= 11;
310
        $suspected_bot   = !$has_cookies && $has_few_headers;
311
312
        // Robots often claim to be a browser.
313
        $claims_to_be_human =
314
            str_contains($ua, 'Chrome/') ||
315
            str_contains($ua, 'Firefox/') ||
316
            str_contains($ua, 'Opera/') ||
317
            str_contains($ua, 'Safari/')
318
        ;
319
320
        // Validated bots (such as google and bing) use headless browsers.  This is OK.
321
        // Anyone else claiming to be a browser needs to prove it by setting a cookie.
322
        if (!$validated_bot && $claims_to_be_human && !$has_cookies) {
323
            $content =
324
                '<!DOCTYPE html>' .
325
                '<html lang="en">' .
326
                '<head>' .
327
                '<meta charset="utf-8">' .
328
                '<title>Cookie check</title>' .
329
                '<meta http-equiv="refresh" content="0">' .
330
                '</head>' .
331
                '<body>Cookie check</body>' .
332
                '</html>';
333
334
            return $this->response($content)
335
                ->withHeader('set-cookie', 'x=y; HttpOnly; SameSite=Strict');
336
        }
337
338
        // Bots get restricted access
339
        if ($validated_bot || $suspected_bot) {
340
            $request = $request->withAttribute(self::ROBOT_ATTRIBUTE_NAME, true);
341
        }
342
343
        // Scans for WordPress vulnerabilities?
344
        // Block these before wasting resources on DB connections, sessions, etc.
345
        $path = $request->getUri()->getPath();
346
347
        if (str_starts_with($path, '/xmlrpc.php') || str_starts_with($path, '/wp-')) {
348
            return $this->response('Not acceptable: not-wp');
349
        }
350
351
        return $handler->handle($request);
352
    }
353
354
    /**
355
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
356
     *
357
     * @param list<string> $valid_domains
358
     */
359
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
360
    {
361
        $host = gethostbyaddr($ip);
362
363
        if ($host === false) {
364
            return false;
365
        }
366
367
        foreach ($valid_domains as $domain) {
368
            if (str_ends_with($host, $domain)) {
369
                return $reverse_only || $ip === gethostbyname($host);
370
            }
371
        }
372
373
        return false;
374
    }
375
376
    /**
377
     * @return array<RangeInterface>
378
     */
379
    private function fetchIpRangesForAsn(string $asn): array
380
    {
381
        return Registry::cache()->file()->remember('whois-asn-' . $asn, function () use ($asn): array {
382
            $ranges = $this->network_service->findIpRangesForAsn($asn);
383
            $mapper = static fn (string $range): RangeInterface|null => Factory::parseRangeString($range);
384
            $ranges = array_map($mapper, $ranges);
385
386
            return array_filter($ranges);
387
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
388
    }
389
390
    private function response(string $content): ResponseInterface
391
    {
392
        return response($content, StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
393
    }
394
}
395