Passed
Push — 2.1 ( cc4763...230ed5 )
by Greg
07:17
created

BadBotBlocker::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 3
rs 10
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2025 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Http\Middleware;
21
22
use Fig\Http\Message\StatusCodeInterface;
23
use Fisharebest\Webtrees\Registry;
24
use Fisharebest\Webtrees\Services\NetworkService;
25
use Fisharebest\Webtrees\Validator;
26
use IPLib\Address\AddressInterface;
27
use IPLib\Factory;
28
use IPLib\Range\RangeInterface;
29
use Psr\Http\Message\ResponseInterface;
30
use Psr\Http\Message\ServerRequestInterface;
31
use Psr\Http\Server\MiddlewareInterface;
32
use Psr\Http\Server\RequestHandlerInterface;
33
34
use function array_filter;
35
use function array_map;
36
use function assert;
37
use function gethostbyaddr;
38
use function gethostbyname;
39
use function preg_match_all;
40
use function random_int;
41
use function response;
42
use function str_contains;
43
use function str_ends_with;
44
45
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad/AI robot, or when
 * it names a well-known search engine but the client address cannot be verified
 * as belonging to that search engine (by DNS, IP address or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;

    /**
     * @see https://github.com/ai-robots-txt/ai.robots.txt for a list of AI crawlers.
     * We can't load this repository as a dependency as it's not a package.
     * Instead, the list from version 1.26 is copied here.
     */
    public const AI_ROBOTS = [
        'AI2Bot',
        'Ai2Bot-Dolma',
        'Amazonbot',
        'anthropic-ai',
        'Applebot',
        'Applebot-Extended',
        'Brightbot 1.0',
        'Bytespider',
        'CCBot',
        'ChatGPT-User',
        'Claude-Web',
        'ClaudeBot',
        'cohere-ai',
        'cohere-training-data-crawler',
        'Crawlspace',
        'Diffbot',
        'DuckAssistBot',
        'FacebookBot',
        'FriendlyCrawler',
        'Google-Extended',
        'GoogleOther',
        'GoogleOther-Image',
        'GoogleOther-Video',
        'GPTBot',
        'iaskspider/2.0',
        'ICC-Crawler',
        'ImagesiftBot',
        'img2dataset',
        'ISSCyberRiskCrawler',
        'Kangaroo Bot',
        'Meta-ExternalAgent',
        'Meta-ExternalFetcher',
        'OAI-SearchBot',
        'omgili',
        'omgilibot',
        'PanguBot',
        'PerplexityBot',
        'PetalBot',
        'Scrapy',
        'SemrushBot-OCOB',
        'SemrushBot-SWA',
        'Sidetrade indexer bot',
        'Timpibot',
        'VelenPublicWebCrawler',
        'Webzio-Extended',
        'YouBot',
    ];

    // Other bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AntBot', // Aggressive crawler
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler', // Crawler for babbar.tech
        'BLEXBot',
        'CensysInspect', // Vulnerability scanner
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Expanse', // Another pointless crawler
        'fidget-spinner-bot', // Aggressive crawler
        'Foregenix', // Vulnerability scanner
        'Go-http-client', // Crawler library used by many bots
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'internet-measurement', // Driftnet
        'IonCrawl',
        'Java', // Crawler library used by many bots
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'phxbot', // Badly written crawler
        'proximic',
        'python-requests', // Crawler library used by many bots
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'test-bot', // Aggressive crawler
        'TinyTestBot',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'YisouSpider',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934'],
        'twitter'  => ['AS13414'],
    ];

    private NetworkService $network_service;

    /**
     * @param NetworkService $network_service Used to look up the IP ranges announced by an autonomous system.
     */
    public function __construct(NetworkService $network_service)
    {
        $this->network_service = $network_service;
    }

    /**
     * Reject requests from unwanted or impersonated robots; pass everything else to the next handler.
     *
     * The checks run in this order, and the first one that fails ends the request:
     *  1. user-agent contains a name from AI_ROBOTS or BAD_ROBOTS;
     *  2. user-agent claims to be a search engine verified by reverse+forward DNS;
     *  3. user-agent claims to be a search engine verified by reverse DNS only;
     *  4. user-agent claims to be a search engine that uses fixed IP addresses;
     *  5. user-agent claims to be a service that operates from a known autonomous system;
     *  6. client address belongs to an autonomous system listed in the "block_asn" request attribute.
     *
     * All user-agent comparisons are case-sensitive substring matches.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = Factory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Robots that are never welcome, whatever their origin.
        foreach ([self::AI_ROBOTS, self::BAD_ROBOTS] as $robots) {
            foreach ($robots as $robot) {
                if (str_contains($ua, $robot)) {
                    return $this->response();
                }
            }
        }

        // Robots that must pass a reverse and forward DNS check.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that must pass a reverse DNS check.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that must connect from one of a fixed set of addresses.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot)) {
                foreach ($valid_ip_ranges as $ip_range) {
                    $range = Factory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Address verified - carry on with the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that must connect from one of their operator's autonomous systems.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            foreach ($asns as $asn) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // A match in ANY of the robot's ASNs is sufficient, so skip the
                        // remaining ASNs and carry on with the next robot.
                        continue 3;
                    }
                }
            }

            // The address is not in any of the ASNs listed for this robot.
            return $this->response();
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, a forward lookup of that host name must also
     * give back the original IP address.
     *
     * NOTE(review): gethostbyname() is documented as returning an IPv4 address, so
     * the forward check presumably cannot succeed for IPv6 clients - to be confirmed.
     *
     * @param string            $ip            The client's IP address
     * @param array<int,string> $valid_domains Domain suffixes, e.g. ".googlebot.com"
     * @param bool              $reverse_only  Skip the forward DNS confirmation
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // Only the first matching domain is considered.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Find the IP ranges for an autonomous system, using the file cache.
     *
     * The cache lifetime is chosen at random between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     * Ranges that cannot be parsed are discarded.
     *
     * @param string $asn An autonomous system number, e.g. "AS32934"
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, function () use ($asn): array {
            $ranges = $this->network_service->findIpRangesForAsn($asn);
            $mapper = static fn (string $range): ?RangeInterface => Factory::parseRangeString($range);
            $ranges = array_map($mapper, $ranges);

            // Factory::parseRangeString() gives null for invalid ranges - remove them.
            return array_filter($ranges);
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
341