Passed
Push — main ( 95a862...e873f4 )
by Greg
07:13
created

BadBotBlocker::process()   F

Complexity

Conditions 27
Paths 1367

Size

Total Lines 82
Code Lines 42

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 27
eloc 42
nc 1367
nop 2
dl 0
loc 82
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2023 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Http\Middleware;
21
22
use Fig\Http\Message\StatusCodeInterface;
23
use Fisharebest\Webtrees\Registry;
24
use Fisharebest\Webtrees\Validator;
25
use GuzzleHttp\Client;
26
use GuzzleHttp\Exception\GuzzleException;
27
use Iodev\Whois\Loaders\CurlLoader;
28
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
29
use Iodev\Whois\Whois;
30
use IPLib\Address\AddressInterface;
31
use IPLib\Factory as IPFactory;
32
use IPLib\Range\RangeInterface;
33
use Psr\Http\Message\ResponseInterface;
34
use Psr\Http\Message\ServerRequestInterface;
35
use Psr\Http\Server\MiddlewareInterface;
36
use Psr\Http\Server\RequestHandlerInterface;
37
use Throwable;
38
39
use function array_filter;
40
use function array_map;
41
use function assert;
42
use function gethostbyaddr;
43
use function gethostbyname;
44
use function preg_match_all;
45
use function random_int;
46
use function response;
47
use function str_contains;
48
use function str_ends_with;
49
50
/**
51
 * Middleware to block bad robots before they waste our valuable CPU cycles.
52
 */
53
class BadBotBlocker implements MiddlewareInterface
54
{
55
    private const string REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
0 ignored issues
show
Bug introduced by
A parse error occurred: Syntax error, unexpected T_STRING, expecting '=' on line 55 at column 25
Loading history...
56
    private const string REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';
57
58
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
59
    private const int WHOIS_TTL_MIN = 28 * 86400;
60
    private const int WHOIS_TTL_MAX = 35 * 86400;
61
    private const int WHOIS_TIMEOUT = 5;
62
63
    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    // process() rejects any request whose user-agent contains one of these strings.
    // The comparison uses str_contains(), so it is a case-sensitive substring match.
64
    public const array BAD_ROBOTS = [
65
        'admantx',
66
        'Adsbot',
67
        'AhrefsBot',
68
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
69
        'AntBot', // Aggressive crawler
70
        'AspiegelBot',
71
        'Awario', // Brand management
72
        'Barkrowler', // Crawler for babbar.tech
73
        'BLEXBot',
74
        'Bytespider', // Aggressive crawler from Bytedance/TikTok
75
        'CCBot', // Used to train a number of LLMs
76
        'CensysInspect', // Vulnerability scanner
77
        'ChatGPT-User', // Used by ChatGPT during operation
78
        'ClaudeBot', // Collects training data for LLMs
79
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
80
        'DotBot',
81
        'Expanse', // Another pointless crawler
82
        'FacebookBot', // Collects training data for Facebook's LLM translator.
83
        'fidget-spinner-bot', // Aggressive crawler
84
        'Foregenix', // Vulnerability scanner
85
        'FriendlyCrawler', // Collects training data for LLMs
86
        'Go-http-client', // Crawler library used by many bots
87
        'Google-Extended', // Collects training data for Google Bard
88
        'GPTBot', // Collects training data for ChatGPT
89
        'Grapeshot',
90
        'Honolulu-bot', // Aggressive crawler, no info available
91
        'ia_archiver',
92
        'internet-measurement', // Driftnet
93
        'IonCrawl',
94
        'Java', // Crawler library used by many bots
95
        'linabot', // Aggressive crawler, no info available
96
        'Linguee',
97
        'MegaIndex.ru',
98
        'MJ12bot',
99
        'netEstate NE',
100
        'Omgilibot', // Collects training data for LLMs
101
        'panscient',
102
        'PetalBot',
103
        'phxbot', // Badly written crawler
104
        'proximic',
105
        'python-requests', // Crawler library used by many bots
106
        'Scrapy', // Scraping tool
107
        'SeekportBot', // Pretends to be a search engine - but isn't
108
        'SemrushBot',
109
        'serpstatbot',
110
        'SEOkicks',
111
        'SiteKiosk',
112
        'test-bot', // Aggressive crawler
113
        'TinyTestBot',
114
        'Turnitin',
115
        'wp_is_mobile', // Nothing to do with wordpress
116
        'XoviBot',
117
        'YisouSpider',
118
        'ZoominfoBot',
119
    ];
120
121
    /**
122
     * Some search engines use reverse/forward DNS to verify the IP address.
123
     *
124
     * @see https://developer.amazon.com/support/amazonbot
125
     * @see https://support.google.com/webmasters/answer/80553?hl=en
126
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
127
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
128
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
129
     * @see https://www.mojeek.com/bot.html
130
     * @see https://support.apple.com/en-gb/HT204683
131
     */
132
    private const array ROBOT_REV_FWD_DNS = [
133
        'Amazonbot'        => ['.crawl.amazon.com'],
134
        'Applebot'         => ['.applebot.apple.com'],
135
        'BingPreview'      => ['.search.msn.com'],
136
        'Google'           => ['.google.com', '.googlebot.com'],
137
        'Mail.RU_Bot'      => ['.mail.ru'],
138
        'MicrosoftPreview' => ['.search.msn.com'],
139
        'MojeekBot'        => ['.mojeek.com'],
140
        'Qwantify'         => ['.qwant.com'],
141
        'Sogou'            => ['.crawl.sogou.com'],
142
        'Yahoo'            => ['.crawl.yahoo.net'],
143
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
144
        'bingbot'          => ['.search.msn.com'],
145
        'msnbot'           => ['.search.msn.com'],
146
    ];
147
148
    /**
149
     * Some search engines only use reverse DNS to verify the IP address.
150
     *
151
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
152
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
153
     * @see https://www.ionos.de/terms-gtc/faq-crawler
154
     */
155
    private const array ROBOT_REV_ONLY_DNS = [
156
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
157
        'FreshBot'    => ['.seznam.cz'],
158
        'IonCrawl'    => ['.1und1.org'],
159
        'Neevabot'    => ['.neeva.com'],
160
        'SeznamBot'   => ['.seznam.cz'],
161
    ];
162
163
    /**
164
     * Some search engines operate from designated IP addresses.
165
     *
166
     * @see https://www.apple.com/go/applebot
167
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
168
     */
169
    private const array ROBOT_IPS = [
170
        'AppleBot'    => [
171
            '17.0.0.0/8',
172
        ],
173
        'Ask Jeeves'  => [
174
            '65.214.45.143',
175
            '65.214.45.148',
176
            '66.235.124.192',
177
            '66.235.124.7',
178
            '66.235.124.101',
179
            '66.235.124.193',
180
            '66.235.124.73',
181
            '66.235.124.196',
182
            '66.235.124.74',
183
            '63.123.238.8',
184
            '202.143.148.61',
185
        ],
186
        'DuckDuckBot' => [
187
            '23.21.227.69',
188
            '50.16.241.113',
189
            '50.16.241.114',
190
            '50.16.241.117',
191
            '50.16.247.234',
192
            '52.204.97.54',
193
            '52.5.190.19',
194
            '54.197.234.188',
195
            '54.208.100.253',
196
            '54.208.102.37',
197
            '107.21.1.8',
198
        ],
199
    ];
200
201
    /**
202
     * Some search engines operate from designated IP addresses.
203
     *
204
     * @see https://bot.seekport.com/
205
     */
206
    private const array ROBOT_IP_FILES = [
207
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
208
    ];
209
210
    /**
211
     * Some search engines operate from within a designated autonomous system.
212
     *
213
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
214
     * @see https://www.facebook.com/peering/
215
     */
216
    private const array ROBOT_ASNS = [
217
        'facebook' => ['AS32934', 'AS63293'],
218
        'twitter'  => ['AS13414'],
219
    ];
220
221
    /**
     * Reject requests from unwanted or impersonated robots; pass everything else to the next handler.
     *
     * The checks run in the following order and stop at the first one that fails:
     *  1. the user-agent contains a name listed in BAD_ROBOTS;
     *  2. the user-agent names a robot that is verified by DNS, but the DNS check fails;
     *  3. the user-agent names a robot with a list of IP ranges, but the client is outside them;
     *  4. the user-agent names a robot tied to autonomous systems, but the client is outside all of them;
     *  5. the client address belongs to an autonomous system named in the "block_asn" request attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // "||" short-circuits, so the cheap string tests run before any DNS, HTTP or whois lookups.
        $blocked = $this->isBadRobot($ua)
            || $this->failsDnsVerification($ua, $ip)
            || $this->failsIpVerification($ua, $address)
            || $this->failsAsnVerification($ua, $address)
            || $this->isFromBlockedNetwork($request, $address);

        if ($blocked) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Does the user-agent contain the name of a robot that is never welcome?
     *
     * @param string $ua
     *
     * @return bool
     */
    private function isBadRobot(string $ua): bool
    {
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does the user-agent claim to be a DNS-verified robot, while the client's DNS records say otherwise?
     *
     * @param string $ua
     * @param string $ip
     *
     * @return bool
     */
    private function failsDnsVerification(string $ua, string $ip): bool
    {
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return true;
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does the user-agent claim to be a robot with known IP ranges, while the client is outside those ranges?
     *
     * @param string           $ua
     * @param AddressInterface $address
     *
     * @return bool
     */
    private function failsIpVerification(string $ua, AddressInterface $address): bool
    {
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->isInAnyRangeString($address, $valid_ip_ranges)) {
                return true;
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // The remote list is only fetched when the user-agent actually names this robot.
            if (str_contains($ua, $robot) && !$this->isInAnyRangeString($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does the user-agent claim to be a robot tied to autonomous systems, while the client is outside all of them?
     *
     * A robot may use several autonomous systems; membership of any one of them is sufficient.
     *
     * @param string           $ua
     * @param AddressInterface $address
     *
     * @return bool
     */
    private function failsAsnVerification(string $ua, AddressInterface $address): bool
    {
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->isInAnyAsn($address, $asns)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does the client belong to a network listed in the "block_asn" request attribute?
     *
     * @param ServerRequestInterface $request
     * @param AddressInterface       $address
     *
     * @return bool
     */
    private function isFromBlockedNetwork(ServerRequestInterface $request, AddressInterface $address): bool
    {
        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        return $this->isInAnyAsn($address, $matches[1]);
    }

    /**
     * Is the address inside at least one of the given ranges?  Strings that cannot be parsed are skipped.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function isInAnyRangeString(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is the address inside a range belonging to at least one of the given autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns
     *
     * @return bool
     */
    private function isInAnyAsn(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }
310
311
    /**
     * Verify a robot's claimed identity from the DNS records of its IP address.
     *
     * The reverse lookup of the address must give a hostname ending in one of the
     * valid domains.  Unless $reverse_only is set, that hostname must also resolve
     * back to the original address.
     *
     * NOTE(review): gethostbyname() only returns IPv4 addresses, so the forward
     * check cannot succeed for a client connecting over IPv6.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $hostname = gethostbyaddr($ip);

        if ($hostname !== false) {
            foreach ($valid_domains as $valid_domain) {
                if (!str_ends_with($hostname, $valid_domain)) {
                    continue;
                }

                // Only the first matching domain is considered.
                if ($reverse_only) {
                    return true;
                }

                return gethostbyname($hostname) === $ip;
            }
        }

        return false;
    }
336
337
    /**
     * Look up the IP ranges announced by an autonomous system, using whois.
     *
     * The result is stored in the file cache under a key derived from the ASN, with a
     * randomised lifetime between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.  Any error
     * during the lookup gives an empty list, which is handed to the cache like any
     * other result.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $cache = Registry::cache()->file();

        $lookup = static function () use ($asn): array {
            // A route has either an IPv4 or an IPv6 prefix; unparsable prefixes become null.
            $to_range = static function (AsnRouteInfo $route_info): RangeInterface|null {
                return IPFactory::parseRangeString($route_info->route ?: $route_info->route6);
            };

            try {
                $whois  = new Whois(new CurlLoader(self::WHOIS_TIMEOUT));
                $routes = $whois->loadAsnInfo($asn)->routes;

                // array_filter() without a callback discards the null entries.
                return array_filter(array_map($to_range, $routes));
            } catch (Throwable) {
                return [];
            }
        };

        return $cache->remember('whois-asn-' . $asn, $lookup, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }
362
363
    /**
     * Download a robot's published list of IP addresses.
     *
     * Every IPv4 address found in the response body is returned.  The result is stored
     * in the file cache under a key derived from $ua, with a randomised lifetime between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.  An HTTP client error gives an empty list.
     *
     * @param string $ua  Used to build the cache key
     * @param string $url Location of the list
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        $cache = Registry::cache()->file();

        $download = static function () use ($url): array {
            try {
                $body = (new Client())->get($url, ['timeout' => 5])->getBody()->getContents();
            } catch (GuzzleException) {
                return [];
            }

            preg_match_all(self::REGEX_IPV4, $body, $ip_addresses);

            return $ip_addresses[0];
        };

        return $cache->remember('url-ip-list-' . $ua, $download, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }
387
388
    /**
     * Build the response that is sent to a blocked robot.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        $status = StatusCodeInterface::STATUS_NOT_ACCEPTABLE;

        return response('Not acceptable', $status);
    }
395
}
396