Passed
Push — 2.1 (c1023f...c6c623)
by Greg
07:34 queued 53s

BadBotBlocker::fetchIpRangesForUrl()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 15
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric    Value
cc        2
eloc      10
nc        1
nop       2
dl        0
loc       15
rs        9.9332
c         1
b         0
f         0
<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 */
class BadBotBlocker implements MiddlewareInterface
{
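    // A dotted-quad IPv4 address: four octets, each limited to 0-255, bounded by word breaks.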
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
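        // The client-ip attribute is expected to hold a valid IP address (it is added to the request earlier in the middleware stack).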
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

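        // Robots that operate from published IP ranges: a client address inside one of the
        // ranges is genuine, so skip to the next robot (continue 2); otherwise block the request.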
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot)) {
                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            foreach ($asns as $asn) {
                if (str_contains($ua, $robot)) {
                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                        if ($range->contains($address)) {
                            continue 2;
                        }
                    }

                    return $this->response();
                }
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
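        // Reverse DNS gives the host name registered for the IP address.  A robot is genuine when
        // that name belongs to one of the operator's domains and (unless reverse-only) resolves
        // back to the same IP address.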
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                return array_filter($ranges);
            } catch (Throwable $ex) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * @param string $ua
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
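
The method under review, fetchIpRangesForUrl(), relies on REGEX_IPV4 to pull addresses out of the downloaded text. As a rough standalone sketch (the input string and addresses below are invented for illustration), the extraction behaves like this:

<?php
// Illustration only - REGEX_IPV4 expands to the pattern below: four octets, each 0-255.
$regex    = '/\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}\b/';
$contents = "crawler ranges: 192.0.2.10 and 198.51.100.0/24, but not 256.1.2.3";

preg_match_all($regex, $contents, $matches);

// $matches[0] === ['192.0.2.10', '198.51.100.0'] - the CIDR prefix length is not captured,
// and "256.1.2.3" is skipped because 256 is not a valid octet.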