Passed
Push — dbal ( 396fa2...d36e46 )
by Greg
11:43
created

BadBotBlocker::fetchIpRangesForUrl()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 15
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 10
nc 1
nop 2
dl 0
loc 15
rs 9.9332
c 1
b 0
f 0
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2022 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Http\Middleware;
21
22
use Fig\Http\Message\StatusCodeInterface;
23
use Fisharebest\Webtrees\Registry;
24
use Fisharebest\Webtrees\Validator;
25
use Iodev\Whois\Loaders\CurlLoader;
26
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
27
use Iodev\Whois\Whois;
28
use IPLib\Address\AddressInterface;
29
use IPLib\Factory as IPFactory;
30
use IPLib\Range\RangeInterface;
31
use Psr\Http\Message\ResponseInterface;
32
use Psr\Http\Message\ServerRequestInterface;
33
use Psr\Http\Server\MiddlewareInterface;
34
use Psr\Http\Server\RequestHandlerInterface;
35
use Throwable;
36
37
use function array_filter;
38
use function array_map;
39
use function assert;
40
use function gethostbyaddr;
41
use function gethostbyname;
42
use function preg_match_all;
43
use function random_int;
44
use function response;
45
use function str_contains;
46
use function str_ends_with;
47
48
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad robot, or when it
 * claims to be a well-known robot but the client IP address cannot be verified
 * as belonging to that robot's operator (by DNS, fixed IP list or ASN).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Each domain is matched as a suffix of the reverse-DNS host name, so it must
     * start with a dot - otherwise "evilmail.ru" would be accepted for "mail.ru".
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * A robot may use more than one autonomous system; an address in ANY of them is valid.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, fake robots and blocked networks.
     * Everything else is passed on to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are always rejected.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified with a reverse lookup followed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can only be verified with a reverse lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from a published list of addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot)) {
                // Use a distinct name here, so that we do not overwrite the client's $ip.
                foreach ($valid_ips as $valid_ip) {
                    $range = IPFactory::parseRangeString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Genuine robot: move on to the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that operate from within one or more autonomous systems.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            foreach ($asns as $asn) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Found in one of the robot's networks: move on to the next robot.
                        continue 3;
                    }
                }
            }

            // Only reject once ALL of the robot's autonomous systems have been checked.
            return $this->response();
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip            The client's IP address
     * @param array<string> $valid_domains Suffixes that the reverse-DNS host name may end with
     * @param bool          $reverse_only  If true, skip the confirming forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // Returns false for malformed input, and the unmodified IP address when the
        // lookup fails.  In the latter case, none of the domain suffixes will match.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only resolves IPv4 addresses, so the forward
                // check cannot succeed for IPv6 clients - confirm whether this is intended.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a per-ASN key, with a randomised lifetime
     * between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 prefix (route) or an IPv6 prefix (route6).
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // NOTE(review): the empty list is returned through remember(), so a transient
                // whois failure is presumably cached for the full TTL - confirm against the cache.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every rejected request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
297