Passed
Push — master ( 45fc26...5c20d9 )
by Greg
05:19
created

BadBotBlocker::checkRobotDNS()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 9
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
eloc 4
c 1
b 0
f 0
nc 3
nop 3
dl 0
loc 9
rs 10
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2019 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Http\Middleware;
21
22
use Fig\Http\Message\StatusCodeInterface;
23
use Fisharebest\Webtrees\Cache;
24
use Illuminate\Support\Str;
25
use Iodev\Whois\Loaders\CurlLoader;
26
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
27
use Iodev\Whois\Whois;
28
use IPLib\Address\AddressInterface;
29
use IPLib\Factory;
30
use IPLib\Range\RangeInterface;
31
use Psr\Http\Message\ResponseInterface;
32
use Psr\Http\Message\ServerRequestInterface;
33
use Psr\Http\Server\MiddlewareInterface;
34
use Psr\Http\Server\RequestHandlerInterface;
35
use Throwable;
36
37
use function app;
38
use function array_map;
39
use function assert;
40
use function gethostbyaddr;
41
use function gethostbyname;
42
use function in_array;
43
use function response;
44
45
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected (HTTP 406) when its user-agent names a known bad robot,
 * or when it claims to be a well-known crawler but the client IP address fails
 * that crawler's verification check (DNS, fixed IP list, or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc
    private const BAD_ROBOTS = [
        'admantx',
        'AhrefsBot',
        'AspiegelBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'proximic',
        'SemrushBot',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baidu' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * These ASNs belong to server farms.
     *
     * Only referenced by the disabled check at the end of process().
     */
    private const BLOCK_ASN = [
        'hetzner'   => 'AS24920',
        'hostdime'  => 'AS33182',
        'linode'    => 'AS63949',
        'ovh'       => 'AS16276',
        'rackspace' => 'AS15395',
    ];

    /**
     * Reject requests from bad robots and from clients impersonating known crawlers.
     * All other requests are passed on to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = Factory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        if (Str::contains($ua, self::BAD_ROBOTS)) {
            return $this->response();
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (Str::contains($ua, $robot)) {
                // Use a dedicated loop variable, so that the client's address in $ip is not overwritten.
                foreach ($valid_ips as $valid_ip) {
                    $range = Factory::rangeFromString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified for this robot - move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (Str::contains($ua, $robot)) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Verified for this robot - move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // This is potentially controversial, and whois lookups may be slow.
        //foreach (self::BLOCK_ASN as $host => $asn) {
        //    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
        //        if ($range->contains($address)) {
        //            return $this->response();
        //        }
        //    }
        //}

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a hostname in one of the valid domains.  Unless only
     * the reverse lookup is required, the hostname must also resolve back to the same
     * address.  Both IPv4 (A) and IPv6 (AAAA) records are considered, and a match on
     * any one of the host's addresses is sufficient.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        // On failure, gethostbyaddr() gives either false or the unmodified IP address.
        // Neither will match a valid domain.
        if ($host === false || !Str::endsWith($host, $valid_domains)) {
            return false;
        }

        if ($reverse_only) {
            return true;
        }

        // Compare addresses in packed form, so that different spellings of the
        // same IPv6 address are treated as equal.
        $packed_ip = inet_pton($ip);

        if ($packed_ip === false) {
            return false;
        }

        // A packed IPv6 address is 16 bytes; a packed IPv4 address is 4 bytes.
        $is_ipv6 = strlen($packed_ip) === 16;

        foreach ($this->forwardLookup($host, $is_ipv6) as $candidate) {
            if (inet_pton($candidate) === $packed_ip) {
                return true;
            }
        }

        return false;
    }

    /**
     * Find the IP addresses of a host, using a forward DNS lookup.
     *
     * gethostbyname() only gives a single IPv4 address, so it cannot confirm
     * IPv6 clients or hosts with several addresses.
     *
     * @param string $host
     * @param bool   $ipv6 - Fetch AAAA records instead of A records
     *
     * @return array<string>
     */
    private function forwardLookup(string $host, bool $ipv6): array
    {
        try {
            if ($ipv6) {
                $records = dns_get_record($host, DNS_AAAA);

                return $records === false ? [] : array_column($records, 'ipv6');
            }

            $addresses = gethostbynamel($host);

            return $addresses === false ? [] : $addresses;
        } catch (Throwable $ex) {
            // A failed lookup means the address is unverified.  An error handler
            // that converts PHP warnings to exceptions must not abort the request.
            return [];
        }
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is cached under a key derived from the ASN, with a randomised
     * time-to-live between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $cache = app('cache.files');
        assert($cache instanceof Cache);

        return $cache->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    // Prefer the IPv4 route; fall back to the IPv6 route when it is empty.
                    return Factory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // NOTE(review): presumably this empty result is cached for the full TTL,
                // so one whois failure would block the robot for weeks - confirm
                // against Cache::remember().
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
281