1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* webtrees: online genealogy |
5
|
|
|
* Copyright (C) 2019 webtrees development team |
6
|
|
|
* This program is free software: you can redistribute it and/or modify |
7
|
|
|
* it under the terms of the GNU General Public License as published by |
8
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
* (at your option) any later version. |
10
|
|
|
* This program is distributed in the hope that it will be useful, |
11
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
12
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13
|
|
|
* GNU General Public License for more details. |
14
|
|
|
* You should have received a copy of the GNU General Public License |
15
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
16
|
|
|
*/ |
17
|
|
|
|
18
|
|
|
declare(strict_types=1); |
19
|
|
|
|
20
|
|
|
namespace Fisharebest\Webtrees\Http\Middleware; |
21
|
|
|
|
22
|
|
|
use Fig\Http\Message\StatusCodeInterface; |
23
|
|
|
use Fisharebest\Webtrees\Cache; |
24
|
|
|
use Illuminate\Support\Str; |
25
|
|
|
use Iodev\Whois\Loaders\CurlLoader; |
26
|
|
|
use Iodev\Whois\Modules\Asn\AsnRouteInfo; |
27
|
|
|
use Iodev\Whois\Whois; |
28
|
|
|
use IPLib\Address\AddressInterface; |
29
|
|
|
use IPLib\Factory; |
30
|
|
|
use IPLib\Range\RangeInterface; |
31
|
|
|
use Psr\Http\Message\ResponseInterface; |
32
|
|
|
use Psr\Http\Message\ServerRequestInterface; |
33
|
|
|
use Psr\Http\Server\MiddlewareInterface; |
34
|
|
|
use Psr\Http\Server\RequestHandlerInterface; |
35
|
|
|
use Throwable; |
36
|
|
|
|
37
|
|
|
use function app; |
38
|
|
|
use function array_map; |
39
|
|
|
use function assert; |
40
|
|
|
use function gethostbyaddr; |
41
|
|
|
use function gethostbyname; |
42
|
|
|
use function in_array; |
43
|
|
|
use function response; |
44
|
|
|
|
45
|
|
|
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent string:
 *  - contains the name of a known bad robot, or
 *  - claims to be a well-known crawler, but the client IP address cannot be
 *    verified using the method that the crawler's operator publishes
 *    (reverse/forward DNS, reverse DNS only, fixed IP ranges, or the IP ranges
 *    announced by an autonomous system).
 *
 * All other requests are passed to the next handler unchanged.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.
    private const BAD_ROBOTS = [
        'admantx',
        'AhrefsBot',
        'AspiegelBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'proximic',
        'SemrushBot',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baidu' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * These ASNs belong to server farms.
     */
    private const BLOCK_ASN = [
        'hetzner'   => 'AS24920',
        'hostdime'  => 'AS33182',
        'linode'    => 'AS63949',
        'ovh'       => 'AS16276',
        'rackspace' => 'AS15395',
    ];

    /**
     * Reject requests from bad robots, and from clients that falsely claim to be good ones.
     *
     * The user-agent is read from the HTTP_USER_AGENT server parameter and the
     * client address from the "client-ip" request attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = Factory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected without any further checks.
        if (Str::contains($ua, self::BAD_ROBOTS)) {
            return $this->response();
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (Str::contains($ua, $robot)) {
                // Use a dedicated variable name - do not overwrite the client's $ip.
                $ranges = array_map(static function (string $valid_ip): ?RangeInterface {
                    return Factory::rangeFromString($valid_ip);
                }, $valid_ips);

                if (!$this->isAddressInRanges($address, $ranges)) {
                    return $this->response();
                }
            }
        }

        foreach (self::ROBOT_ASN as $robot => $asn) {
            // If the whois lookup yields no ranges, then the claimed robot is rejected.
            if (Str::contains($ua, $robot) && !$this->isAddressInRanges($address, $this->fetchIpRangesForAsn($asn))) {
                return $this->response();
            }
        }

        // This is potentially controversial, and whois lookups may be slow.
        //foreach (self::BLOCK_ASN as $host => $asn) {
        //    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
        //        if ($range->contains($address)) {
        //            return $this->response();
        //        }
        //    }
        //}

        return $handler->handle($request);
    }

    /**
     * Is an address contained in any one of a list of IP ranges?
     *
     * Entries that are not ranges (e.g. null, from strings that could not be parsed) are ignored.
     *
     * @param AddressInterface           $address
     * @param array<RangeInterface|null> $ranges
     *
     * @return bool
     */
    private function isAddressInRanges(AddressInterface $address, array $ranges): bool
    {
        foreach ($ranges as $range) {
            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a hostname ending with one of the valid domains.
     * Unless $reverse_only is set, that hostname must also resolve back to the same IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false || !Str::endsWith($host, $valid_domains)) {
            return false;
        }

        if ($reverse_only) {
            return true;
        }

        return $this->hostResolvesToIp($host, $ip);
    }

    /**
     * Forward-confirm a hostname: does it resolve to the given IP address?
     *
     * gethostbyname() only ever returns IPv4 addresses, so it can never confirm an
     * IPv6 client.  For IPv6 addresses we compare against the host's AAAA records.
     *
     * @param string $host
     * @param string $ip
     *
     * @return bool
     */
    private function hostResolvesToIp(string $host, string $ip): bool
    {
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) === false) {
            // Not IPv6 - use the IPv4 lookup.
            return $ip === gethostbyname($host);
        }

        try {
            $records = dns_get_record($host, DNS_AAAA);
            // Compare packed forms, as one IPv6 address has many textual representations.
            $packed  = inet_pton($ip);

            if ($records === false || $packed === false) {
                return false;
            }

            foreach ($records as $record) {
                if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $packed) {
                    return true;
                }
            }
        } catch (Throwable $ex) {
            // A failed lookup may raise a warning, which an error handler may convert
            // to an exception.  Either way, the address could not be verified.
            return false;
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Any error during the lookup gives an empty list.
     * NOTE(review): the empty list is returned through Cache::remember(), so a failed
     * lookup is presumably cached for the full TTL - confirm against the Cache class.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $cache = app('cache.files');
        assert($cache instanceof Cache);

        // A random TTL prevents the entries for all ASNs from expiring together.
        $ttl = random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX);

        return $cache->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    // Prefer the IPv4 route; fall back to the IPv6 route.
                    return Factory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard routes that could not be parsed (null).
                return array_filter($ranges);
            } catch (Throwable $ex) {
                return [];
            }
        }, $ttl);
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
281
|
|
|
|