Passed
Push — 2.x ( 23da62...4d98bd )
by Terry
02:49
created

TrustedBot::isFakeRobot()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/*
3
 * This file is part of the Shieldon package.
4
 *
5
 * (c) Terry L. <[email protected]>
6
 *
7
 * For the full copyright and license information, please view the LICENSE
8
 * file that was distributed with this source code.
9
 */
10
11
declare(strict_types=1);
12
13
namespace Shieldon\Firewall\Component;
14
15
use Shieldon\Firewall\Component\ComponentProvider;
16
use Shieldon\Firewall\Component\AllowedTrait;
17
use Shieldon\Firewall\IpTrait;
18
19
use function Shieldon\Firewall\get_request;
20
21
use function array_column;
22
use function array_unique;
23
use function gethostbyname;
24
use function implode;
25
use function preg_match;
26
use function strstr;
27
28
/**
 * TrustedBot component.
 *
 * Recognises well-known crawlers (search engines, social-media fetchers,
 * validators) so they can be let through. A request is trusted when its
 * user-agent contains one of the configured fragments AND the reverse-DNS
 * hostname of the client ends with one of the configured domain suffixes.
 * A request that claims a trusted user-agent but fails the hostname checks
 * is remembered as a "fake robot".
 *
 * Note: `$this->allowedList`, `$this->deniedList`, `$this->rdns`, `$this->ip`
 * and `$this->strictMode` are not declared in this class; they are presumably
 * provided by ComponentProvider / IpTrait / AllowedTrait.
 */
class TrustedBot extends ComponentProvider
{
    use IpTrait;
    use AllowedTrait;

    const STATUS_CODE = 85;

    /**
     * Robot's user-agent text, read from the current request in the constructor.
     *
     * @var string
     */
    private $userAgent = '';

    /**
     * Of course this option is always true.
     * But it can be false to ignore the check when executing the unit tests.
     *
     * NOTE(review): nothing in this class reads this property (static analysis
     * reports it as unused). Confirm that no trait or test relies on it before
     * removing it.
     *
     * @var bool
     */
    private $checkFakeRdns = true;

    /**
     * Is the current access a fake robot?
     * Only meaningful after `isAllowed()` has been called.
     *
     * @var bool
     */
    private $isFake = false;

    /**
     * Constructor.
     *
     * Captures the current request's user-agent and seeds the allowed list
     * with the built-in trusted crawlers. Each entry holds a user-agent
     * fragment (`userAgent`) and a hostname suffix (`rdns`).
     */
    public function __construct()
    {
        $this->userAgent = get_request()->getHeaderLine('user-agent');

        $this->allowedList = [

            // Search engine: Google.
            'google_1' => [
                'userAgent' => 'google',
                'rdns'      => '.googlebot.com',
            ],

            'google_2' => [
                'userAgent' => 'google',
                'rdns'      => '.google.com',
            ],

            // Search engine: Microsoft.
            'bing_1' => [
                'userAgent' => 'live',
                'rdns'      => '.live.com',
            ],

            'bing_2' => [
                'userAgent' => 'msn',
                'rdns'      => '.msn.com',
            ],

            'bing_3' => [
                'userAgent' => 'bing',
                'rdns'      => '.bing.com',
            ],

            // Search engine: Yahoo.
            'yahoo_1' => [
                'userAgent' => 'inktomisearch',
                'rdns'      => '.inktomisearch.com',
            ],

            'yahoo_2' => [
                'userAgent' => 'yahoo',
                'rdns'      => '.yahoo.com',
            ],

            'yahoo_3' => [
                'userAgent' => 'yahoo',
                'rdns'      => '.yahoo.net',
            ],

            // Search engine: Yandex.
            'yandex_1' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.com',
            ],

            'yandex_2' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.net',
            ],

            'yandex_3' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.ru',
            ],

            // Facebook crawlers.
            'facebook' => [
                'userAgent' => 'facebook',
                'rdns'      => '.fbsv.net',
            ],

            // Twitter crawlers.
            'twitter' => [
                'userAgent' => 'Twitterbot',
                'rdns'      => '.twttr.com', // (not twitter.com)
            ],

            // W3C validation services.
            'w3' => [
                'userAgent' => 'w3.org',
                'rdns'      => '.w3.org',
            ],

            // Ask.com crawlers.
            'ask' => [
                'userAgent' => 'ask',
                'rdns'      => '.ask.com',
            ],
        ];

        $this->deniedList = [];
    }

    /**
     * Check the user-agent string and RDNS against the trusted list.
     *
     * Steps:
     *  1. If the user-agent contains none of the trusted fragments, the
     *     request is simply "not a trusted bot" (not flagged as fake).
     *  2. Otherwise the RDNS hostname must END with one of the trusted
     *     suffixes; "abc.google.com.fakedomain.com" therefore fails.
     *  3. In strict mode the hostname must also resolve back to the
     *     client IP (forward-confirmed reverse DNS).
     * Failing step 2 or 3 marks the request as a fake robot.
     *
     * User-agent fragments and RDNS suffixes are deliberately NOT paired per
     * entry: some crawlers announce one brand and resolve under another.
     *
     * @return bool True when the request comes from a trusted bot.
     */
    public function isAllowed(): bool
    {
        // Start from a clean state so isFakeRobot() always reflects this call.
        $this->isFake = false;

        if (!$this->hasTrustedUserAgent()) {
            // The user-agent does not claim to be one of our trusted bots.
            // Ignore it.
            return false;
        }

        if (!$this->rdnsEndsWith(array_column($this->allowedList, 'rdns'))) {
            // Claims to be a trusted bot but the hostname says otherwise.
            $this->isFake = true;
            return false;
        }

        // The DNS lookup is only performed when strict mode actually needs it.
        // gethostbyname() resolves IPv4 only and returns the hostname itself
        // on failure, so both cases end up treated as a mismatch.
        if ($this->strictMode && gethostbyname($this->rdns) !== $this->ip) {
            // If the IP differs from the hostname's resolved IP, it might be a fake bot.
            $this->isFake = true;
            return false;
        }

        return true;
    }

    /**
     * Is the client's RDNS hostname under google.com or googlebot.com?
     *
     * @return bool
     */
    public function isGoogle(): bool
    {
        return $this->rdnsEndsWith(['.google.com', '.googlebot.com']);
    }

    /**
     * Is the client's RDNS hostname under yahoo.com or yahoo.net?
     *
     * @return bool
     */
    public function isYahoo(): bool
    {
        return $this->rdnsEndsWith(['.yahoo.com', '.yahoo.net']);
    }

    /**
     * Is the client's RDNS hostname under msn.com, bing.com or live.com?
     *
     * @return bool
     */
    public function isBing(): bool
    {
        return $this->rdnsEndsWith(['.msn.com', '.bing.com', '.live.com']);
    }

    /**
     * Not used in TrustedBots component.
     *
     * @return bool always false.
     */
    public function isDenied(): bool
    {
        return false;
    }

    /**
     * Check if the current access is a fake robot.
     * To get a real value from this method, it must be called after `isAllowed`.
     *
     * @return bool
     */
    public function isFakeRobot(): bool
    {
        return $this->isFake;
    }

    /**
     * Unique deny status code.
     *
     * @return int
     */
    public function getDenyStatusCode(): int
    {
        return self::STATUS_CODE;
    }

    /**
     * Add a new item to the allowed list.
     *
     * Delegates to `setAllowedItem()`, which is not defined in this class
     * (presumably supplied by AllowedTrait).
     *
     * @param string $name      The key for this information.
     * @param string $useragent A piece of user-agent string that can identify the bot.
     * @param string $rdns      The RDNS suffix of the bot, e.g. ".example.com".
     *
     * @return void
     */
    public function addTrustedBot(string $name, string $useragent, string $rdns)
    {
        $this->setAllowedItem([
            'userAgent' => $useragent,
            'rdns' => $rdns,
        ], $name);
    }

    /**
     * Does the request's user-agent contain any trusted fragment?
     *
     * Fragments are matched literally and case-insensitively. They are
     * escaped with preg_quote() so that characters such as "." or "/" in a
     * fragment cannot act as regex syntax. An empty list never matches.
     *
     * @return bool
     */
    private function hasTrustedUserAgent(): bool
    {
        $escaped = [];

        foreach (array_unique(array_column($this->allowedList, 'userAgent')) as $fragment) {
            if (!is_string($fragment) || $fragment === '') {
                continue;
            }

            $escaped[] = preg_quote($fragment, '/');
        }

        if ($escaped === []) {
            // Without this guard the pattern would be "/()/i", which matches everything.
            return false;
        }

        return preg_match('/(?:' . implode('|', $escaped) . ')/i', $this->userAgent) === 1;
    }

    /**
     * Does the RDNS hostname end with one of the given suffixes?
     *
     * The comparison is case-insensitive (DNS names are) and anchored to the
     * end of the hostname, so "x.googlebot.com.fake" is rejected while
     * "crawl-1.GoogleBot.com" is accepted.
     *
     * @param array $suffixes Domain suffixes including the leading dot.
     *
     * @return bool
     */
    private function rdnsEndsWith(array $suffixes): bool
    {
        $host = strtolower((string) $this->rdns);

        foreach ($suffixes as $suffix) {
            if (!is_string($suffix) || $suffix === '') {
                continue;
            }

            $suffix = strtolower($suffix);

            // substr() with a negative start yields the tail of the hostname.
            if (substr($host, -strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;
    }
}