Completed
Pull Request — master (#42)
by Mark
02:08
created

CrawlerDetect::getIgnored()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 0
1
<?php
2
3
namespace Jaybizzle\CrawlerDetect;
4
5
/**
 * Detects bots, crawlers and spiders from the User-Agent of a request.
 *
 * The user agent is either given explicitly or assembled from the HTTP
 * headers listed in $uaHttpHeaders. Before matching, well-known browser and
 * platform tokens ($ignore) are stripped from it, and the remainder is tested
 * against the combined crawler regex ($crawlers).
 */
class CrawlerDetect
{
    /**
     * The user agent currently being inspected, or null when none is known.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Headers that contain the user agent (only keys starting with HTTP_).
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches of the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

    /**
     * Array of regular expressions to match against the user agent
     *
     * @var array
     */
    protected static $crawlers = array(
        '008\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'bibnum\.bnf',
        'biglotron',
        'binlar',
        'boitho\.com-dc',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Digg',
        'DomainAppender',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ElectricMonk',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'getprismatic\.com',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'Genieo',
        'Go-http-client',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'heritrix',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ichiro',
        'igdeSpyder',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'MegaIndex\.ru',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'netresearchserver',
        'NewsGator',
        'newsme',
        'NG-Search',
        '^NING\\/',
        'Notifixious',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'oegp',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'Ploetz \+ Zeller',
        'page2rss',
        'panscient',
        'Peew',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'Ruby',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SiteBar',
        'siteexplorer\.info',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'speedy',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Traackr.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'UnwindFetchor',
        'updated',
        'urlresolver',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'Websquash\.com',
        'webcollage',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'Zao',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * The headers are stored first because setUserAgent() falls back to them
     * when no explicit user agent is given.
     *
     * @param array|null  $headers   Headers to inspect; see setHttpHeaders() for the fallback.
     * @param string|null $userAgent Explicit user agent; see setUserAgent() for the fallback.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers
     *
     * Replaces any previously stored headers. Only entries whose key starts
     * with "HTTP_" are kept.
     *
     * Note: when $httpHeaders is not a non-empty array this method reads the
     * $_SERVER super-global. Pass the headers explicitly to avoid depending on
     * global state (e.g. in tests).
     *
     * @param array|null $httpHeaders
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // clear existing headers
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers
     *
     * @return array Names of the headers that may carry a User-Agent string.
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent
     *
     * A non-empty $userAgent is stored as is. Otherwise the values of all
     * known User-Agent headers present in the stored HTTP headers are joined
     * with single spaces; if none is present the user agent becomes null.
     *
     * @param  string|null $userAgent
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $this->userAgent .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

    /**
     * Build the user agent regex
     *
     * @return string All crawler patterns joined into one alternation group (no delimiters).
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex
     *
     * @return string All ignore patterns joined into one alternation group (no delimiters).
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex
     *
     * The matches of a previous call are discarded, so getMatches() always
     * reflects the most recent check.
     *
     * @param  string|null $userAgent User agent to test; the stored one is used when null.
     * @return boolean     True when the user agent matches a crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        // Forget the result of any earlier check so a miss never reports a stale hit.
        $this->matches = array();

        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // No user agent available at all: nothing can match, and passing null
        // to the preg_* functions is deprecated as of PHP 8.1.
        if (is_null($agent)) {
            return false;
        }

        $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);

        // preg_replace() returns null when the regex engine reports an error.
        if (is_null($agent)) {
            return false;
        }

        $matches = array();
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        if (!empty($matches)) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the matches
     *
     * @return string|null The text matched by the most recent isCrawler() call,
     *                     or null when that call did not match (or none was made).
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}
421