Completed
Push — master ( d89a54...db6c5d )
by Mark
02:18
created

CrawlerDetect::getCrawlers()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 0
1
<?php
2
3
namespace Jaybizzle\CrawlerDetect;
4
5
class CrawlerDetect
{
    /**
     * The user agent string under inspection, or null when none is known.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Headers that contain a user agent (only keys starting with "HTTP_").
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches produced by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Each entry is a regex fragment; getIgnored() joins them into one alternation.
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        '[ ]Intel',
        'Mac OS X [\d_]*',
        '(like )?Gecko(.[\d\.]*)?',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        ' \.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
        'BlackBerry',
        'Build',
        'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
        'Opera',
        ' \.NET[\d\.]*',
        '\(|\)|;|,', // Remove the following characters ( ) ; ,
    );

    /**
     * Array of regular expressions to match against the user agent.
     *
     * Each entry is a regex fragment; getRegex() joins them into one alternation
     * that is applied case-insensitively with "/" as the delimiter, so a literal
     * slash inside an entry must be escaped.
     *
     * @var array
     */
    protected static $crawlers = array(
        '.*Java.*outbrain',
        '008\\/',
        '^NING\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'Backlink-Ceck\.de',
        'BazQux',
        'bibnum\.bnf',
        'biglotron',
        'BingPreview',
        'binlar',
        'Bloglovin',
        'Blogtrottr',
        'boitho\.com-dc',
        'Browsershots',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CapsuleChecker',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'CommaFeed',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'Curious George',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Daum(oa)?[ \\/][0-9]',
        'Digg',
        'DomainAppender',
        'Dragonfly File Reader',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ECCP',
        'ElectricMonk',
        'EMail Exractor',
        'EmailWolf',
        'Embed PHP Library',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ExactSearch',
        'ExaleadCloudview',
        'eZ Publish Link Validator',
        'ezooms',
        'facebookexternalhit',
        'facebookplatform',
        'Feed Wrangler',
        'Feedbin',
        'FeedBurner',
        'Feedfetcher-Google',
        'Feedly',
        'Feedspot',
        'FeedValidator',
        'Fever',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'g00g1e\.net',
        'Genieo',
        'getprismatic\.com',
        'GigablastOpenSource',
        'Go-http-client',
        'Google favicon',
        'Google Keyword Suggestion',
        'Google Page Speed Insights',
        'Google-HTTP-Java-Client',
        'google_partner_monitoring',
        'GoogleProducer',
        'grub-client',
        'heritrix',
        'Holmes',
        'htdig',
        'HTTPMon',
        'httpunit',
        'httrack',
        'HubPages.*crawlingpolicy',
        'HubSpot Marketing Grader',
        'ichiro',
        'IDG Twitter Links Resolver',
        'igdeSpyder',
        'InAGist',
        'infegy',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'Jigsaw',
        'Jobrapido',
        'kouio',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkCheck',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'lycos',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MegaIndex\.ru',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'Netcraft Web Server Survey',
        'NetLyzer FastProbe',
        'netresearch',
        'Netvibes',
        'NewsBlur .*(Fetcher|Finder)',
        'NewsGator',
        'newsme',
        'NG-Search',
        'nineconnections\.com',
        'nominet\.org\.uk',
        'Notifixious',
        'nuhk',
        'nutch',
        'Nymesis',
        'oegp',
        'Omea Reader',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'page2rss',
        'PagePeeker',
        'panscient',
        'Peew',
        'phpcrawl',
        'phpservermon',
        'Pingdom\.com',
        'Pinterest',
        'Pizilla',
        'Ploetz \+ Zeller',
        'Plukkie',
        'PocketParser',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'Pulsepoint XT3 web scraper',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'ROI Hunter',
        'Ruby',
        'SBIder',
        'scooter',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Server Density Service Monitoring.*',
        'servernfo\.com',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SilverReader',
        'SimplePie',
        'Site24x7',
        'SiteBar',
        'siteexplorer\.info',
        'Siteimprove\.com',
        'SkypeUriPreview',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'SortSite',
        'speedy',
        'Spinn3r',
        'Springshare Link Checker',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Tiny Tiny RSS',
        'Traackr\.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'Typhoeus',
        'UdmSearch',
        'UnwindFetchor',
        'updated',
        'URLChecker',
        'urlresolver',
        'Vagabondo',
        'Validator\.nu\\/LV',
        'via ggpht\.com GoogleImageProxy',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C-mobileOK',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C_Unicorn',
        'W3C_Validator',
        'web-capture\.net',
        'webcollage',
        'WebIndex',
        'webmon ',
        'websitepulse[+ ]checker',
        'Websquash\.com',
        'WebThumbnail',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'www\.monitor\.us',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Ad monitoring',
        'Yahoo Link Preview',
        'YahooSeeker',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'YottaaMonitor',
        'Zao',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * Headers are stored first because setUserAgent() derives the user agent
     * from them when no explicit user agent is given.
     *
     * @param array|null  $headers   Header map in $_SERVER format ("HTTP_*" keys).
     * @param string|null $userAgent Explicit user agent; overrides the headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers.
     *
     * When $httpHeaders is not an array, or is an empty array, the $_SERVER
     * super-global is used instead. Pass the headers explicitly to keep the
     * class independent of global state (for example in tests).
     *
     * @param array|null $httpHeaders Header map in $_SERVER format.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // clear existing headers
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent.
     *
     * A non-empty $userAgent is stored as given. Otherwise the user agent is
     * assembled from every non-empty header named by getUaHttpHeaders(),
     * joined with single spaces, or null when none of them is present.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent = null)
    {
        if (!empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (!empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $this->userAgent .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

    /**
     * Return the array of crawler regexs.
     *
     * @return array
     */
    public function getCrawlers()
    {
        return self::$crawlers;
    }

    /**
     * Build the user agent regex.
     *
     * @return string The crawler patterns as one parenthesised alternation (no delimiters).
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex.
     *
     * @return string The ignore patterns as one parenthesised alternation (no delimiters).
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex.
     *
     * Known browser/platform tokens are stripped first; whatever is left is
     * matched case-insensitively against the crawler patterns. The stored
     * matches are replaced on every call, so getMatches() always describes
     * the most recent check.
     *
     * @param string|null $userAgent User agent to test; null uses the stored one.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // Cast on both sides: the agent may be null, and preg_replace returns
        // null on a PCRE error. Neither should reach trim() as null.
        $agent = trim((string) preg_replace('/'.$this->getIgnored().'/i', '', (string) $agent));

        // Forget the previous call's result so a miss never reports a stale match.
        $this->matches = array();

        // Nothing left after stripping: this is an ordinary browser or an empty agent.
        if ($agent === '') {
            return false;
        }

        $matches = array();
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        if (!empty($matches)) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the last isCrawler() call, or
     *                     null when that call found no crawler (or none was made).
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}
509