Completed
Pull Request — master (#42)
by Mark
06:10
created

CrawlerDetect::setUserAgent()   B

Complexity

Conditions 5
Paths 7

Size

Total Lines 15
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 1 Features 0
Metric Value
c 3
b 1
f 0
dl 0
loc 15
rs 8.8571
cc 5
eloc 9
nc 7
nop 1
1
<?php
2
3
namespace Jaybizzle\CrawlerDetect;
4
5
class CrawlerDetect
6
{
7
    protected $userAgent = null;
8
9
    protected $httpHeaders = array();
10
11
    protected $matches = array();
12
13
    /**
14
     * List of strings to remove from the user agent before running the crawler regex
15
     * Over a large list of user agents, this gives us about a 55% speed increase!
16
     * 
17
     * @var array
18
     */
19
    protected static $ignore = array(
20
        'Safari.[\d\.]*',
21
        'Firefox.[\d\.]*',
22
        'Chrome.[\d\.]*',
23
        'Chromium.[\d\.]*',
24
        'MSIE.[\d\.]',
25
        'Opera\/[\d\.]*',
26
        'Mozilla.[\d\.]*',
27
        'AppleWebKit.[\d\.]*',
28
        'Trident.[\d\.]*',
29
        'Windows NT.[\d\.]*',
30
        'Android.[\d\.]*',
31
        'Macintosh.',
32
        'Ubuntu',
33
        'Linux',
34
        'Intel',
35
        'Mac OS X',
36
        'Gecko.[\d\.]*',
37
        'KHTML',
38
        'CriOS.[\d\.]*',
39
        'CPU iPhone OS ([0-9_])* like Mac OS X',
40
        'CPU OS ([0-9_])* like Mac OS X',
41
        'iPod',
42
        'like Gecko',
43
        'compatible',
44
        'x86_..',
45
        'i686',
46
        'x64',
47
        'X11',
48
        'rv:[\d\.]*',
49
        'Version.[\d\.]*',
50
        'WOW64',
51
        'Win64',
52
        'Dalvik.[\d\.]*',
53
        '\.NET CLR [\d\.]*',
54
        'Presto.[\d\.]*',
55
        'Media Center PC',
56
    );
57
58
    protected static $crawlers = array(
59
        '007ac9 Crawler',
60
        '008\\/',
61
        'A6-Indexer',
62
        'Aboundex',
63
        'Accoona-AI-Agent',
64
        'acoon',
65
        'AddThis',
66
        'ADmantX',
67
        'AHC',
68
        'Airmail',
69
        'Anemone',
70
        'Arachmo',
71
        'archive-com',
72
        'B-l-i-t-z-B-O-T',
73
        'bibnum\.bnf',
74
        'biglotron',
75
        'binlar',
76
        'boitho\.com-dc',
77
        'BUbiNG',
78
        'Butterfly\\/',
79
        'BuzzSumo',
80
        'CC Metadata Scaper',
81
        'Cerberian Drtrs',
82
        'changedetection',
83
        'Charlotte',
84
        'clips\.ua\.ac\.be',
85
        'CloudFlare-AlwaysOnline',
86
        'coccoc',
87
        'Commons-HttpClient',
88
        'convera',
89
        'cosmos',
90
        'Covario-IDS',
91
        'crawler4j',
92
        'curl',
93
        'CyberPatrol',
94
        'DataparkSearch',
95
        'dataprovider',
96
        'Digg',
97
        'DomainAppender',
98
        'drupact',
99
        'EARTHCOM',
100
        'ec2linkfinder',
101
        'ElectricMonk',
102
        'Embedly',
103
        'europarchive\.org',
104
        'EventMachine HttpClient',
105
        'ezooms',
106
        'eZ Publish Link Validator',
107
        'facebookexternalhit',
108
        'Feedfetcher-Google',
109
        'FeedValidator',
110
        'FindLinks',
111
        'findlink',
112
        'findthatfile',
113
        'Flamingo_SearchEngine',
114
        'fluffy',
115
        'getprismatic\.com',
116
        'g00g1e\.net',
117
        'GigablastOpenSource',
118
        'grub-client',
119
        'Genieo',
120
        'Go-http-client',
121
        'Googlebot-Image',
122
        'Googlebot-Mobile',
123
        'Google-HTTP-Java-Client',
124
        'Google favicon',
125
        'heritrix',
126
        'Holmes',
127
        'htdig',
128
        'httpunit',
129
        'httrack',
130
        'ia_archiver',
131
        'ichiro',
132
        'igdeSpyder',
133
        'InAGist',
134
        'InfoWizards Reciprocal Link System PRO',
135
        'integromedb',
136
        'IODC',
137
        'IOI',
138
        'ips-agent',
139
        'iZSearch',
140
        'L\.webis',
141
        'Larbin',
142
        'libwww',
143
        'Link Valet',
144
        'linkdex',
145
        'LinkExaminer',
146
        'LinkWalker',
147
        'Lipperhey Link Explorer',
148
        'Lipperhey SEO Service',
149
        'LongURL API',
150
        'ltx71',
151
        'lwp-trivial',
152
        'MegaIndex\.ru',
153
        'mabontland',
154
        'MagpieRSS',
155
        'Mediapartners-Google',
156
        'MetaURI',
157
        'Mnogosearch',
158
        'mogimogi',
159
        'Morning Paper',
160
        'Mrcgiguy',
161
        'MVAClient',
162
        'netresearchserver',
163
        'NewsGator',
164
        'newsme',
165
        'NG-Search',
166
        '^NING\\/',
167
        'Notifixious',
168
        'nutch',
169
        'NutchCVS',
170
        'Nymesis',
171
        'oegp',
172
        'online link validator',
173
        'Online Website Link Checker',
174
        'Orbiter',
175
        'ow\.ly',
176
        'Ploetz \+ Zeller',
177
        'page2rss',
178
        'panscient',
179
        'Peew',
180
        'phpcrawl',
181
        'Pizilla',
182
        'Plukkie',
183
        'Pompos',
184
        'postano',
185
        'PostPost',
186
        'postrank',
187
        'proximic',
188
        'PycURL',
189
        'Python-httplib2',
190
        'python-requests',
191
        'Python-urllib',
192
        'Qseero',
193
        'Qwantify',
194
        'Radian6',
195
        'RebelMouse',
196
        'REL Link Checker',
197
        'RetrevoPageAnalyzer',
198
        'Riddler',
199
        'Robosourcer',
200
        'Ruby',
201
        'SandCrawler',
202
        'SBIder',
203
        'ScoutJet',
204
        'ScoutURLMonitor',
205
        'Scrapy',
206
        'Scrubby',
207
        'SearchSight',
208
        'semanticdiscovery',
209
        'SEOstats',
210
        'Seznam screenshot-generator', 
211
        'ShopWiki',
212
        'SiteBar',
213
        'siteexplorer\.info',
214
        'slider\.com',
215
        'slurp',
216
        'Snappy',
217
        'sogou',
218
        'speedy',
219
        'Sqworm',
220
        'StackRambler',
221
        'Stratagems Kumo',
222
        'summify',
223
        'teoma',
224
        'theoldreader\.com',
225
        'TinEye',
226
        'Traackr.com',
227
        'truwoGPS',
228
        'tweetedtimes\.com',
229
        'Twikle',
230
        'UnwindFetchor',
231
        'updated',
232
        'urlresolver',
233
        'Validator\.nu\\/LV',
234
        'Vagabondo',
235
        'Vivante Link Checker',
236
        'Vortex',
237
        'voyager\\/',
238
        'VYU2',
239
        'W3C-checklink',
240
        'W3C_CSS_Validator_JFouffa',
241
        'W3C_I18n-Checker',
242
        'W3C-mobileOK',
243
        'W3C_Unicorn',
244
        'W3C_Validator',
245
        'WebIndex',
246
        'Websquash\.com',
247
        'webcollage',
248
        'webmon ',
249
        'WeSEE:Search',
250
        'wf84',
251
        'wget',
252
        'WomlpeFactory',
253
        'wotbox',
254
        'Xenu Link Sleuth',
255
        'XML Sitemaps Generator',
256
        'Y!J-ASR',
257
        'yacy',
258
        'yacybot',
259
        'Yahoo Link Preview',
260
        'Yahoo! Slurp China',
261
        'Yahoo! Slurp',
262
        'YahooSeeker',
263
        'YahooSeeker-Testing',
264
        'YandexImages',
265
        'YandexMetrika',
266
        'yandex',
267
        'yanga',
268
        'yeti',
269
        'yoogliFetchAgent',
270
        'Zao',
271
        'ZyBorg',
272
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
273
    );
274
275
    /**
276
     * All possible HTTP headers that represent the
277
     * User-Agent string.
278
     *
279
     * @var array
280
     */
281
    protected static $uaHttpHeaders = array(
282
        // The default User-Agent string.
283
        'HTTP_USER_AGENT',
284
        // Header can occur on devices using Opera Mini.
285
        'HTTP_X_OPERAMINI_PHONE_UA',
286
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
0 ignored issues
show
Unused Code Comprehensibility introduced by
38% of this comment could be valid code. Did you maybe forget this after debugging?

Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.

The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.

This check looks for comments that seem to be mostly valid code and reports them.

Loading history...
287
        'HTTP_X_DEVICE_USER_AGENT',
288
        'HTTP_X_ORIGINAL_USER_AGENT',
289
        'HTTP_X_SKYFIRE_PHONE',
290
        'HTTP_X_BOLT_PHONE_UA',
291
        'HTTP_DEVICE_STOCK_UA',
292
        'HTTP_X_UCBROWSER_DEVICE_UA',
293
    );
294
295
    /**
     * Class constructor.
     *
     * Stores the HTTP headers first and then resolves the user agent, because
     * setUserAgent() reads the stored headers when no explicit user agent is
     * supplied.
     *
     * @param array|null  $headers   Headers handed to setHttpHeaders().
     * @param string|null $userAgent User agent handed to setUserAgent().
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Order matters: the user agent may be derived from the headers.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }
303
304
    public function setHttpHeaders($httpHeaders = null)
0 ignored issues
show
Coding Style introduced by
setHttpHeaders uses the super-global variable $_SERVER which is generally not recommended.

Instead of super-globals, we recommend to explicitly inject the dependencies of your class. This makes your code less dependent on global state and it becomes generally more testable:

// Bad
class Router
{
    public function generate($path)
    {
        return $_SERVER['HOST'].$path;
    }
}

// Better
class Router
{
    private $host;

    public function __construct($host)
    {
        $this->host = $host;
    }

    public function generate($path)
    {
        return $this->host.$path;
    }
}

class Controller
{
    public function myAction(Request $request)
    {
        // Instead of
        $page = isset($_GET['page']) ? intval($_GET['page']) : 1;

        // Better (assuming you use the Symfony2 request)
        $page = $request->query->get('page', 1);
    }
}
Loading history...
305
    {
306
        // use global _SERVER if $httpHeaders aren't defined
307
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
308
            $httpHeaders = $_SERVER;
309
        }
310
        // clear existing headers
311
        $this->httpHeaders = array();
312
        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
313
        // start with HTTP_.
314
        foreach ($httpHeaders as $key => $value) {
315
            if (substr($key, 0, 5) === 'HTTP_') {
316
                $this->httpHeaders[$key] = $value;
317
            }
318
        }
319
    }
320
321
    /**
     * Return the names of the HTTP headers that may carry a user agent string.
     *
     * @return array The static $uaHttpHeaders list, unchanged.
     */
    public function getUaHttpHeaders()
    {
        $headerNames = self::$uaHttpHeaders;

        return $headerNames;
    }
325
326
    /**
     * Set the user agent to be inspected.
     *
     * A non-empty $userAgent is stored exactly as given. Otherwise the value
     * is assembled from every non-empty stored header named by
     * getUaHttpHeaders(), each separated by a space and the result trimmed.
     * When no such header is present the user agent becomes null.
     *
     * @param string|null $userAgent Explicit user agent, or an empty value to
     *                               derive it from the stored HTTP headers.
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent = null)
    {
        // An explicit value always wins over anything found in the headers.
        if (!empty($userAgent)) {
            $this->userAgent = $userAgent;

            return $this->userAgent;
        }

        $combined = '';
        foreach ($this->getUaHttpHeaders() as $headerName) {
            // @todo: should use getHttpHeader(), but it would be slow. (Serban)
            if (empty($this->httpHeaders[$headerName])) {
                continue;
            }

            $combined .= $this->httpHeaders[$headerName].' ';
        }

        // Nothing collected means there is no user agent at all.
        $this->userAgent = ($combined === '') ? null : trim($combined);

        return $this->userAgent;
    }
341
342
    /**
     * Build the alternation group that matches any known crawler.
     *
     * @return string Every entry of the static $crawlers list joined with "|"
     *                and wrapped in parentheses; no delimiters or flags.
     */
    public function getRegex()
    {
        $alternatives = implode('|', self::$crawlers);

        return '('.$alternatives.')';
    }
346
347
    /**
     * Build the alternation group of fragments stripped from the user agent
     * before the crawler regex runs.
     *
     * @return string Every entry of the static $ignore list joined with "|"
     *                and wrapped in parentheses; no delimiters or flags.
     */
    public function getIgnored()
    {
        $alternatives = implode('|', self::$ignore);

        return '('.$alternatives.')';
    }
351
352
    public function isCrawler($userAgent = null)
353
    {
354
        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;
355
356
        $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
357
358
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
359
360
        if ($matches) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $matches of type string[] is implicitly converted to a boolean; are you sure this is intended? If so, consider using ! empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
361
            $this->matches = $matches;
362
        }
363
364
        return (bool) $result;
365
    }
366
367
    /**
     * Return the text matched by the crawler regex, as recorded by isCrawler().
     *
     * $matches starts out as an empty array, so index 0 does not exist until a
     * match has been recorded. The lookup is guarded so that case returns null
     * without raising an undefined offset/key notice or warning.
     *
     * @return string|null The full regex match, or null when none is recorded.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
371
}
372