Completed
Pull Request — master (#49)
by
unknown
05:07 queued 13s
created

CrawlerDetect::getUaHttpHeaders()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
c 2
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 0
1
<?php
2
3
namespace Jaybizzle\CrawlerDetect;
4
5
class CrawlerDetect
6
{
7
    /**
     * The user agent string currently being checked.
     *
     * Populated by setUserAgent(); null until a user agent has been set or
     * derived from the HTTP headers.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * HTTP headers that may contain a user agent string.
     *
     * Only $_SERVER-style keys beginning with 'HTTP_' are kept here
     * (see setHttpHeaders()).
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches captured by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of regex fragments to strip from the user agent before running
     * the crawler regex. These match common browser/OS tokens that never
     * identify a crawler.
     *
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

73
    /**
     * Array of regular expressions to match against the user agent.
     *
     * Each entry is a regex fragment; getRegex() joins them with '|' into a
     * single alternation, matched case-insensitively by isCrawler().
     *
     * @var array
     */
    protected static $crawlers = array(
        '008\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'bibnum\.bnf',
        'biglotron',
        'binlar',
        // Fixed: a missing trailing comma here fused the next two strings and
        // caused "Syntax error, unexpected T_CONSTANT_ENCAPSED_STRING".
        'BingPreview',
        'boitho\.com-dc',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Digg',
        'DomainAppender',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ElectricMonk',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'getprismatic\.com',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'Genieo',
        'Go-http-client',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'Google Keyword Suggestion',
        'heritrix',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ichiro',
        'igdeSpyder',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'MegaIndex\.ru',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'netresearchserver',
        'NewsGator',
        'newsme',
        'NG-Search',
        '^NING\\/',
        'Notifixious',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'oegp',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'Ploetz \+ Zeller',
        'page2rss',
        'panscient',
        'Peew',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'Ruby',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SiteBar',
        'siteexplorer\.info',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'speedy',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Traackr.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'UnwindFetchor',
        'updated',
        'urlresolver',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'Websquash\.com',
        'webcollage',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'Zao',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

290
    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

310
    /**
     * Class constructor.
     *
     * @param array|null  $headers   HTTP headers to inspect; when null or empty,
     *                               setHttpHeaders() falls back to $_SERVER.
     * @param string|null $userAgent User agent to check; when empty, it is built
     *                               from the user-agent-bearing HTTP headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

319
    /**
     * Set HTTP headers.
     *
     * When no non-empty array is supplied, the $_SERVER superglobal is used
     * instead. Only entries whose key starts with 'HTTP_' (i.e. actual HTTP
     * headers in PHP land) are retained.
     *
     * @param array|null $httpHeaders
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Rebuild the header map from scratch so stale entries never linger.
        $this->httpHeaders = array();

        // Keep only real HTTP headers: _SERVER vars prefixed with HTTP_.
        foreach ($httpHeaders as $name => $headerValue) {
            if (substr($name, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$name] = $headerValue;
            }
        }
    }

341
    /**
     * Return the HTTP header names that may carry a user agent string.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        $headerNames = self::$uaHttpHeaders;

        return $headerNames;
    }

351
    /**
     * Set the user agent.
     *
     * When a non-empty $userAgent is given it is stored as-is; otherwise the
     * user agent is rebuilt by concatenating every non-empty user-agent-bearing
     * HTTP header currently held by this instance.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored (null when none found).
     */
    public function setUserAgent($userAgent = null)
    {
        // An explicitly supplied, non-empty agent wins outright.
        if (!empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;
        $combined = '';

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (!empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $combined .= $this->httpHeaders[$altHeader].' ';
            }
        }

        $this->userAgent = $combined;

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

372
    /**
     * Build the user agent regex: all crawler fragments joined into one
     * parenthesised alternation.
     *
     * @return string
     */
    public function getRegex()
    {
        return sprintf('(%s)', implode('|', self::$crawlers));
    }

382
    /**
     * Build the replacement regex: all ignorable browser/OS fragments joined
     * into one parenthesised alternation.
     *
     * @return string
     */
    public function getIgnored()
    {
        return sprintf('(%s)', implode('|', self::$ignore));
    }

392
    /**
     * Check a user agent string against the crawler regex.
     *
     * Uses the stored user agent unless one is passed explicitly. Any captured
     * matches are kept for later retrieval via getMatches().
     *
     * @param string|null $userAgent
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        $subject = ($userAgent === null) ? $this->userAgent : $userAgent;

        // Strip known browser/OS tokens first; this speeds up the crawler regex.
        $subject = preg_replace('/'.$this->getIgnored().'/i', '', $subject);

        $isMatch = preg_match('/'.$this->getRegex().'/i', $subject, $matches);

        if ($matches) {
            $this->matches = $matches;
        }

        return (bool) $isMatch;
    }

414
    /**
     * Return the full text matched by the crawler regex.
     *
     * @return string|null The matched portion of the user agent, or null when
     *                     no match has been stored by isCrawler() yet.
     */
    public function getMatches()
    {
        // Guard: $matches starts as an empty array, so reading [0] blindly
        // would raise an undefined-index notice before any match occurs.
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }

}
424