1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Jaybizzle\CrawlerDetect; |
4
|
|
|
|
5
|
|
|
/**
 * Detects bots, crawlers and spiders by inspecting the User-Agent string.
 *
 * The user agent is either given explicitly or assembled from the HTTP
 * headers listed in $uaHttpHeaders. Before matching, common browser and
 * platform tokens ($ignore) are stripped so the crawler regex ($crawlers)
 * runs against a much shorter string.
 */
class CrawlerDetect
{
    /**
     * The user agent.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Headers that contain a user agent.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Store regex matches of the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Each entry is a regex fragment; the fragments are joined with "|" by
     * getIgnored() and applied case-insensitively.
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        '[ ]Intel',
        'Mac OS X [\d_]*',
        '(like )?Gecko(.[\d\.]*)?',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        ' \.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
        'BlackBerry',
        'Build',
        'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
        'Opera',
        ' \.NET[\d\.]*',
        '\(|\)|;|,', // Remove the following characters ( ) ; ,
    );

    /**
     * Array of regular expressions to match against the user agent.
     *
     * Each entry is a regex fragment; the fragments are joined with "|" by
     * getRegex() and applied case-insensitively with "/" as the delimiter,
     * which is why literal slashes are escaped.
     *
     * @var array
     */
    protected static $crawlers = array(
        '.*Java.*outbrain',
        '008\\/',
        '^NING\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'Backlink-Ceck\.de',
        'BazQux',
        'bibnum\.bnf',
        'biglotron',
        'BingPreview',
        'binlar',
        'Bloglovin',
        'Blogtrottr',
        'boitho\.com-dc',
        'Browsershots',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CapsuleChecker',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'CommaFeed',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'Curious George',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Daum(oa)?[ \\/][0-9]',
        'Digg',
        'DomainAppender',
        'Dragonfly File Reader',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ECCP',
        'ElectricMonk',
        'EMail Exractor',
        'EmailWolf',
        'Embed PHP Library',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ExactSearch',
        'ExaleadCloudview',
        'eZ Publish Link Validator',
        'ezooms',
        'facebookexternalhit',
        'facebookplatform',
        'Feed Wrangler',
        'Feedbin',
        'FeedBurner',
        'Feedfetcher-Google',
        'Feedly',
        'Feedspot',
        'FeedValidator',
        'Fever',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'g00g1e\.net',
        'Genieo',
        'getprismatic\.com',
        'GigablastOpenSource',
        'Go-http-client',
        'Google favicon',
        'Google Keyword Suggestion',
        'Google Page Speed Insights',
        'Google-HTTP-Java-Client',
        'google_partner_monitoring',
        'GoogleProducer',
        'grub-client',
        'heritrix',
        'Holmes',
        'htdig',
        'HTTPMon',
        'httpunit',
        'httrack',
        'HubPages.*crawlingpolicy',
        'HubSpot Marketing Grader',
        'ichiro',
        'IDG Twitter Links Resolver',
        'igdeSpyder',
        'InAGist',
        'infegy',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'Jigsaw',
        'Jobrapido',
        'kouio',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkCheck',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'lycos',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MegaIndex\.ru',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'Netcraft Web Server Survey',
        'NetLyzer FastProbe',
        'netresearch',
        'Netvibes',
        'NewsBlur .*(Fetcher|Finder)',
        'NewsGator',
        'newsme',
        'NG-Search',
        'nineconnections\.com',
        'nominet\.org\.uk',
        'Notifixious',
        'nuhk',
        'nutch',
        'Nymesis',
        'oegp',
        'Omea Reader',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'page2rss',
        'PagePeeker',
        'panscient',
        'Peew',
        'phpcrawl',
        'phpservermon',
        'Pingdom\.com',
        'Pinterest',
        'Pizilla',
        'Ploetz \+ Zeller',
        'Plukkie',
        'PocketParser',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'Pulsepoint XT3 web scraper',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'ROI Hunter',
        'Ruby',
        'SBIder',
        'scooter',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Server Density Service Monitoring.*',
        'servernfo\.com',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SilverReader',
        'SimplePie',
        'Site24x7',
        'SiteBar',
        'siteexplorer\.info',
        'Siteimprove\.com',
        'SkypeUriPreview',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'SortSite',
        'speedy',
        'Spinn3r',
        'Springshare Link Checker',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Tiny Tiny RSS',
        'Traackr\.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'Typhoeus',
        'UdmSearch',
        'UnwindFetchor',
        'updated',
        'URLChecker',
        'urlresolver',
        'Vagabondo',
        'Validator\.nu\\/LV',
        'via ggpht\.com GoogleImageProxy',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C-mobileOK',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C_Unicorn',
        'W3C_Validator',
        'web-capture\.net',
        'webcollage',
        'WebIndex',
        'webmon ',
        'websitepulse[+ ]checker',
        'Websquash\.com',
        'WebThumbnail',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'www\.monitor\.us',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Ad monitoring',
        'Yahoo Link Preview',
        'YahooSeeker',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'YottaaMonitor',
        'Zao',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * @param array|null  $headers   Request headers keyed like $_SERVER ("HTTP_*");
     *                               when null or empty, $_SERVER is used.
     * @param string|null $userAgent Explicit user agent; when empty it is built
     *                               from the stored headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Headers must be stored first: setUserAgent() reads them when no
        // explicit user agent is supplied.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers. Only entries whose key starts
     * with "HTTP_" are kept.
     *
     * @param array|null $httpHeaders Headers keyed like $_SERVER; when this is
     *                                not an array or is empty, $_SERVER is used.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // clear existing headers
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array Names of the headers that may carry a user agent string.
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent.
     *
     * A non-empty argument is stored as-is. Otherwise the user agent is built
     * by concatenating (space separated) every non-empty user-agent header
     * found in the stored HTTP headers.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent now in effect, or null if none was found.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $this->userAgent .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

    /**
     * Return the array of crawler regexs.
     *
     * @return array
     */
    public function getCrawlers()
    {
        return self::$crawlers;
    }

    /**
     * Build the user agent regex.
     *
     * @return string All crawler fragments joined into one alternation group
     *                (without delimiters or modifiers).
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex.
     *
     * @return string All ignore fragments joined into one alternation group
     *                (without delimiters or modifiers).
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex.
     *
     * The match state exposed through getMatches() always reflects this call:
     * it is cleared first and only filled when a crawler pattern matches.
     *
     * @param string|null $userAgent User agent to test; when null, the user
     *                               agent stored on this instance is used.
     *
     * @return bool True when the user agent matches a crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // Strip common browser/platform tokens so the crawler regex has less
        // text to scan. The casts keep null (no user agent resolved, or a
        // preg_replace() failure) from reaching the string functions.
        $agent = trim((string) preg_replace('/'.$this->getIgnored().'/i', '', (string) $agent));

        // Forget the previous call's match so getMatches() never reports a
        // stale crawler name.
        $this->matches = array();

        // Nothing left after stripping: a plain browser or an empty agent.
        if ($agent === '') {
            return false;
        }

        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        if ($result) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the last isCrawler() call, or
     *                     null when that call found no crawler (or isCrawler()
     *                     has not been called yet).
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}
509
|
|
|
|
Instead of using super-globals, we recommend explicitly injecting the dependencies of your class. This makes your code less dependent on global state and generally more testable: