1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Jaybizzle\CrawlerDetect; |
4
|
|
|
|
5
|
|
|
/**
 * Detects bots, crawlers and spiders from the User-Agent related HTTP headers.
 *
 * The user agent is first stripped of common browser/platform tokens
 * (see $ignore) and the remainder is matched, case-insensitively, against
 * one big alternation built from $crawlers.
 */
class CrawlerDetect
{
    /**
     * The user agent string under test, or null when none could be determined.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * The HTTP_* headers extracted from the supplied headers (or $_SERVER).
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Result of the last crawler regex match (preg_match $matches format).
     * Empty when the last isCrawler() call did not match.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Each entry is a regex fragment; the fragments are joined with "|" and
     * applied case-insensitively with "/" as the delimiter.
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'iPhone',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

    /**
     * Regex fragments identifying known crawlers.
     *
     * The fragments are joined with "|" and applied case-insensitively with
     * "/" as the delimiter, so literal slashes and dots must be escaped.
     * The last entry is a generic catch-all for *bot / *crawler / *spider etc.
     *
     * @var array
     */
    protected static $crawlers = array(
        '007ac9 Crawler',
        '008\\/',
        '360Spider',
        'A6-Indexer',
        'ABACHOBot',
        'AbiLogicBot',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddSugarSpiderBot',
        'AddThis',
        'Adidxbot',
        'ADmantX',
        'AdvBot',
        'AHC',
        'ahrefsbot',
        'aihitbot',
        'Airmail',
        'AISearchBot',
        'Anemone',
        'antibot',
        'AnyApexBot',
        'Applebot',
        'arabot',
        'Arachmo',
        'archive-com',
        'archive\.org_bot',
        'B-l-i-t-z-B-O-T',
        'backlinkcrawler',
        'baiduspider',
        'BecomeBot',
        'BeslistBot',
        'bibnum\.bnf',
        'biglotron',
        'BillyBobBot',
        'Bimbot',
        'bingbot',
        'binlar',
        'blekkobot',
        'blexbot',
        'BlitzBOT',
        'bl\.uk_lddc_bot',
        'bnf\.fr_bot',
        'boitho\.com-dc',
        'boitho\.com-robot',
        'brainobot',
        'btbot',
        'BUbiNG',
        'Butterfly\\/',
        'buzzbot',
        'BuzzSumo',
        'careerbot',
        'CatchBot',
        'CC Metadata Scaper',
        'ccbot',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'citeseerxbot',
        'coccoc',
        'classbot',
        'Commons-HttpClient',
        'content crawler spider',
        'Content Crawler',
        'convera',
        'ConveraCrawler',
        'CoPubbot',
        'cosmos',
        'Covario-IDS',
        'CrawlBot',
        'crawler4j',
        'CrystalSemanticsBot',
        'curl',
        'cXensebot',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'DiamondBot',
        'Digg',
        'discobot',
        'DomainAppender',
        'domaincrawler',
        'Domain Re-Animator Bot',
        'dotbot',
        'drupact',
        'DuckDuckBot',
        'EARTHCOM',
        'EasouSpider',
        'ec2linkfinder',
        'edisterbot',
        'ElectricMonk',
        'elisabot',
        'emailmarketingrobot',
        'Embedly',
        'EmeraldShield\.com WebBot',
        'envolk\[ITS\]spider',
        'EsperanzaBot',
        'europarchive\.org',
        'EventMachine HttpClient',
        'exabot',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Facebot',
        'FAST Enteprise Crawler',
        'FAST Enterprise Crawler',
        'FAST-WebCrawler',
        'FDSE robot',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'findxbot',
        'Flamingo_SearchEngine',
        'fluffy',
        'fr-crawler',
        'FRCrawler',
        'FurlBot',
        'FyberSpider',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'g2crawler',
        'Gaisbot',
        'GalaxyBot',
        'genieBot',
        'Genieo',
        'GermCrawler',
        'getprismatic\.com',
        'gigabot',
        'GingerCrawler',
        'Girafabot',
        'Gluten Free Crawler',
        'gnam gnam spider',
        'Go-http-client',
        'Googlebot-Image',
        'Googlebot-Mobile',
        'Googlebot',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'GrapeshotCrawler',
        'gslfbot',
        'GurujiBot',
        'HappyFunBot',
        'Healthbot',
        'heritrix',
        'hl_ftien_spider',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ia_archiver',
        'iaskspider',
        'iCCrawler',
        'ichiro',
        'igdeSpyder',
        'iisbot',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'Insitesbot',
        'integromedb',
        'intelium_bot',
        'InterfaxScanBot',
        'IODC',
        'IOI',
        'ip-web-crawler\.com',
        'ips-agent',
        'IRLbot',
        'IssueCrawler',
        'IstellaBot',
        'it2media-domain-crawler',
        'iZSearch',
        'Jaxified Bot',
        'JOC Web Spider',
        'jyxobot',
        'KoepaBot',
        'L\.webis',
        'LapozzBot',
        'Larbin',
        'lb-spider',
        'LDSpider',
        'LexxeBot',
        'libwww',
        'Linguee Bot',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinksManager\.com_bot',
        'LinkpadBot',
        'LinksCrawler',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'Livelapbot',
        'LongURL API',
        'lmspider',
        'lssbot',
        'lssrocketcrawler',
        'ltx71',
        'lufsbot',
        'lwp-trivial',
        'Mail\.RU_Bot',
        'MegaIndex\.ru',
        'mabontland',
        'magpie-crawler',
        'MagpieRSS',
        'Mediapartners-Google',
        'memorybot',
        'MetaURI',
        'MJ12bot',
        'mlbot',
        'Mnogosearch',
        'mogimogi',
        'MojeekBot',
        'Moreoverbot',
        'Morning Paper',
        'Mrcgiguy',
        'MSIECrawler',
        'msnbot',
        'msrbot',
        'MVAClient',
        'mxbot',
        'NerdByNature\.Bot',
        'NerdyBot',
        'netEstate NE Crawler',
        'netresearchserver',
        'NetSeer Crawler',
        'NewsGator',
        'newsme',
        'NextGenSearchBot',
        'NG-Search',
        'ngbot',
        'nicebot',
        'niki-bot',
        '^NING\\/',
        'Notifixious',
        'noxtrumbot',
        'Nusearch Spider',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'obot',
        'oegp',
        'ocrawler',
        'omgilibot',
        'OmniExplorer_Bot',
        'online link validator',
        'Online Website Link Checker',
        'OOZBOT',
        'openindexspider',
        'OpenWebSpider',
        'OrangeBot',
        'Orbiter',
        'ow\.ly',
        'PaperLiBot',
        'Pingdom\.com_bot',
        'Ploetz \+ Zeller',
        'page2rss',
        'PageBitesHyperBot',
        'panscient',
        'Peew',
        'PercolateCrawler',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'polybot',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'psbot',
        'purebot',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'QuerySeekerSpider',
        'Qwantify',
        'Radian6',
        'RAMPyBot',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'rogerbot',
        'Ruby',
        'RufusBot',
        'SandCrawler',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'ScreenerBot',
        'scribdbot',
        'Scrubby',
        'SearchmetricsBot',
        'SearchSight',
        'seekbot',
        'semanticdiscovery',
        'SemrushBot',
        'Sensis Web Crawler',
        'SEOChat::Bot',
        'seokicks-robot',
        'SEOstats',
        'Seznam screenshot-generator',
        'seznambot',
        'Shim-Crawler',
        'ShopWiki',
        'Shoula robot',
        'ShowyouBot',
        'SimpleCrawler',
        'sistrix crawler',
        'SiteBar',
        'sitebot',
        'siteexplorer\.info',
        'SklikBot',
        'slider\.com',
        'slurp',
        'smtbot',
        'Snappy',
        'sogou spider',
        'sogou',
        'Sosospider',
        'spbot',
        'Speedy Spider',
        'speedy',
        'SpiderMan',
        'Sqworm',
        'SSL-Crawler',
        'StackRambler',
        'Stratagems Kumo',
        'suggybot',
        'summify',
        'SurdotlyBot',
        'SurveyBot',
        'SynooBot',
        'tagoobot',
        'teoma',
        'TerrawizBot',
        'theoldreader\.com',
        'TheSuBot',
        'Thumbnail\.CZ robot',
        'TinEye',
        'toplistbot',
        'Traackr\.com',
        'trendictionbot',
        'TrueBot',
        'truwoGPS',
        'turnitinbot',
        'TweetedTimes Bot',
        'tweetedtimes\.com',
        'TweetmemeBot',
        'twengabot',
        'Twikle',
        'Twitterbot',
        'uMBot',
        'UnisterBot',
        'UnwindFetchor',
        'updated',
        'urlappendbot',
        'Urlfilebot',
        'urlresolver',
        'UsineNouvelleCrawler',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'voilabot',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'web-archive-net\.com\.bot',
        'Websquash\.com',
        'WeSEE:Ads\\/PageBot',
        'wbsearchbot',
        'webcollage',
        'webcompanycrawler',
        'webcrawler',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'wocbot',
        'WoFindeIch Robot',
        'WomlpeFactory',
        'woriobot',
        'wotbox',
        'Xaldon_WebSpider',
        'Xenu Link Sleuth',
        'xintellibot',
        'XML Sitemaps Generator',
        'XoviBot',
        'Y!J-ASR',
        'yacy',
        'yacybot',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexBot',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'Yasaklibot',
        'yeti',
        'YioopBot',
        'YisouSpider',
        'YodaoBot',
        'yoogliFetchAgent',
        'yoozBot',
        'YoudaoBot',
        'Zao',
        'Zealbot',
        'zspider',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * Headers must be set before the user agent, because setUserAgent()
     * falls back to the stored headers when no explicit agent is given.
     *
     * @param array|null  $headers   HTTP headers in $_SERVER format; $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; derived from the headers when empty.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Store the HTTP headers to inspect.
     *
     * Only entries whose key starts with "HTTP_" are kept. Any previously
     * stored headers are discarded.
     *
     * @param array|null $httpHeaders Headers in $_SERVER format; $_SERVER is used when not a non-empty array.
     *
     * @return void
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // clear existing headers
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Names of the headers that may carry a user agent string.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent to test.
     *
     * When $userAgent is empty, the values of all known user-agent headers
     * present in the stored HTTP headers are concatenated (space separated).
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent now in effect, or null when none was found.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
                $this->userAgent .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

    /**
     * Build the crawler alternation, without delimiters or flags.
     *
     * @return string
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the alternation of tokens stripped from the user agent
     * before matching, without delimiters or flags.
     *
     * @return string
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check whether a user agent belongs to a known crawler.
     *
     * The stored match (see getMatches()) is refreshed on every call, so a
     * non-matching call clears the result of an earlier matching one.
     *
     * @param string|null $userAgent Agent to test; the agent set on the instance is used when null.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // Cast so a missing (null) agent is handled as an empty string instead
        // of being passed as null to the PCRE functions.
        $agent = (string) (is_null($userAgent) ? $this->userAgent : $userAgent);

        // preg_replace() returns null on a PCRE error; keep the subject a string.
        $agent = (string) preg_replace('/'.$this->getIgnored().'/i', '', $agent);

        $found = array();
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $found);

        // Always overwrite, so stale matches from a previous call never leak through.
        $this->matches = $result ? $found : array();

        return (bool) $result;
    }

    /**
     * The text matched by the last isCrawler() call.
     *
     * @return string|null The matched crawler token, or null when there is no match.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}
583
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.