@@ -4,528 +4,528 @@ |
||
4 | 4 | |
/**
 * Detects bots/crawlers/spiders by matching the request's User-Agent string
 * against a list of known crawler signatures.
 *
 * The User-Agent is either supplied explicitly or assembled from the HTTP
 * headers listed in self::$uaHttpHeaders.
 */
class CrawlerDetect
{
    /**
     * The User-Agent string to test, or null when none could be resolved.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Request headers, keyed by their $_SERVER-style name (HTTP_*).
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Result of the most recent isCrawler() check, as filled in by
     * preg_match(). Empty when the last check did not match.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * Known crawler signatures.
     *
     * Each entry is a regular-expression fragment, not a literal string:
     * getRegex() joins them with "|" and isCrawler() wraps the result in
     * "/.../i". Regex metacharacters and the "/" delimiter must therefore be
     * escaped inside an entry.
     *
     * @var array
     */
    protected static $crawlers = array(
        '007ac9 Crawler',
        '008\\/',
        '360Spider',
        'A6-Indexer',
        'ABACHOBot',
        'AbiLogicBot',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddSugarSpiderBot',
        'AddThis',
        'Adidxbot',
        'ADmantX',
        'AdvBot',
        'AHC',
        'ahrefsbot',
        'aihitbot',
        'Airmail',
        'AISearchBot',
        'Anemone',
        'antibot',
        'AnyApexBot',
        'Applebot',
        'arabot',
        'Arachmo',
        'archive-com',
        'archive\.org_bot',
        'B-l-i-t-z-B-O-T',
        'backlinkcrawler',
        'baiduspider',
        'BecomeBot',
        'BeslistBot',
        'bibnum\.bnf',
        'biglotron',
        'BillyBobBot',
        'Bimbot',
        'bingbot',
        'binlar',
        'blekkobot',
        'blexbot',
        'BlitzBOT',
        'bl\.uk_lddc_bot',
        'bnf\.fr_bot',
        'boitho\.com-dc',
        'boitho\.com-robot',
        'brainobot',
        'btbot',
        'BUbiNG',
        'Butterfly\\/',
        'buzzbot',
        'BuzzSumo',
        'careerbot',
        'CatchBot',
        'CC Metadata Scaper',
        'ccbot',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'citeseerxbot',
        'coccoc',
        'classbot',
        'Commons-HttpClient',
        'content crawler spider',
        'Content Crawler',
        'convera',
        'ConveraCrawler',
        'CoPubbot',
        'cosmos',
        'Covario-IDS',
        'CrawlBot',
        'crawler4j',
        'CrystalSemanticsBot',
        'curl',
        'cXensebot',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'DiamondBot',
        'Digg',
        'discobot',
        'DomainAppender',
        'domaincrawler',
        'Domain Re-Animator Bot',
        'dotbot',
        'drupact',
        'DuckDuckBot',
        'EARTHCOM',
        'EasouSpider',
        'ec2linkfinder',
        'edisterbot',
        'ElectricMonk',
        'elisabot',
        'emailmarketingrobot',
        'Embedly',
        'EmeraldShield\.com WebBot',
        'envolk\[ITS\]spider',
        'EsperanzaBot',
        'europarchive\.org',
        'EventMachine HttpClient',
        'exabot',
        'ezooms',
        'facebookexternalhit',
        'Facebot',
        'FAST Enteprise Crawler',
        'FAST Enterprise Crawler',
        'FAST-WebCrawler',
        'FDSE robot',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'findxbot',
        'Flamingo_SearchEngine',
        'fluffy',
        'fr-crawler',
        'FRCrawler',
        'FurlBot',
        'FyberSpider',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'g2crawler',
        'Gaisbot',
        'GalaxyBot',
        'genieBot',
        'Genieo',
        'GermCrawler',
        'getprismatic\.com',
        'gigabot',
        'GingerCrawler',
        'Girafabot',
        'Gluten Free Crawler',
        'gnam gnam spider',
        'Go-http-client',
        'Googlebot-Image',
        'Googlebot-Mobile',
        'Googlebot',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'GrapeshotCrawler',
        'gslfbot',
        'GurujiBot',
        'HappyFunBot',
        'Healthbot',
        'heritrix',
        'hl_ftien_spider',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ia_archiver',
        'iaskspider',
        'iCCrawler',
        'ichiro',
        'igdeSpyder',
        'iisbot',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'Insitesbot',
        'integromedb',
        'intelium_bot',
        'InterfaxScanBot',
        'IODC',
        'IOI',
        'ip-web-crawler\.com',
        'ips-agent',
        'IRLbot',
        'IssueCrawler',
        'IstellaBot',
        'it2media-domain-crawler',
        'iZSearch',
        'Jaxified Bot',
        'JOC Web Spider',
        'jyxobot',
        'KoepaBot',
        'L\.webis',
        'LapozzBot',
        'Larbin',
        'lb-spider',
        'LDSpider',
        'LexxeBot',
        'libwww',
        'Linguee Bot',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinksManager\.com_bot',
        'LinkpadBot',
        'LinksCrawler',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'Livelapbot',
        'LongURL API',
        'lmspider',
        'lssbot',
        'lssrocketcrawler',
        'ltx71',
        'lufsbot',
        'lwp-trivial',
        'Mail\.RU_Bot',
        'MegaIndex\.ru',
        'mabontland',
        'magpie-crawler',
        'MagpieRSS',
        'Mediapartners-Google',
        'memorybot',
        'MetaURI',
        'MJ12bot',
        'mlbot',
        'Mnogosearch',
        'mogimogi',
        'MojeekBot',
        'Moreoverbot',
        'Morning Paper',
        'Mrcgiguy',
        'MSIECrawler',
        'msnbot',
        'msrbot',
        'MVAClient',
        'mxbot',
        'NerdByNature\.Bot',
        'NerdyBot',
        'netEstate NE Crawler',
        'netresearchserver',
        'NetSeer Crawler',
        'NewsGator',
        'newsme',
        'NextGenSearchBot',
        'NG-Search',
        'ngbot',
        'nicebot',
        'niki-bot',
        '^NING\\/',
        'Notifixious',
        'noxtrumbot',
        'Nusearch Spider',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'obot',
        'oegp',
        'ocrawler',
        'omgilibot',
        'OmniExplorer_Bot',
        'online link validator',
        'Online Website Link Checker',
        'OOZBOT',
        'openindexspider',
        'OpenWebSpider',
        'OrangeBot',
        'Orbiter',
        'ow\.ly',
        'PaperLiBot',
        'Pingdom\.com_bot',
        'Ploetz \+ Zeller',
        'page2rss',
        'PageBitesHyperBot',
        'panscient',
        'Peew',
        'PercolateCrawler',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'polybot',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'psbot',
        'purebot',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'QuerySeekerSpider',
        'Qwantify',
        'Radian6',
        'RAMPyBot',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'rogerbot',
        'Ruby',
        'RufusBot',
        'SandCrawler',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'ScreenerBot',
        'scribdbot',
        'Scrubby',
        'SearchmetricsBot',
        'SearchSight',
        'seekbot',
        'semanticdiscovery',
        'SemrushBot',
        'Sensis Web Crawler',
        'SEOChat::Bot',
        'seokicks-robot',
        'SEOstats',
        'Seznam screenshot-generator',
        'seznambot',
        'Shim-Crawler',
        'ShopWiki',
        'Shoula robot',
        'ShowyouBot',
        'SimpleCrawler',
        'sistrix crawler',
        'SiteBar',
        'sitebot',
        'siteexplorer\.info',
        'SklikBot',
        'slider\.com',
        'slurp',
        'smtbot',
        'Snappy',
        'sogou spider',
        'sogou',
        'Sosospider',
        'spbot',
        'Speedy Spider',
        'speedy',
        'SpiderMan',
        'Sqworm',
        'SSL-Crawler',
        'StackRambler',
        'suggybot',
        'summify',
        'SurdotlyBot',
        'SurveyBot',
        'SynooBot',
        'tagoobot',
        'teoma',
        'TerrawizBot',
        'TheSuBot',
        'Thumbnail\.CZ robot',
        'TinEye',
        'toplistbot',
        // The dot is escaped like every other domain entry; unescaped it
        // would match any character in that position.
        'Traackr\.com',
        'trendictionbot',
        'TrueBot',
        'truwoGPS',
        'turnitinbot',
        'TweetedTimes Bot',
        'tweetedtimes\.com',
        'TweetmemeBot',
        'twengabot',
        'Twikle',
        'Twitterbot',
        'uMBot',
        'UnisterBot',
        'UnwindFetchor',
        'updated',
        'urlappendbot',
        'Urlfilebot',
        'urlresolver',
        'UsineNouvelleCrawler',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'voilabot',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'web-archive-net\.com\.bot',
        'Websquash\.com',
        'WeSEE:Ads\\/PageBot',
        'wbsearchbot',
        'webcollage',
        'webcompanycrawler',
        'webcrawler',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'wocbot',
        'WoFindeIch Robot',
        'WomlpeFactory',
        'woriobot',
        'wotbox',
        'Xaldon_WebSpider',
        'Xenu Link Sleuth',
        'xintellibot',
        'XML Sitemaps Generator',
        'XoviBot',
        'Y!J-ASR',
        'yacy',
        'yacybot',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexBot',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'Yasaklibot',
        'yeti',
        'YioopBot',
        'YisouSpider',
        'YodaoBot',
        'yoogliFetchAgent',
        'yoozBot',
        'YoudaoBot',
        'Zao',
        'Zealbot',
        'zspider',
        'ZyBorg',
        // Generic catch-all: any token ending in bot/crawler/archiver/
        // transcoder/spider, except "bot" directly preceded by "cu".
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * @param array|null  $headers   $_SERVER-style header array; when null or
     *                               empty, $_SERVER is used instead.
     * @param string|null $userAgent Explicit User-Agent; when empty it is
     *                               derived from the headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Headers must be stored first: setUserAgent() reads them when no
        // explicit User-Agent is given.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Replace the stored HTTP headers.
     *
     * Only entries whose key starts with "HTTP_" are kept. Previously stored
     * headers are discarded.
     *
     * @param array|null $httpHeaders $_SERVER-style array; falls back to
     *                                $_SERVER when not a non-empty array.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // Use global _SERVER if $httpHeaders aren't defined.
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Clear existing headers.
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars
        // that start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return the names of all headers that may carry a User-Agent string.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the User-Agent to test.
     *
     * A non-empty argument is stored as-is. Otherwise the values of every
     * User-Agent-bearing header that is present and non-empty are joined
     * with single spaces, in the order of getUaHttpHeaders().
     *
     * @param string|null $userAgent
     *
     * @return string|null The stored User-Agent, or null if none was found.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $parts = array();
        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) {
                $parts[] = $this->httpHeaders[$altHeader];
            }
        }

        return $this->userAgent = ($parts ? trim(implode(' ', $parts)) : null);
    }

    /**
     * Build the alternation of all crawler signatures, wrapped in one
     * capturing group. The result has no delimiters or modifiers.
     *
     * @return string
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Check whether a User-Agent belongs to a known crawler.
     *
     * The match is case-insensitive. The outcome is remembered for
     * getMatches(); a non-matching check clears any earlier match so that
     * getMatches() never reports a stale result.
     *
     * @param string|null $userAgent User-Agent to test; when null the
     *                               User-Agent stored on this instance is used.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // Cast to string: the stored User-Agent is null when no header
        // supplied one, and preg_match() expects a string subject.
        $agent = (string) (is_null($userAgent) ? $this->userAgent : $userAgent);

        $matches = array();
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        // Always overwrite, so a miss resets the previous hit.
        $this->matches = is_array($matches) ? $matches : array();

        return (bool) $result;
    }

    /**
     * Return the text that matched during the most recent isCrawler() call.
     *
     * @return string|null The matched portion of the User-Agent, or null when
     *                     the last check did not match or none has run yet.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}