@@ -13,538 +13,538 @@ |
||
13 | 13 | |
/**
 * Detects crawlers (bots, spiders, feed fetchers, HTTP libraries, ...) by
 * matching the request's User-Agent string against a list of known patterns.
 */
class CrawlerDetect
{
    /**
     * The user agent.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Headers that contain a user agent.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Store regex matches of the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Each entry is a PCRE fragment; the fragments are OR-ed together by
     * getIgnored() and applied case-insensitively.
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        '[ ]Intel',
        'Mac OS X [\d_]*',
        '(like )?Gecko(.[\d\.]*)?',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        ' \.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
        'BlackBerry',
        'Build',
        'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
        'Opera',
        ' \.NET[\d\.]*',
        '\(|\)|;|,', // Remove the following characters ( ) ; ,
    );

    /**
     * Array of regular expressions to match against the user agent.
     *
     * Each entry is a PCRE fragment; the fragments are OR-ed together by
     * getRegex() and applied case-insensitively.
     *
     * @var array
     */
    protected static $crawlers = array(
        '.*Java.*outbrain',
        '008\/',
        '^NING\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Apache-HttpClient\/',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'Backlink-Ceck\.de',
        'baidu\.com',
        'BazQux',
        'bibnum\.bnf',
        'biglotron',
        'BingLocalSearch',
        'BingPreview',
        'binlar',
        'Bloglovin',
        'Blogtrottr',
        'boitho\.com-dc',
        'Browsershots',
        'BUbiNG',
        'Butterfly\/',
        'BuzzSumo',
        'CapsuleChecker',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'CommaFeed',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'corporatetwitnews',
        'Covario-IDS',
        'cron-job\.org',
        'Curious George',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Daum(oa)?[ \/][0-9]',
        'Digg',
        'DomainAppender',
        'Dragonfly File Reader',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ECCP',
        'ElectricMonk',
        'EMail Exractor',
        'EmailWolf',
        'Embed PHP Library',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ExactSearch',
        'ExaleadCloudview',
        'eZ Publish Link Validator',
        'ezooms',
        'facebookexternalhit',
        'facebookplatform',
        'Feed Wrangler',
        'Feedbin',
        'FeedBurner',
        'Feedfetcher-Google',
        'Feedly',
        'Feedspot',
        'FeedValidator',
        'Fever',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'FlipboardProxy',
        'fluffy',
        'Funnelback',
        'g00g1e\.net',
        'Genieo',
        'getprismatic\.com',
        'GigablastOpenSource',
        'Go-http-client',
        'Google favicon',
        'Google Keyword Suggestion',
        'Google Page Speed Insights',
        'Google Web Preview',
        'Google-HTTP-Java-Client',
        'Google-Site-Verification',
        'google_partner_monitoring',
        'GoogleProducer',
        'Grammarly',
        'grub-client',
        'heritrix',
        'Holmes',
        'htdig',
        'HTTPMon',
        'http-kit',
        'http_requester',
        'httpunit',
        'http_request2',
        'httrack',
        'HubPages.*crawlingpolicy',
        'HubSpot Marketing Grader',
        'ichiro',
        'IDG Twitter Links Resolver',
        'igdeSpyder',
        'InAGist',
        'infegy',
        'InfoWizards Reciprocal Link System PRO',
        'inpwrd\.com',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        '^Java\/',
        'Jigsaw',
        'Jobrapido',
        'kouio',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkCheck',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'lycos',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MegaIndex\.ru',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'Netcraft Web Server Survey',
        'NetcraftSurveyAgent',
        'NetLyzer FastProbe',
        'netresearch',
        'Netvibes',
        'NewsBlur .*(Fetcher|Finder)',
        'NewsGator',
        'newsme',
        'NG-Search',
        'nineconnections\.com',
        'nominet\.org\.uk',
        'Notifixious',
        'nuhk',
        'nutch',
        'Nuzzel',
        'Nymesis',
        'oegp',
        'Omea Reader',
        'omgili',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'Go [\d\.]* package http',
        'page2rss',
        'PagePeeker',
        'panscient',
        'Peew',
        'PhantomJS\/',
        'phpcrawl',
        'phpservermon',
        'Pingdom\.com',
        'Pinterest',
        'Pizilla',
        'Ploetz \+ Zeller',
        'Plukkie',
        'PocketParser',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'Pulsepoint XT3 web scraper',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'ROI Hunter',
        'Ruby',
        'SalesIntelligent',
        'SBIder',
        'scooter',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Server Density Service Monitoring.*',
        'servernfo\.com',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SilverReader',
        'SimplePie',
        'Site24x7',
        'SiteBar',
        'siteexplorer\.info',
        'Siteimprove\.com',
        'SkypeUriPreview',
        'slider\.com',
        'slurp',
        'SMRF URL Expander',
        'Snappy',
        'SNK Siteshooter B0t',
        'sogou',
        'SortSite',
        'speedy',
        'Spinn3r',
        'Springshare Link Checker',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Tiny Tiny RSS',
        'Traackr\.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'Typhoeus',
        'ubermetrics-technologies',
        'UdmSearch',
        'UnwindFetchor',
        'updated',
        'URLChecker',
        'urlresolver',
        'Vagabondo',
        'Validator\.nu\/LV',
        'via ggpht\.com GoogleImageProxy',
        'Vivante Link Checker',
        'vkShare',
        'Vortex',
        'voyager\/',
        'VYU2',
        'W3C-checklink',
        'W3C-mobileOK',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C_Unicorn',
        'W3C_Validator',
        'web-capture\.net',
        'WebCapture',
        'WebCorp',
        'webcollage',
        'WebIndex',
        'WebFetch',
        'webmon ',
        'websitepulse[+ ]checker',
        'Websquash\.com',
        'WebThumbnail',
        'Web Link Validator',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WhatsApp',
        'WomlpeFactory',
        'WordPress\/',
        'wotbox',
        'www\.monitor\.us',
        'XaxisSemanticsClassifier',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Ad monitoring',
        'Yahoo Link Preview',
        'YahooSeeker',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'YottaaMonitor',
        'Zao',
        'zgrab',
        'ZyBorg',
        '[a-z0-9\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * @param array|null  $headers   Header map in $_SERVER format (HTTP_* keys);
     *                               $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; when empty it is built
     *                               from the user-agent headers instead.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Headers must be stored first: setUserAgent() reads them as a fallback.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers. Only entries whose key starts
     * with "HTTP_" are kept.
     *
     * @param array|null $httpHeaders Header map in $_SERVER format; $_SERVER is
     *                                used when this is not a non-empty array.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }
        // clear existing headers
        $this->httpHeaders = array();
        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array Names (in $_SERVER format) of the headers that may carry a user agent.
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent.
     *
     * A non-empty $userAgent is stored as-is. Otherwise the values of all
     * stored user-agent headers are joined with single spaces.
     *
     * @param string|null $userAgent
     *
     * @return string|null The stored user agent, or null when none is available.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $this->userAgent = null;
        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $this->userAgent .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
    }

    /**
     * Return the array of crawler regexs.
     *
     * @return array
     */
    public function getCrawlers()
    {
        return self::$crawlers;
    }

    /**
     * Build the user agent regex.
     *
     * @return string All crawler patterns OR-ed into one group (no delimiters or flags).
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex.
     *
     * @return string All ignore patterns OR-ed into one group (no delimiters or flags).
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex.
     *
     * @param string|null $userAgent Agent to test; the stored user agent is used when null.
     *
     * @return bool True when the agent matches a crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        // Forget the previous result so getMatches() never reports a stale match.
        $this->matches = array();

        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // Strip common browser/OS tokens first. The casts cover a missing
        // user agent (null) and a null result from preg_replace on PCRE failure.
        $agent = trim((string) preg_replace('/'.$this->getIgnored().'/i', '', (string) $agent));

        // Nothing left after stripping: an ordinary browser or no agent at all.
        if ($agent === '') {
            return false;
        }

        $found = array();
        $result = preg_match('/'.$this->getRegex().'/i', $agent, $found);

        if ($found) {
            $this->matches = $found;
        }

        return (bool) $result;
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the most recent isCrawler() call,
     *                     or null when that call found no crawler (or none was made).
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}