@@ -11,7 +11,7 @@ |
||
11 | 11 | $dot = dirname(__FILE__); |
12 | 12 | |
13 | 13 | if (!file_exists($composer = dirname($dot).'/vendor/autoload.php')) { |
14 | - throw new RuntimeException("Please run 'composer install' first to set up autoloading. $composer"); |
|
14 | + throw new RuntimeException("Please run 'composer install' first to set up autoloading. $composer"); |
|
15 | 15 | } |
16 | 16 | /** @var \Composer\Autoload\ClassLoader $autoloader */ |
17 | 17 | $autoloader = include $composer; |
@@ -10,7 +10,7 @@ |
||
10 | 10 | */ |
11 | 11 | $dot = dirname(__FILE__); |
12 | 12 | |
13 | -if (!file_exists($composer = dirname($dot).'/vendor/autoload.php')) { |
|
13 | +if ( ! file_exists($composer = dirname($dot).'/vendor/autoload.php')) { |
|
14 | 14 | throw new RuntimeException("Please run 'composer install' first to set up autoloading. $composer"); |
15 | 15 | } |
16 | 16 | /** @var \Composer\Autoload\ClassLoader $autoloader */ |
@@ -13,53 +13,53 @@ |
||
13 | 13 | |
14 | 14 | class Exclusions extends AbstractProvider |
15 | 15 | { |
16 | - /** |
|
17 | - * List of strings to remove from the user agent before running the crawler regex |
|
18 | - * Over a large list of user agents, this gives us about a 55% speed increase! |
|
19 | - * |
|
20 | - * @var array |
|
21 | - */ |
|
22 | - protected $data = array( |
|
23 | - 'Safari.[\d\.]*', |
|
24 | - 'Firefox.[\d\.]*', |
|
25 | - 'Chrome.[\d\.]*', |
|
26 | - 'Chromium.[\d\.]*', |
|
27 | - 'MSIE.[\d\.]', |
|
28 | - 'Opera\/[\d\.]*', |
|
29 | - 'Mozilla.[\d\.]*', |
|
30 | - 'AppleWebKit.[\d\.]*', |
|
31 | - 'Trident.[\d\.]*', |
|
32 | - 'Windows NT.[\d\.]*', |
|
33 | - 'Android.[\d\.]*', |
|
34 | - 'Macintosh.', |
|
35 | - 'Ubuntu', |
|
36 | - 'Linux', |
|
37 | - '[ ]Intel', |
|
38 | - 'Mac OS X [\d_]*', |
|
39 | - '(like )?Gecko(.[\d\.]*)?', |
|
40 | - 'KHTML', |
|
41 | - 'CriOS.[\d\.]*', |
|
42 | - 'CPU iPhone OS ([0-9_])* like Mac OS X', |
|
43 | - 'CPU OS ([0-9_])* like Mac OS X', |
|
44 | - 'iPod', |
|
45 | - 'compatible', |
|
46 | - 'x86_..', |
|
47 | - 'i686', |
|
48 | - 'x64', |
|
49 | - 'X11', |
|
50 | - 'rv:[\d\.]*', |
|
51 | - 'Version.[\d\.]*', |
|
52 | - 'WOW64', |
|
53 | - 'Win64', |
|
54 | - 'Dalvik.[\d\.]*', |
|
55 | - ' \.NET CLR [\d\.]*', |
|
56 | - 'Presto.[\d\.]*', |
|
57 | - 'Media Center PC', |
|
58 | - 'BlackBerry', |
|
59 | - 'Build', |
|
60 | - 'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.', |
|
61 | - 'Opera', |
|
62 | - ' \.NET[\d\.]*', |
|
63 | - '\(|\)|;|,', // Remove the following characters ( ) : , |
|
64 | - ); |
|
16 | + /** |
|
17 | + * List of strings to remove from the user agent before running the crawler regex |
|
18 | + * Over a large list of user agents, this gives us about a 55% speed increase! |
|
19 | + * |
|
20 | + * @var array |
|
21 | + */ |
|
22 | + protected $data = array( |
|
23 | + 'Safari.[\d\.]*', |
|
24 | + 'Firefox.[\d\.]*', |
|
25 | + 'Chrome.[\d\.]*', |
|
26 | + 'Chromium.[\d\.]*', |
|
27 | + 'MSIE.[\d\.]', |
|
28 | + 'Opera\/[\d\.]*', |
|
29 | + 'Mozilla.[\d\.]*', |
|
30 | + 'AppleWebKit.[\d\.]*', |
|
31 | + 'Trident.[\d\.]*', |
|
32 | + 'Windows NT.[\d\.]*', |
|
33 | + 'Android.[\d\.]*', |
|
34 | + 'Macintosh.', |
|
35 | + 'Ubuntu', |
|
36 | + 'Linux', |
|
37 | + '[ ]Intel', |
|
38 | + 'Mac OS X [\d_]*', |
|
39 | + '(like )?Gecko(.[\d\.]*)?', |
|
40 | + 'KHTML', |
|
41 | + 'CriOS.[\d\.]*', |
|
42 | + 'CPU iPhone OS ([0-9_])* like Mac OS X', |
|
43 | + 'CPU OS ([0-9_])* like Mac OS X', |
|
44 | + 'iPod', |
|
45 | + 'compatible', |
|
46 | + 'x86_..', |
|
47 | + 'i686', |
|
48 | + 'x64', |
|
49 | + 'X11', |
|
50 | + 'rv:[\d\.]*', |
|
51 | + 'Version.[\d\.]*', |
|
52 | + 'WOW64', |
|
53 | + 'Win64', |
|
54 | + 'Dalvik.[\d\.]*', |
|
55 | + ' \.NET CLR [\d\.]*', |
|
56 | + 'Presto.[\d\.]*', |
|
57 | + 'Media Center PC', |
|
58 | + 'BlackBerry', |
|
59 | + 'Build', |
|
60 | + 'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.', |
|
61 | + 'Opera', |
|
62 | + ' \.NET[\d\.]*', |
|
63 | + '\(|\)|;|,', // Remove the following characters ( ) : , |
|
64 | + ); |
|
65 | 65 | } |
@@ -13,8 +13,8 @@ |
||
13 | 13 | |
14 | 14 | abstract class AbstractProvider |
15 | 15 | { |
16 | - public function getAll() |
|
17 | - { |
|
18 | - return $this->data; |
|
19 | - } |
|
16 | + public function getAll() |
|
17 | + { |
|
18 | + return $this->data; |
|
19 | + } |
|
20 | 20 | } |
@@ -13,324 +13,324 @@ |
||
13 | 13 | |
14 | 14 | class Crawlers extends AbstractProvider |
15 | 15 | { |
16 | - /** |
|
17 | - * Array of regular expressions to match against the user agent. |
|
18 | - * |
|
19 | - * @var array |
|
20 | - */ |
|
21 | - protected $data = array( |
|
22 | - '.*Java.*outbrain', |
|
23 | - '008\/', |
|
24 | - '^NING\/', |
|
25 | - 'A6-Indexer', |
|
26 | - 'Aboundex', |
|
27 | - 'Accoona-AI-Agent', |
|
28 | - 'acoon', |
|
29 | - 'AddThis', |
|
30 | - 'ADmantX', |
|
31 | - 'AHC', |
|
32 | - 'Airmail', |
|
33 | - 'alexa site audit', |
|
34 | - 'Anemone', |
|
35 | - 'Apache-HttpClient\/', |
|
36 | - 'Arachmo', |
|
37 | - 'archive-com', |
|
38 | - 'B-l-i-t-z-B-O-T', |
|
39 | - 'Backlink-Ceck\.de', |
|
40 | - 'baidu\.com', |
|
41 | - 'BazQux', |
|
42 | - 'bibnum\.bnf', |
|
43 | - 'biglotron', |
|
44 | - 'BingLocalSearch', |
|
45 | - 'BingPreview', |
|
46 | - 'binlar', |
|
47 | - 'Bloglovin', |
|
48 | - 'Blogtrottr', |
|
49 | - 'boitho\.com-dc', |
|
50 | - 'Browsershots', |
|
51 | - 'BUbiNG', |
|
52 | - 'Butterfly\/', |
|
53 | - 'BuzzSumo', |
|
54 | - 'CapsuleChecker', |
|
55 | - 'CC Metadata Scaper', |
|
56 | - 'Cerberian Drtrs', |
|
57 | - 'changedetection', |
|
58 | - 'Charlotte', |
|
59 | - 'clips\.ua\.ac\.be', |
|
60 | - 'CloudFlare-AlwaysOnline', |
|
61 | - 'coccoc', |
|
62 | - 'CommaFeed', |
|
63 | - 'Commons-HttpClient', |
|
64 | - 'convera', |
|
65 | - 'cosmos', |
|
66 | - 'corporatetwitnews', |
|
67 | - 'Covario-IDS', |
|
68 | - 'cron-job\.org', |
|
69 | - 'Curious George', |
|
70 | - 'curl', |
|
71 | - 'CyberPatrol', |
|
72 | - 'DataparkSearch', |
|
73 | - 'dataprovider', |
|
74 | - 'Daum(oa)?[ \/][0-9]', |
|
75 | - 'developers\.google\.com\/\+\/web\/snippet\/', |
|
76 | - 'Digg', |
|
77 | - 'DomainAppender', |
|
78 | - 'Dragonfly File Reader', |
|
79 | - 'drupact', |
|
80 | - 'EARTHCOM', |
|
81 | - 'ec2linkfinder', |
|
82 | - 'ECCP', |
|
83 | - 'ElectricMonk', |
|
84 | - 'EMail Exractor', |
|
85 | - 'EmailWolf', |
|
86 | - 'Embed PHP Library', |
|
87 | - 'Embedly', |
|
88 | - 'europarchive\.org', |
|
89 | - 'EventMachine HttpClient', |
|
90 | - 'ExactSearch', |
|
91 | - 'ExaleadCloudview', |
|
92 | - 'ezooms', |
|
93 | - 'facebookexternalhit', |
|
94 | - 'facebookplatform', |
|
95 | - 'Feed Wrangler', |
|
96 | - 'Feedbin', |
|
97 | - 'FeedBurner', |
|
98 | - 'Feedfetcher-Google', |
|
99 | - 'Feedly', |
|
100 | - 'Feedspot', |
|
101 | - 'FeedValidator', |
|
102 | - 'Fever', |
|
103 | - 'findlink', |
|
104 | - 'findthatfile', |
|
105 | - 'Flamingo_SearchEngine', |
|
106 | - 'FlipboardProxy', |
|
107 | - 'fluffy', |
|
108 | - 'Funnelback', |
|
109 | - 'g00g1e\.net', |
|
110 | - 'Genieo', |
|
111 | - 'getprismatic\.com', |
|
112 | - 'GigablastOpenSource', |
|
113 | - 'Go-http-client', |
|
114 | - 'Google favicon', |
|
115 | - 'Google Keyword Suggestion', |
|
116 | - 'Google Page Speed Insights', |
|
117 | - 'Google Web Preview', |
|
118 | - 'Google-HTTP-Java-Client', |
|
119 | - 'Google-Site-Verification', |
|
120 | - 'google_partner_monitoring', |
|
121 | - 'GoogleProducer', |
|
122 | - 'Grammarly', |
|
123 | - 'grub-client', |
|
124 | - 'heritrix', |
|
125 | - 'Holmes', |
|
126 | - 'htdig', |
|
127 | - 'HTTPMon', |
|
128 | - 'http-kit', |
|
129 | - 'http_requester', |
|
130 | - 'httpunit', |
|
131 | - 'http_request2', |
|
132 | - 'httrack', |
|
133 | - 'HubPages.*crawlingpolicy', |
|
134 | - 'HubSpot Marketing Grader', |
|
135 | - 'ichiro', |
|
136 | - 'IDG Twitter Links Resolver', |
|
137 | - 'igdeSpyder', |
|
138 | - 'InAGist', |
|
139 | - 'infegy', |
|
140 | - 'InfoWizards Reciprocal Link System PRO', |
|
141 | - 'inpwrd\.com', |
|
142 | - 'integromedb', |
|
143 | - 'IODC', |
|
144 | - 'IOI', |
|
145 | - 'ips-agent', |
|
146 | - 'iZSearch', |
|
147 | - '^Java\/', |
|
148 | - 'Jigsaw', |
|
149 | - 'Jobrapido', |
|
150 | - 'kouio', |
|
151 | - 'L\.webis', |
|
152 | - 'Larbin', |
|
153 | - 'libwww', |
|
154 | - 'Link Valet', |
|
155 | - 'linkCheck', |
|
156 | - 'linkdex', |
|
157 | - 'LinkExaminer', |
|
158 | - 'LinkWalker', |
|
159 | - 'Lipperhey', |
|
160 | - 'link checker', |
|
161 | - 'link validator', |
|
162 | - 'LongURL API', |
|
163 | - 'ltx71', |
|
164 | - 'lwp-trivial', |
|
165 | - 'lycos', |
|
166 | - 'mabontland', |
|
167 | - 'MagpieRSS', |
|
168 | - 'Mediapartners-Google', |
|
169 | - 'MegaIndex\.ru', |
|
170 | - 'MetaURI', |
|
171 | - 'MergeFlow-PageReader', |
|
172 | - 'Mnogosearch', |
|
173 | - 'mogimogi', |
|
174 | - 'Mojolicious (Perl)', |
|
175 | - 'Morning Paper', |
|
176 | - 'Mrcgiguy', |
|
177 | - 'MVAClient', |
|
178 | - 'Netcraft Web Server Survey', |
|
179 | - 'NetcraftSurveyAgent', |
|
180 | - 'NetLyzer FastProbe', |
|
181 | - 'netresearch', |
|
182 | - 'Netvibes', |
|
183 | - 'NewsBlur .*(Fetcher|Finder)', |
|
184 | - 'NewsGator', |
|
185 | - 'newsme', |
|
186 | - 'newspaper\/', |
|
187 | - 'NG-Search', |
|
188 | - 'nineconnections\.com', |
|
189 | - 'nominet\.org\.uk', |
|
190 | - 'Notifixious', |
|
191 | - 'nuhk', |
|
192 | - 'nutch', |
|
193 | - 'Nuzzel', |
|
194 | - 'Nymesis', |
|
195 | - 'oegp', |
|
196 | - 'Omea Reader', |
|
197 | - 'omgili', |
|
198 | - 'Orbiter', |
|
199 | - 'ow\.ly', |
|
200 | - 'Go [\d\.]* package http', |
|
201 | - 'page2rss', |
|
202 | - 'PagePeeker', |
|
203 | - 'panscient', |
|
204 | - 'Peew', |
|
205 | - 'PhantomJS\/', |
|
206 | - 'phpcrawl', |
|
207 | - 'phpservermon', |
|
208 | - 'Pingdom\.com', |
|
209 | - 'Pinterest', |
|
210 | - 'Pizilla', |
|
211 | - 'Ploetz \+ Zeller', |
|
212 | - 'Plukkie', |
|
213 | - 'PocketParser', |
|
214 | - 'Pompos', |
|
215 | - 'postano', |
|
216 | - 'PostPost', |
|
217 | - 'postrank', |
|
218 | - 'proximic', |
|
219 | - 'Pulsepoint XT3 web scraper', |
|
220 | - 'Python-httplib2', |
|
221 | - 'python-requests', |
|
222 | - 'Python-urllib', |
|
223 | - 'Qseero', |
|
224 | - 'Qwantify', |
|
225 | - 'Radian6', |
|
226 | - 'Readability', |
|
227 | - 'RebelMouse', |
|
228 | - 'RetrevoPageAnalyzer', |
|
229 | - 'Riddler', |
|
230 | - 'Robosourcer', |
|
231 | - 'ROI Hunter', |
|
232 | - 'Ruby', |
|
233 | - 'SalesIntelligent', |
|
234 | - 'SBIder', |
|
235 | - 'scooter', |
|
236 | - 'ScoutJet', |
|
237 | - 'ScoutURLMonitor', |
|
238 | - 'Scrapy', |
|
239 | - 'Scrubby', |
|
240 | - 'SearchSight', |
|
241 | - 'semanticdiscovery', |
|
242 | - 'SEOstats', |
|
243 | - 'Server Density Service Monitoring', |
|
244 | - 'servernfo\.com', |
|
245 | - 'Seznam screenshot-generator', |
|
246 | - 'ShopWiki', |
|
247 | - 'SilverReader', |
|
248 | - 'SimplePie', |
|
249 | - 'Site24x7', |
|
250 | - 'SiteBar', |
|
251 | - 'siteexplorer\.info', |
|
252 | - 'Siteimprove\.com', |
|
253 | - 'SkypeUriPreview', |
|
254 | - 'slider\.com', |
|
255 | - 'slurp', |
|
256 | - 'SMRF URL Expander', |
|
257 | - 'snapchat-proxy', |
|
258 | - 'Snappy', |
|
259 | - 'SNK Siteshooter B0t', |
|
260 | - 'sogou', |
|
261 | - 'SortSite', |
|
262 | - 'speedy', |
|
263 | - 'Spinn3r', |
|
264 | - 'Sqworm', |
|
265 | - 'StackRambler', |
|
266 | - 'Stratagems Kumo', |
|
267 | - 'summify', |
|
268 | - 'teoma', |
|
269 | - 'theoldreader\.com', |
|
270 | - 'TinEye', |
|
271 | - 'Tiny Tiny RSS', |
|
272 | - 'Traackr.com', |
|
273 | - 'truwoGPS', |
|
274 | - 'tweetedtimes\.com', |
|
275 | - 'Twikle', |
|
276 | - 'Typhoeus', |
|
277 | - 'ubermetrics-technologies', |
|
278 | - 'UdmSearch', |
|
279 | - 'UnwindFetchor', |
|
280 | - 'updated', |
|
281 | - 'URLChecker', |
|
282 | - 'urlresolver', |
|
283 | - 'Vagabondo', |
|
284 | - 'Validator\.nu\/LV', |
|
285 | - 'via ggpht\.com GoogleImageProxy', |
|
286 | - 'vkShare', |
|
287 | - 'Vortex', |
|
288 | - 'voyager\/', |
|
289 | - 'VYU2', |
|
290 | - 'W3C-checklink', |
|
291 | - 'W3C-mobileOK', |
|
292 | - 'W3C_CSS_Validator_JFouffa', |
|
293 | - 'W3C_I18n-Checker', |
|
294 | - 'W3C_Unicorn', |
|
295 | - 'W3C_Validator', |
|
296 | - 'Wappalyzer', |
|
297 | - 'WinHttpRequest', |
|
298 | - 'web-capture\.net', |
|
299 | - 'WebCapture', |
|
300 | - 'WebCorp', |
|
301 | - 'webcollage', |
|
302 | - 'WebIndex', |
|
303 | - 'WebFetch', |
|
304 | - 'webmon ', |
|
305 | - 'websitepulse[+ ]checker', |
|
306 | - 'Websquash\.com', |
|
307 | - 'WebThumbnail', |
|
308 | - 'WeSEE:Search', |
|
309 | - 'wf84', |
|
310 | - 'wget', |
|
311 | - 'WhatsApp', |
|
312 | - 'WomlpeFactory', |
|
313 | - 'WordPress\/', |
|
314 | - 'wotbox', |
|
315 | - 'wscheck', |
|
316 | - 'WWW-Mechanize', |
|
317 | - 'www\.monitor\.us', |
|
318 | - 'XaxisSemanticsClassifier', |
|
319 | - 'Xenu Link Sleuth', |
|
320 | - 'XML Sitemaps Generator', |
|
321 | - 'Y!J-ASR', |
|
322 | - 'yacy', |
|
323 | - 'Yahoo Ad monitoring', |
|
324 | - 'Yahoo Link Preview', |
|
325 | - 'YahooSeeker', |
|
326 | - 'yandex', |
|
327 | - 'yanga', |
|
328 | - 'yeti', |
|
329 | - 'yoogliFetchAgent', |
|
330 | - 'YottaaMonitor', |
|
331 | - 'Zao', |
|
332 | - 'zgrab', |
|
333 | - 'ZyBorg', |
|
334 | - '[a-z0-9\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)', |
|
335 | - ); |
|
16 | + /** |
|
17 | + * Array of regular expressions to match against the user agent. |
|
18 | + * |
|
19 | + * @var array |
|
20 | + */ |
|
21 | + protected $data = array( |
|
22 | + '.*Java.*outbrain', |
|
23 | + '008\/', |
|
24 | + '^NING\/', |
|
25 | + 'A6-Indexer', |
|
26 | + 'Aboundex', |
|
27 | + 'Accoona-AI-Agent', |
|
28 | + 'acoon', |
|
29 | + 'AddThis', |
|
30 | + 'ADmantX', |
|
31 | + 'AHC', |
|
32 | + 'Airmail', |
|
33 | + 'alexa site audit', |
|
34 | + 'Anemone', |
|
35 | + 'Apache-HttpClient\/', |
|
36 | + 'Arachmo', |
|
37 | + 'archive-com', |
|
38 | + 'B-l-i-t-z-B-O-T', |
|
39 | + 'Backlink-Ceck\.de', |
|
40 | + 'baidu\.com', |
|
41 | + 'BazQux', |
|
42 | + 'bibnum\.bnf', |
|
43 | + 'biglotron', |
|
44 | + 'BingLocalSearch', |
|
45 | + 'BingPreview', |
|
46 | + 'binlar', |
|
47 | + 'Bloglovin', |
|
48 | + 'Blogtrottr', |
|
49 | + 'boitho\.com-dc', |
|
50 | + 'Browsershots', |
|
51 | + 'BUbiNG', |
|
52 | + 'Butterfly\/', |
|
53 | + 'BuzzSumo', |
|
54 | + 'CapsuleChecker', |
|
55 | + 'CC Metadata Scaper', |
|
56 | + 'Cerberian Drtrs', |
|
57 | + 'changedetection', |
|
58 | + 'Charlotte', |
|
59 | + 'clips\.ua\.ac\.be', |
|
60 | + 'CloudFlare-AlwaysOnline', |
|
61 | + 'coccoc', |
|
62 | + 'CommaFeed', |
|
63 | + 'Commons-HttpClient', |
|
64 | + 'convera', |
|
65 | + 'cosmos', |
|
66 | + 'corporatetwitnews', |
|
67 | + 'Covario-IDS', |
|
68 | + 'cron-job\.org', |
|
69 | + 'Curious George', |
|
70 | + 'curl', |
|
71 | + 'CyberPatrol', |
|
72 | + 'DataparkSearch', |
|
73 | + 'dataprovider', |
|
74 | + 'Daum(oa)?[ \/][0-9]', |
|
75 | + 'developers\.google\.com\/\+\/web\/snippet\/', |
|
76 | + 'Digg', |
|
77 | + 'DomainAppender', |
|
78 | + 'Dragonfly File Reader', |
|
79 | + 'drupact', |
|
80 | + 'EARTHCOM', |
|
81 | + 'ec2linkfinder', |
|
82 | + 'ECCP', |
|
83 | + 'ElectricMonk', |
|
84 | + 'EMail Exractor', |
|
85 | + 'EmailWolf', |
|
86 | + 'Embed PHP Library', |
|
87 | + 'Embedly', |
|
88 | + 'europarchive\.org', |
|
89 | + 'EventMachine HttpClient', |
|
90 | + 'ExactSearch', |
|
91 | + 'ExaleadCloudview', |
|
92 | + 'ezooms', |
|
93 | + 'facebookexternalhit', |
|
94 | + 'facebookplatform', |
|
95 | + 'Feed Wrangler', |
|
96 | + 'Feedbin', |
|
97 | + 'FeedBurner', |
|
98 | + 'Feedfetcher-Google', |
|
99 | + 'Feedly', |
|
100 | + 'Feedspot', |
|
101 | + 'FeedValidator', |
|
102 | + 'Fever', |
|
103 | + 'findlink', |
|
104 | + 'findthatfile', |
|
105 | + 'Flamingo_SearchEngine', |
|
106 | + 'FlipboardProxy', |
|
107 | + 'fluffy', |
|
108 | + 'Funnelback', |
|
109 | + 'g00g1e\.net', |
|
110 | + 'Genieo', |
|
111 | + 'getprismatic\.com', |
|
112 | + 'GigablastOpenSource', |
|
113 | + 'Go-http-client', |
|
114 | + 'Google favicon', |
|
115 | + 'Google Keyword Suggestion', |
|
116 | + 'Google Page Speed Insights', |
|
117 | + 'Google Web Preview', |
|
118 | + 'Google-HTTP-Java-Client', |
|
119 | + 'Google-Site-Verification', |
|
120 | + 'google_partner_monitoring', |
|
121 | + 'GoogleProducer', |
|
122 | + 'Grammarly', |
|
123 | + 'grub-client', |
|
124 | + 'heritrix', |
|
125 | + 'Holmes', |
|
126 | + 'htdig', |
|
127 | + 'HTTPMon', |
|
128 | + 'http-kit', |
|
129 | + 'http_requester', |
|
130 | + 'httpunit', |
|
131 | + 'http_request2', |
|
132 | + 'httrack', |
|
133 | + 'HubPages.*crawlingpolicy', |
|
134 | + 'HubSpot Marketing Grader', |
|
135 | + 'ichiro', |
|
136 | + 'IDG Twitter Links Resolver', |
|
137 | + 'igdeSpyder', |
|
138 | + 'InAGist', |
|
139 | + 'infegy', |
|
140 | + 'InfoWizards Reciprocal Link System PRO', |
|
141 | + 'inpwrd\.com', |
|
142 | + 'integromedb', |
|
143 | + 'IODC', |
|
144 | + 'IOI', |
|
145 | + 'ips-agent', |
|
146 | + 'iZSearch', |
|
147 | + '^Java\/', |
|
148 | + 'Jigsaw', |
|
149 | + 'Jobrapido', |
|
150 | + 'kouio', |
|
151 | + 'L\.webis', |
|
152 | + 'Larbin', |
|
153 | + 'libwww', |
|
154 | + 'Link Valet', |
|
155 | + 'linkCheck', |
|
156 | + 'linkdex', |
|
157 | + 'LinkExaminer', |
|
158 | + 'LinkWalker', |
|
159 | + 'Lipperhey', |
|
160 | + 'link checker', |
|
161 | + 'link validator', |
|
162 | + 'LongURL API', |
|
163 | + 'ltx71', |
|
164 | + 'lwp-trivial', |
|
165 | + 'lycos', |
|
166 | + 'mabontland', |
|
167 | + 'MagpieRSS', |
|
168 | + 'Mediapartners-Google', |
|
169 | + 'MegaIndex\.ru', |
|
170 | + 'MetaURI', |
|
171 | + 'MergeFlow-PageReader', |
|
172 | + 'Mnogosearch', |
|
173 | + 'mogimogi', |
|
174 | + 'Mojolicious (Perl)', |
|
175 | + 'Morning Paper', |
|
176 | + 'Mrcgiguy', |
|
177 | + 'MVAClient', |
|
178 | + 'Netcraft Web Server Survey', |
|
179 | + 'NetcraftSurveyAgent', |
|
180 | + 'NetLyzer FastProbe', |
|
181 | + 'netresearch', |
|
182 | + 'Netvibes', |
|
183 | + 'NewsBlur .*(Fetcher|Finder)', |
|
184 | + 'NewsGator', |
|
185 | + 'newsme', |
|
186 | + 'newspaper\/', |
|
187 | + 'NG-Search', |
|
188 | + 'nineconnections\.com', |
|
189 | + 'nominet\.org\.uk', |
|
190 | + 'Notifixious', |
|
191 | + 'nuhk', |
|
192 | + 'nutch', |
|
193 | + 'Nuzzel', |
|
194 | + 'Nymesis', |
|
195 | + 'oegp', |
|
196 | + 'Omea Reader', |
|
197 | + 'omgili', |
|
198 | + 'Orbiter', |
|
199 | + 'ow\.ly', |
|
200 | + 'Go [\d\.]* package http', |
|
201 | + 'page2rss', |
|
202 | + 'PagePeeker', |
|
203 | + 'panscient', |
|
204 | + 'Peew', |
|
205 | + 'PhantomJS\/', |
|
206 | + 'phpcrawl', |
|
207 | + 'phpservermon', |
|
208 | + 'Pingdom\.com', |
|
209 | + 'Pinterest', |
|
210 | + 'Pizilla', |
|
211 | + 'Ploetz \+ Zeller', |
|
212 | + 'Plukkie', |
|
213 | + 'PocketParser', |
|
214 | + 'Pompos', |
|
215 | + 'postano', |
|
216 | + 'PostPost', |
|
217 | + 'postrank', |
|
218 | + 'proximic', |
|
219 | + 'Pulsepoint XT3 web scraper', |
|
220 | + 'Python-httplib2', |
|
221 | + 'python-requests', |
|
222 | + 'Python-urllib', |
|
223 | + 'Qseero', |
|
224 | + 'Qwantify', |
|
225 | + 'Radian6', |
|
226 | + 'Readability', |
|
227 | + 'RebelMouse', |
|
228 | + 'RetrevoPageAnalyzer', |
|
229 | + 'Riddler', |
|
230 | + 'Robosourcer', |
|
231 | + 'ROI Hunter', |
|
232 | + 'Ruby', |
|
233 | + 'SalesIntelligent', |
|
234 | + 'SBIder', |
|
235 | + 'scooter', |
|
236 | + 'ScoutJet', |
|
237 | + 'ScoutURLMonitor', |
|
238 | + 'Scrapy', |
|
239 | + 'Scrubby', |
|
240 | + 'SearchSight', |
|
241 | + 'semanticdiscovery', |
|
242 | + 'SEOstats', |
|
243 | + 'Server Density Service Monitoring', |
|
244 | + 'servernfo\.com', |
|
245 | + 'Seznam screenshot-generator', |
|
246 | + 'ShopWiki', |
|
247 | + 'SilverReader', |
|
248 | + 'SimplePie', |
|
249 | + 'Site24x7', |
|
250 | + 'SiteBar', |
|
251 | + 'siteexplorer\.info', |
|
252 | + 'Siteimprove\.com', |
|
253 | + 'SkypeUriPreview', |
|
254 | + 'slider\.com', |
|
255 | + 'slurp', |
|
256 | + 'SMRF URL Expander', |
|
257 | + 'snapchat-proxy', |
|
258 | + 'Snappy', |
|
259 | + 'SNK Siteshooter B0t', |
|
260 | + 'sogou', |
|
261 | + 'SortSite', |
|
262 | + 'speedy', |
|
263 | + 'Spinn3r', |
|
264 | + 'Sqworm', |
|
265 | + 'StackRambler', |
|
266 | + 'Stratagems Kumo', |
|
267 | + 'summify', |
|
268 | + 'teoma', |
|
269 | + 'theoldreader\.com', |
|
270 | + 'TinEye', |
|
271 | + 'Tiny Tiny RSS', |
|
272 | + 'Traackr.com', |
|
273 | + 'truwoGPS', |
|
274 | + 'tweetedtimes\.com', |
|
275 | + 'Twikle', |
|
276 | + 'Typhoeus', |
|
277 | + 'ubermetrics-technologies', |
|
278 | + 'UdmSearch', |
|
279 | + 'UnwindFetchor', |
|
280 | + 'updated', |
|
281 | + 'URLChecker', |
|
282 | + 'urlresolver', |
|
283 | + 'Vagabondo', |
|
284 | + 'Validator\.nu\/LV', |
|
285 | + 'via ggpht\.com GoogleImageProxy', |
|
286 | + 'vkShare', |
|
287 | + 'Vortex', |
|
288 | + 'voyager\/', |
|
289 | + 'VYU2', |
|
290 | + 'W3C-checklink', |
|
291 | + 'W3C-mobileOK', |
|
292 | + 'W3C_CSS_Validator_JFouffa', |
|
293 | + 'W3C_I18n-Checker', |
|
294 | + 'W3C_Unicorn', |
|
295 | + 'W3C_Validator', |
|
296 | + 'Wappalyzer', |
|
297 | + 'WinHttpRequest', |
|
298 | + 'web-capture\.net', |
|
299 | + 'WebCapture', |
|
300 | + 'WebCorp', |
|
301 | + 'webcollage', |
|
302 | + 'WebIndex', |
|
303 | + 'WebFetch', |
|
304 | + 'webmon ', |
|
305 | + 'websitepulse[+ ]checker', |
|
306 | + 'Websquash\.com', |
|
307 | + 'WebThumbnail', |
|
308 | + 'WeSEE:Search', |
|
309 | + 'wf84', |
|
310 | + 'wget', |
|
311 | + 'WhatsApp', |
|
312 | + 'WomlpeFactory', |
|
313 | + 'WordPress\/', |
|
314 | + 'wotbox', |
|
315 | + 'wscheck', |
|
316 | + 'WWW-Mechanize', |
|
317 | + 'www\.monitor\.us', |
|
318 | + 'XaxisSemanticsClassifier', |
|
319 | + 'Xenu Link Sleuth', |
|
320 | + 'XML Sitemaps Generator', |
|
321 | + 'Y!J-ASR', |
|
322 | + 'yacy', |
|
323 | + 'Yahoo Ad monitoring', |
|
324 | + 'Yahoo Link Preview', |
|
325 | + 'YahooSeeker', |
|
326 | + 'yandex', |
|
327 | + 'yanga', |
|
328 | + 'yeti', |
|
329 | + 'yoogliFetchAgent', |
|
330 | + 'YottaaMonitor', |
|
331 | + 'Zao', |
|
332 | + 'zgrab', |
|
333 | + 'ZyBorg', |
|
334 | + '[a-z0-9\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)', |
|
335 | + ); |
|
336 | 336 | } |
@@ -14,55 +14,55 @@ |
||
14 | 14 | |
15 | 15 | class UserAgentTest extends PHPUnit_Framework_TestCase |
16 | 16 | { |
17 | - protected $CrawlerDetect; |
|
17 | + protected $CrawlerDetect; |
|
18 | 18 | |
19 | - public function setUp() |
|
20 | - { |
|
21 | - $this->CrawlerDetect = new CrawlerDetect(); |
|
22 | - } |
|
19 | + public function setUp() |
|
20 | + { |
|
21 | + $this->CrawlerDetect = new CrawlerDetect(); |
|
22 | + } |
|
23 | 23 | |
24 | - public function testBots() |
|
25 | - { |
|
26 | - $lines = file(__DIR__.'/crawlers.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
|
24 | + public function testBots() |
|
25 | + { |
|
26 | + $lines = file(__DIR__.'/crawlers.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
|
27 | 27 | |
28 | - foreach ($lines as $line) { |
|
29 | - $test = $this->CrawlerDetect->isCrawler($line); |
|
30 | - $this->assertEquals($test, true, $line); |
|
31 | - } |
|
32 | - } |
|
28 | + foreach ($lines as $line) { |
|
29 | + $test = $this->CrawlerDetect->isCrawler($line); |
|
30 | + $this->assertEquals($test, true, $line); |
|
31 | + } |
|
32 | + } |
|
33 | 33 | |
34 | - public function testDevices() |
|
35 | - { |
|
36 | - $lines = file(__DIR__.'/devices.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
|
34 | + public function testDevices() |
|
35 | + { |
|
36 | + $lines = file(__DIR__.'/devices.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
|
37 | 37 | |
38 | - foreach ($lines as $line) { |
|
39 | - $test = $this->CrawlerDetect->isCrawler($line); |
|
40 | - $this->assertEquals($test, false, $line); |
|
41 | - } |
|
42 | - } |
|
38 | + foreach ($lines as $line) { |
|
39 | + $test = $this->CrawlerDetect->isCrawler($line); |
|
40 | + $this->assertEquals($test, false, $line); |
|
41 | + } |
|
42 | + } |
|
43 | 43 | |
44 | - public function testReturnsCorrectMatchedBotName() |
|
45 | - { |
|
46 | - $test = $this->CrawlerDetect->isCrawler('Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)'); |
|
44 | + public function testReturnsCorrectMatchedBotName() |
|
45 | + { |
|
46 | + $test = $this->CrawlerDetect->isCrawler('Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)'); |
|
47 | 47 | |
48 | - $matches = $this->CrawlerDetect->getMatches(); |
|
48 | + $matches = $this->CrawlerDetect->getMatches(); |
|
49 | 49 | |
50 | - $this->assertEquals($this->CrawlerDetect->getMatches(), 'Yahoo Ad monitoring', $matches); |
|
51 | - } |
|
50 | + $this->assertEquals($this->CrawlerDetect->getMatches(), 'Yahoo Ad monitoring', $matches); |
|
51 | + } |
|
52 | 52 | |
53 | - public function testForRegexCollision() |
|
54 | - { |
|
55 | - $crawlers = new Crawlers(); |
|
53 | + public function testForRegexCollision() |
|
54 | + { |
|
55 | + $crawlers = new Crawlers(); |
|
56 | 56 | |
57 | - foreach ($crawlers->getAll() as $key1 => $regex) { |
|
58 | - foreach ($crawlers->getAll() as $key2 => $compare) { |
|
59 | - // Dont check this regex against itself |
|
60 | - if ($key1 != $key2) { |
|
61 | - preg_match('/'.$regex.'/i', stripslashes($compare), $matches); |
|
57 | + foreach ($crawlers->getAll() as $key1 => $regex) { |
|
58 | + foreach ($crawlers->getAll() as $key2 => $compare) { |
|
59 | + // Dont check this regex against itself |
|
60 | + if ($key1 != $key2) { |
|
61 | + preg_match('/'.$regex.'/i', stripslashes($compare), $matches); |
|
62 | 62 | |
63 | - $this->assertEmpty($matches, $regex.' collided with '.$compare); |
|
64 | - } |
|
65 | - } |
|
66 | - } |
|
67 | - } |
|
63 | + $this->assertEmpty($matches, $regex.' collided with '.$compare); |
|
64 | + } |
|
65 | + } |
|
66 | + } |
|
67 | + } |
|
68 | 68 | } |
@@ -16,178 +16,178 @@ |
||
16 | 16 | |
17 | 17 | class CrawlerDetect |
18 | 18 | { |
19 | - /** |
|
20 | - * The user agent. |
|
21 | - * |
|
22 | - * @var null |
|
23 | - */ |
|
24 | - protected $userAgent = null; |
|
25 | - |
|
26 | - /** |
|
27 | - * Headers that contain a user agent. |
|
28 | - * |
|
29 | - * @var array |
|
30 | - */ |
|
31 | - protected $httpHeaders = array(); |
|
32 | - |
|
33 | - /** |
|
34 | - * Store regex matches. |
|
35 | - * |
|
36 | - * @var array |
|
37 | - */ |
|
38 | - protected $matches = array(); |
|
39 | - |
|
40 | - /** |
|
41 | - * Crawlers object |
|
42 | - * |
|
43 | - * @var Jaybizzle\CrawlerDetect\Fixtures\Crawlers |
|
44 | - */ |
|
45 | - protected $crawlers; |
|
46 | - |
|
47 | - /** |
|
48 | - * Exclusions object |
|
49 | - * |
|
50 | - * @var Jaybizzle\CrawlerDetect\Fixtures\Exclusions |
|
51 | - */ |
|
52 | - protected $exclusions; |
|
53 | - |
|
54 | - /** |
|
55 | - * All possible HTTP headers that represent the |
|
56 | - * User-Agent string. |
|
57 | - * |
|
58 | - * @var array |
|
59 | - */ |
|
60 | - protected static $uaHttpHeaders = array( |
|
61 | - // The default User-Agent string. |
|
62 | - 'HTTP_USER_AGENT', |
|
63 | - // Header can occur on devices using Opera Mini. |
|
64 | - 'HTTP_X_OPERAMINI_PHONE_UA', |
|
65 | - // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/ |
|
66 | - 'HTTP_X_DEVICE_USER_AGENT', |
|
67 | - 'HTTP_X_ORIGINAL_USER_AGENT', |
|
68 | - 'HTTP_X_SKYFIRE_PHONE', |
|
69 | - 'HTTP_X_BOLT_PHONE_UA', |
|
70 | - 'HTTP_DEVICE_STOCK_UA', |
|
71 | - 'HTTP_X_UCBROWSER_DEVICE_UA', |
|
72 | - ); |
|
73 | - |
|
74 | - /** |
|
75 | - * Class constructor. |
|
76 | - */ |
|
77 | - public function __construct(array $headers = null, $userAgent = null) |
|
78 | - { |
|
79 | - $this->setHttpHeaders($headers); |
|
80 | - $this->setUserAgent($userAgent); |
|
81 | - $this->crawlers = new Crawlers(); |
|
82 | - $this->exclusions = new Exclusions(); |
|
83 | - } |
|
84 | - |
|
85 | - /** |
|
86 | - * Set HTTP headers. |
|
87 | - * |
|
88 | - * @param array $httpHeaders |
|
89 | - */ |
|
90 | - public function setHttpHeaders($httpHeaders = null) |
|
91 | - { |
|
92 | - // use global _SERVER if $httpHeaders aren't defined |
|
93 | - if (!is_array($httpHeaders) || !count($httpHeaders)) { |
|
94 | - $httpHeaders = $_SERVER; |
|
95 | - } |
|
96 | - // clear existing headers |
|
97 | - $this->httpHeaders = array(); |
|
98 | - // Only save HTTP headers. In PHP land, that means only _SERVER vars that |
|
99 | - // start with HTTP_. |
|
100 | - foreach ($httpHeaders as $key => $value) { |
|
101 | - if (substr($key, 0, 5) === 'HTTP_') { |
|
102 | - $this->httpHeaders[$key] = $value; |
|
103 | - } |
|
104 | - } |
|
105 | - } |
|
106 | - |
|
107 | - /** |
|
108 | - * Return user agent headers. |
|
109 | - * |
|
110 | - * @return array |
|
111 | - */ |
|
112 | - public function getUaHttpHeaders() |
|
113 | - { |
|
114 | - return self::$uaHttpHeaders; |
|
115 | - } |
|
116 | - |
|
117 | - /** |
|
118 | - * Set the user agent. |
|
119 | - * |
|
120 | - * @param string $userAgent |
|
121 | - */ |
|
122 | - public function setUserAgent($userAgent = null) |
|
123 | - { |
|
124 | - if (false === empty($userAgent)) { |
|
125 | - return $this->userAgent = $userAgent; |
|
126 | - } else { |
|
127 | - $this->userAgent = null; |
|
128 | - foreach ($this->getUaHttpHeaders() as $altHeader) { |
|
129 | - if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. |
|
130 | - $this->userAgent .= $this->httpHeaders[$altHeader].' '; |
|
131 | - } |
|
132 | - } |
|
133 | - |
|
134 | - return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null); |
|
135 | - } |
|
136 | - } |
|
137 | - |
|
138 | - /** |
|
139 | - * Build the user agent regex. |
|
140 | - * |
|
141 | - * @return string |
|
142 | - */ |
|
143 | - public function getRegex() |
|
144 | - { |
|
145 | - return '('.implode('|', $this->crawlers->getAll()).')'; |
|
146 | - } |
|
147 | - |
|
148 | - /** |
|
149 | - * Build the replacement regex. |
|
150 | - * |
|
151 | - * @return string |
|
152 | - */ |
|
153 | - public function getExclusions() |
|
154 | - { |
|
155 | - return '('.implode('|', $this->exclusions->getAll()).')'; |
|
156 | - } |
|
157 | - |
|
158 | - /** |
|
159 | - * Check user agent string against the regex. |
|
160 | - * |
|
161 | - * @param string $userAgent |
|
162 | - * |
|
163 | - * @return bool |
|
164 | - */ |
|
165 | - public function isCrawler($userAgent = null) |
|
166 | - { |
|
167 | - $agent = is_null($userAgent) ? $this->userAgent : $userAgent; |
|
168 | - |
|
169 | - $agent = preg_replace('/'.$this->getExclusions().'/i', '', $agent); |
|
170 | - |
|
171 | - if (trim($agent) === false) { |
|
172 | - return false; |
|
173 | - } else { |
|
174 | - $result = preg_match('/'.$this->getRegex().'/i', trim($agent), $matches); |
|
175 | - } |
|
176 | - |
|
177 | - if ($matches) { |
|
178 | - $this->matches = $matches; |
|
179 | - } |
|
180 | - |
|
181 | - return (bool) $result; |
|
182 | - } |
|
183 | - |
|
184 | - /** |
|
185 | - * Return the matches. |
|
186 | - * |
|
187 | - * @return string |
|
188 | - */ |
|
189 | - public function getMatches() |
|
190 | - { |
|
191 | - return $this->matches[0]; |
|
192 | - } |
|
19 | + /** |
|
20 | + * The user agent. |
|
21 | + * |
|
22 | + * @var null |
|
23 | + */ |
|
24 | + protected $userAgent = null; |
|
25 | + |
|
26 | + /** |
|
27 | + * Headers that contain a user agent. |
|
28 | + * |
|
29 | + * @var array |
|
30 | + */ |
|
31 | + protected $httpHeaders = array(); |
|
32 | + |
|
33 | + /** |
|
34 | + * Store regex matches. |
|
35 | + * |
|
36 | + * @var array |
|
37 | + */ |
|
38 | + protected $matches = array(); |
|
39 | + |
|
40 | + /** |
|
41 | + * Crawlers object |
|
42 | + * |
|
43 | + * @var Jaybizzle\CrawlerDetect\Fixtures\Crawlers |
|
44 | + */ |
|
45 | + protected $crawlers; |
|
46 | + |
|
47 | + /** |
|
48 | + * Exclusions object |
|
49 | + * |
|
50 | + * @var Jaybizzle\CrawlerDetect\Fixtures\Exclusions |
|
51 | + */ |
|
52 | + protected $exclusions; |
|
53 | + |
|
54 | + /** |
|
55 | + * All possible HTTP headers that represent the |
|
56 | + * User-Agent string. |
|
57 | + * |
|
58 | + * @var array |
|
59 | + */ |
|
60 | + protected static $uaHttpHeaders = array( |
|
61 | + // The default User-Agent string. |
|
62 | + 'HTTP_USER_AGENT', |
|
63 | + // Header can occur on devices using Opera Mini. |
|
64 | + 'HTTP_X_OPERAMINI_PHONE_UA', |
|
65 | + // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/ |
|
66 | + 'HTTP_X_DEVICE_USER_AGENT', |
|
67 | + 'HTTP_X_ORIGINAL_USER_AGENT', |
|
68 | + 'HTTP_X_SKYFIRE_PHONE', |
|
69 | + 'HTTP_X_BOLT_PHONE_UA', |
|
70 | + 'HTTP_DEVICE_STOCK_UA', |
|
71 | + 'HTTP_X_UCBROWSER_DEVICE_UA', |
|
72 | + ); |
|
73 | + |
|
74 | + /** |
|
75 | + * Class constructor. |
|
76 | + */ |
|
77 | + public function __construct(array $headers = null, $userAgent = null) |
|
78 | + { |
|
79 | + $this->setHttpHeaders($headers); |
|
80 | + $this->setUserAgent($userAgent); |
|
81 | + $this->crawlers = new Crawlers(); |
|
82 | + $this->exclusions = new Exclusions(); |
|
83 | + } |
|
84 | + |
|
85 | + /** |
|
86 | + * Set HTTP headers. |
|
87 | + * |
|
88 | + * @param array $httpHeaders |
|
89 | + */ |
|
90 | + public function setHttpHeaders($httpHeaders = null) |
|
91 | + { |
|
92 | + // use global _SERVER if $httpHeaders aren't defined |
|
93 | + if (!is_array($httpHeaders) || !count($httpHeaders)) { |
|
94 | + $httpHeaders = $_SERVER; |
|
95 | + } |
|
96 | + // clear existing headers |
|
97 | + $this->httpHeaders = array(); |
|
98 | + // Only save HTTP headers. In PHP land, that means only _SERVER vars that |
|
99 | + // start with HTTP_. |
|
100 | + foreach ($httpHeaders as $key => $value) { |
|
101 | + if (substr($key, 0, 5) === 'HTTP_') { |
|
102 | + $this->httpHeaders[$key] = $value; |
|
103 | + } |
|
104 | + } |
|
105 | + } |
|
106 | + |
|
107 | + /** |
|
108 | + * Return user agent headers. |
|
109 | + * |
|
110 | + * @return array |
|
111 | + */ |
|
112 | + public function getUaHttpHeaders() |
|
113 | + { |
|
114 | + return self::$uaHttpHeaders; |
|
115 | + } |
|
116 | + |
|
117 | + /** |
|
118 | + * Set the user agent. |
|
119 | + * |
|
120 | + * @param string $userAgent |
|
121 | + */ |
|
122 | + public function setUserAgent($userAgent = null) |
|
123 | + { |
|
124 | + if (false === empty($userAgent)) { |
|
125 | + return $this->userAgent = $userAgent; |
|
126 | + } else { |
|
127 | + $this->userAgent = null; |
|
128 | + foreach ($this->getUaHttpHeaders() as $altHeader) { |
|
129 | + if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. |
|
130 | + $this->userAgent .= $this->httpHeaders[$altHeader].' '; |
|
131 | + } |
|
132 | + } |
|
133 | + |
|
134 | + return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null); |
|
135 | + } |
|
136 | + } |
|
137 | + |
|
138 | + /** |
|
139 | + * Build the user agent regex. |
|
140 | + * |
|
141 | + * @return string |
|
142 | + */ |
|
143 | + public function getRegex() |
|
144 | + { |
|
145 | + return '('.implode('|', $this->crawlers->getAll()).')'; |
|
146 | + } |
|
147 | + |
|
148 | + /** |
|
149 | + * Build the replacement regex. |
|
150 | + * |
|
151 | + * @return string |
|
152 | + */ |
|
153 | + public function getExclusions() |
|
154 | + { |
|
155 | + return '('.implode('|', $this->exclusions->getAll()).')'; |
|
156 | + } |
|
157 | + |
|
158 | + /** |
|
159 | + * Check user agent string against the regex. |
|
160 | + * |
|
161 | + * @param string $userAgent |
|
162 | + * |
|
163 | + * @return bool |
|
164 | + */ |
|
165 | + public function isCrawler($userAgent = null) |
|
166 | + { |
|
167 | + $agent = is_null($userAgent) ? $this->userAgent : $userAgent; |
|
168 | + |
|
169 | + $agent = preg_replace('/'.$this->getExclusions().'/i', '', $agent); |
|
170 | + |
|
171 | + if (trim($agent) === false) { |
|
172 | + return false; |
|
173 | + } else { |
|
174 | + $result = preg_match('/'.$this->getRegex().'/i', trim($agent), $matches); |
|
175 | + } |
|
176 | + |
|
177 | + if ($matches) { |
|
178 | + $this->matches = $matches; |
|
179 | + } |
|
180 | + |
|
181 | + return (bool) $result; |
|
182 | + } |
|
183 | + |
|
184 | + /** |
|
185 | + * Return the matches. |
|
186 | + * |
|
187 | + * @return string |
|
188 | + */ |
|
189 | + public function getMatches() |
|
190 | + { |
|
191 | + return $this->matches[0]; |
|
192 | + } |
|
193 | 193 | } |