Completed
Pull Request — master (#42)
by Mark
11:06
created
src/CrawlerDetect.php 1 patch
Indentation   +340 added lines, -340 removed lines patch added patch discarded remove patch
@@ -4,362 +4,362 @@
 block discarded – undo
4 4
 
5 5
 class CrawlerDetect
6 6
 {
7
-    protected $userAgent = null;
7
+	protected $userAgent = null;
8 8
 
9
-    protected $httpHeaders = array();
9
+	protected $httpHeaders = array();
10 10
 
11
-    protected $matches = array();
11
+	protected $matches = array();
12 12
 
13
-    /**
14
-     * List of strings to remove from the user agent before running the crawler regex
15
-     * Over a large list of user agents, this gives us about a 55% speed increase!
16
-     * 
17
-     * @var array
18
-     */
19
-    protected static $ignore = array(
20
-        'Safari.[\d\.]*',
21
-        'Firefox.[\d\.]*',
22
-        'Chrome.[\d\.]*',
23
-        'Chromium.[\d\.]*',
24
-        'MSIE.[\d\.]',
25
-        'Opera\/[\d\.]*',
26
-        'Mozilla.[\d\.]*',
27
-        'AppleWebKit.[\d\.]*',
28
-        'Trident.[\d\.]*',
29
-        'Windows NT.[\d\.]*',
30
-        'Android.[\d\.]*',
31
-        'Macintosh.',
32
-        'Ubuntu',
33
-        'Linux',
34
-        'Intel',
35
-        'Mac OS X',
36
-        'Gecko.[\d\.]*',
37
-        'KHTML',
38
-        'CriOS.[\d\.]*',
39
-        'CPU iPhone OS ([0-9_])* like Mac OS X',
40
-        'CPU OS ([0-9_])* like Mac OS X',
41
-        'iPod',
42
-        'like Gecko',
43
-        'compatible',
44
-        'x86_..',
45
-        'i686',
46
-        'x64',
47
-        'X11',
48
-        'rv:[\d\.]*',
49
-        'Version.[\d\.]*',
50
-        'WOW64',
51
-        'Win64',
52
-        'Dalvik.[\d\.]*',
53
-        '\.NET CLR [\d\.]*',
54
-        'Presto.[\d\.]*',
55
-        'Media Center PC',
56
-    );
13
+	/**
14
+	 * List of strings to remove from the user agent before running the crawler regex
15
+	 * Over a large list of user agents, this gives us about a 55% speed increase!
16
+	 * 
17
+	 * @var array
18
+	 */
19
+	protected static $ignore = array(
20
+		'Safari.[\d\.]*',
21
+		'Firefox.[\d\.]*',
22
+		'Chrome.[\d\.]*',
23
+		'Chromium.[\d\.]*',
24
+		'MSIE.[\d\.]',
25
+		'Opera\/[\d\.]*',
26
+		'Mozilla.[\d\.]*',
27
+		'AppleWebKit.[\d\.]*',
28
+		'Trident.[\d\.]*',
29
+		'Windows NT.[\d\.]*',
30
+		'Android.[\d\.]*',
31
+		'Macintosh.',
32
+		'Ubuntu',
33
+		'Linux',
34
+		'Intel',
35
+		'Mac OS X',
36
+		'Gecko.[\d\.]*',
37
+		'KHTML',
38
+		'CriOS.[\d\.]*',
39
+		'CPU iPhone OS ([0-9_])* like Mac OS X',
40
+		'CPU OS ([0-9_])* like Mac OS X',
41
+		'iPod',
42
+		'like Gecko',
43
+		'compatible',
44
+		'x86_..',
45
+		'i686',
46
+		'x64',
47
+		'X11',
48
+		'rv:[\d\.]*',
49
+		'Version.[\d\.]*',
50
+		'WOW64',
51
+		'Win64',
52
+		'Dalvik.[\d\.]*',
53
+		'\.NET CLR [\d\.]*',
54
+		'Presto.[\d\.]*',
55
+		'Media Center PC',
56
+	);
57 57
 
58
-    protected static $crawlers = array(
59
-        '008\\/',
60
-        'A6-Indexer',
61
-        'Aboundex',
62
-        'Accoona-AI-Agent',
63
-        'acoon',
64
-        'AddThis',
65
-        'ADmantX',
66
-        'AHC',
67
-        'Airmail',
68
-        'Anemone',
69
-        'Arachmo',
70
-        'archive-com',
71
-        'B-l-i-t-z-B-O-T',
72
-        'bibnum\.bnf',
73
-        'biglotron',
74
-        'binlar',
75
-        'boitho\.com-dc',
76
-        'BUbiNG',
77
-        'Butterfly\\/',
78
-        'BuzzSumo',
79
-        'CC Metadata Scaper',
80
-        'Cerberian Drtrs',
81
-        'changedetection',
82
-        'Charlotte',
83
-        'clips\.ua\.ac\.be',
84
-        'CloudFlare-AlwaysOnline',
85
-        'coccoc',
86
-        'Commons-HttpClient',
87
-        'convera',
88
-        'cosmos',
89
-        'Covario-IDS',
90
-        'curl',
91
-        'CyberPatrol',
92
-        'DataparkSearch',
93
-        'dataprovider',
94
-        'Digg',
95
-        'DomainAppender',
96
-        'drupact',
97
-        'EARTHCOM',
98
-        'ec2linkfinder',
99
-        'ElectricMonk',
100
-        'Embedly',
101
-        'europarchive\.org',
102
-        'EventMachine HttpClient',
103
-        'ezooms',
104
-        'eZ Publish Link Validator',
105
-        'facebookexternalhit',
106
-        'Feedfetcher-Google',
107
-        'FeedValidator',
108
-        'FindLinks',
109
-        'findlink',
110
-        'findthatfile',
111
-        'Flamingo_SearchEngine',
112
-        'fluffy',
113
-        'getprismatic\.com',
114
-        'g00g1e\.net',
115
-        'GigablastOpenSource',
116
-        'grub-client',
117
-        'Genieo',
118
-        'Go-http-client',
119
-        'Google-HTTP-Java-Client',
120
-        'Google favicon',
121
-        'heritrix',
122
-        'Holmes',
123
-        'htdig',
124
-        'httpunit',
125
-        'httrack',
126
-        'ia_archiver',
127
-        'ichiro',
128
-        'igdeSpyder',
129
-        'InAGist',
130
-        'InfoWizards Reciprocal Link System PRO',
131
-        'integromedb',
132
-        'IODC',
133
-        'IOI',
134
-        'ips-agent',
135
-        'iZSearch',
136
-        'L\.webis',
137
-        'Larbin',
138
-        'libwww',
139
-        'Link Valet',
140
-        'linkdex',
141
-        'LinkExaminer',
142
-        'LinkWalker',
143
-        'Lipperhey Link Explorer',
144
-        'Lipperhey SEO Service',
145
-        'LongURL API',
146
-        'ltx71',
147
-        'lwp-trivial',
148
-        'MegaIndex\.ru',
149
-        'mabontland',
150
-        'MagpieRSS',
151
-        'Mediapartners-Google',
152
-        'MetaURI',
153
-        'Mnogosearch',
154
-        'mogimogi',
155
-        'Morning Paper',
156
-        'Mrcgiguy',
157
-        'MVAClient',
158
-        'netresearchserver',
159
-        'NewsGator',
160
-        'newsme',
161
-        'NG-Search',
162
-        '^NING\\/',
163
-        'Notifixious',
164
-        'nutch',
165
-        'NutchCVS',
166
-        'Nymesis',
167
-        'oegp',
168
-        'online link validator',
169
-        'Online Website Link Checker',
170
-        'Orbiter',
171
-        'ow\.ly',
172
-        'Ploetz \+ Zeller',
173
-        'page2rss',
174
-        'panscient',
175
-        'Peew',
176
-        'phpcrawl',
177
-        'Pizilla',
178
-        'Plukkie',
179
-        'Pompos',
180
-        'postano',
181
-        'PostPost',
182
-        'postrank',
183
-        'proximic',
184
-        'PycURL',
185
-        'Python-httplib2',
186
-        'python-requests',
187
-        'Python-urllib',
188
-        'Qseero',
189
-        'Qwantify',
190
-        'Radian6',
191
-        'RebelMouse',
192
-        'REL Link Checker',
193
-        'RetrevoPageAnalyzer',
194
-        'Riddler',
195
-        'Robosourcer',
196
-        'Ruby',
197
-        'SBIder',
198
-        'ScoutJet',
199
-        'ScoutURLMonitor',
200
-        'Scrapy',
201
-        'Scrubby',
202
-        'SearchSight',
203
-        'semanticdiscovery',
204
-        'SEOstats',
205
-        'Seznam screenshot-generator',
206
-        'ShopWiki',
207
-        'SiteBar',
208
-        'siteexplorer\.info',
209
-        'slider\.com',
210
-        'slurp',
211
-        'Snappy',
212
-        'sogou',
213
-        'speedy',
214
-        'Sqworm',
215
-        'StackRambler',
216
-        'Stratagems Kumo',
217
-        'summify',
218
-        'teoma',
219
-        'theoldreader\.com',
220
-        'TinEye',
221
-        'Traackr.com',
222
-        'truwoGPS',
223
-        'tweetedtimes\.com',
224
-        'Twikle',
225
-        'UnwindFetchor',
226
-        'updated',
227
-        'urlresolver',
228
-        'Validator\.nu\\/LV',
229
-        'Vagabondo',
230
-        'Vivante Link Checker',
231
-        'Vortex',
232
-        'voyager\\/',
233
-        'VYU2',
234
-        'W3C-checklink',
235
-        'W3C_CSS_Validator_JFouffa',
236
-        'W3C_I18n-Checker',
237
-        'W3C-mobileOK',
238
-        'W3C_Unicorn',
239
-        'W3C_Validator',
240
-        'WebIndex',
241
-        'Websquash\.com',
242
-        'webcollage',
243
-        'webmon ',
244
-        'WeSEE:Search',
245
-        'wf84',
246
-        'wget',
247
-        'WomlpeFactory',
248
-        'wotbox',
249
-        'Xenu Link Sleuth',
250
-        'XML Sitemaps Generator',
251
-        'Y!J-ASR',
252
-        'yacy',
253
-        'Yahoo Link Preview',
254
-        'Yahoo! Slurp China',
255
-        'Yahoo! Slurp',
256
-        'YahooSeeker',
257
-        'YahooSeeker-Testing',
258
-        'YandexImages',
259
-        'YandexMetrika',
260
-        'yandex',
261
-        'yanga',
262
-        'yeti',
263
-        'yoogliFetchAgent',
264
-        'Zao',
265
-        'ZyBorg',
266
-        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
267
-    );
58
+	protected static $crawlers = array(
59
+		'008\\/',
60
+		'A6-Indexer',
61
+		'Aboundex',
62
+		'Accoona-AI-Agent',
63
+		'acoon',
64
+		'AddThis',
65
+		'ADmantX',
66
+		'AHC',
67
+		'Airmail',
68
+		'Anemone',
69
+		'Arachmo',
70
+		'archive-com',
71
+		'B-l-i-t-z-B-O-T',
72
+		'bibnum\.bnf',
73
+		'biglotron',
74
+		'binlar',
75
+		'boitho\.com-dc',
76
+		'BUbiNG',
77
+		'Butterfly\\/',
78
+		'BuzzSumo',
79
+		'CC Metadata Scaper',
80
+		'Cerberian Drtrs',
81
+		'changedetection',
82
+		'Charlotte',
83
+		'clips\.ua\.ac\.be',
84
+		'CloudFlare-AlwaysOnline',
85
+		'coccoc',
86
+		'Commons-HttpClient',
87
+		'convera',
88
+		'cosmos',
89
+		'Covario-IDS',
90
+		'curl',
91
+		'CyberPatrol',
92
+		'DataparkSearch',
93
+		'dataprovider',
94
+		'Digg',
95
+		'DomainAppender',
96
+		'drupact',
97
+		'EARTHCOM',
98
+		'ec2linkfinder',
99
+		'ElectricMonk',
100
+		'Embedly',
101
+		'europarchive\.org',
102
+		'EventMachine HttpClient',
103
+		'ezooms',
104
+		'eZ Publish Link Validator',
105
+		'facebookexternalhit',
106
+		'Feedfetcher-Google',
107
+		'FeedValidator',
108
+		'FindLinks',
109
+		'findlink',
110
+		'findthatfile',
111
+		'Flamingo_SearchEngine',
112
+		'fluffy',
113
+		'getprismatic\.com',
114
+		'g00g1e\.net',
115
+		'GigablastOpenSource',
116
+		'grub-client',
117
+		'Genieo',
118
+		'Go-http-client',
119
+		'Google-HTTP-Java-Client',
120
+		'Google favicon',
121
+		'heritrix',
122
+		'Holmes',
123
+		'htdig',
124
+		'httpunit',
125
+		'httrack',
126
+		'ia_archiver',
127
+		'ichiro',
128
+		'igdeSpyder',
129
+		'InAGist',
130
+		'InfoWizards Reciprocal Link System PRO',
131
+		'integromedb',
132
+		'IODC',
133
+		'IOI',
134
+		'ips-agent',
135
+		'iZSearch',
136
+		'L\.webis',
137
+		'Larbin',
138
+		'libwww',
139
+		'Link Valet',
140
+		'linkdex',
141
+		'LinkExaminer',
142
+		'LinkWalker',
143
+		'Lipperhey Link Explorer',
144
+		'Lipperhey SEO Service',
145
+		'LongURL API',
146
+		'ltx71',
147
+		'lwp-trivial',
148
+		'MegaIndex\.ru',
149
+		'mabontland',
150
+		'MagpieRSS',
151
+		'Mediapartners-Google',
152
+		'MetaURI',
153
+		'Mnogosearch',
154
+		'mogimogi',
155
+		'Morning Paper',
156
+		'Mrcgiguy',
157
+		'MVAClient',
158
+		'netresearchserver',
159
+		'NewsGator',
160
+		'newsme',
161
+		'NG-Search',
162
+		'^NING\\/',
163
+		'Notifixious',
164
+		'nutch',
165
+		'NutchCVS',
166
+		'Nymesis',
167
+		'oegp',
168
+		'online link validator',
169
+		'Online Website Link Checker',
170
+		'Orbiter',
171
+		'ow\.ly',
172
+		'Ploetz \+ Zeller',
173
+		'page2rss',
174
+		'panscient',
175
+		'Peew',
176
+		'phpcrawl',
177
+		'Pizilla',
178
+		'Plukkie',
179
+		'Pompos',
180
+		'postano',
181
+		'PostPost',
182
+		'postrank',
183
+		'proximic',
184
+		'PycURL',
185
+		'Python-httplib2',
186
+		'python-requests',
187
+		'Python-urllib',
188
+		'Qseero',
189
+		'Qwantify',
190
+		'Radian6',
191
+		'RebelMouse',
192
+		'REL Link Checker',
193
+		'RetrevoPageAnalyzer',
194
+		'Riddler',
195
+		'Robosourcer',
196
+		'Ruby',
197
+		'SBIder',
198
+		'ScoutJet',
199
+		'ScoutURLMonitor',
200
+		'Scrapy',
201
+		'Scrubby',
202
+		'SearchSight',
203
+		'semanticdiscovery',
204
+		'SEOstats',
205
+		'Seznam screenshot-generator',
206
+		'ShopWiki',
207
+		'SiteBar',
208
+		'siteexplorer\.info',
209
+		'slider\.com',
210
+		'slurp',
211
+		'Snappy',
212
+		'sogou',
213
+		'speedy',
214
+		'Sqworm',
215
+		'StackRambler',
216
+		'Stratagems Kumo',
217
+		'summify',
218
+		'teoma',
219
+		'theoldreader\.com',
220
+		'TinEye',
221
+		'Traackr.com',
222
+		'truwoGPS',
223
+		'tweetedtimes\.com',
224
+		'Twikle',
225
+		'UnwindFetchor',
226
+		'updated',
227
+		'urlresolver',
228
+		'Validator\.nu\\/LV',
229
+		'Vagabondo',
230
+		'Vivante Link Checker',
231
+		'Vortex',
232
+		'voyager\\/',
233
+		'VYU2',
234
+		'W3C-checklink',
235
+		'W3C_CSS_Validator_JFouffa',
236
+		'W3C_I18n-Checker',
237
+		'W3C-mobileOK',
238
+		'W3C_Unicorn',
239
+		'W3C_Validator',
240
+		'WebIndex',
241
+		'Websquash\.com',
242
+		'webcollage',
243
+		'webmon ',
244
+		'WeSEE:Search',
245
+		'wf84',
246
+		'wget',
247
+		'WomlpeFactory',
248
+		'wotbox',
249
+		'Xenu Link Sleuth',
250
+		'XML Sitemaps Generator',
251
+		'Y!J-ASR',
252
+		'yacy',
253
+		'Yahoo Link Preview',
254
+		'Yahoo! Slurp China',
255
+		'Yahoo! Slurp',
256
+		'YahooSeeker',
257
+		'YahooSeeker-Testing',
258
+		'YandexImages',
259
+		'YandexMetrika',
260
+		'yandex',
261
+		'yanga',
262
+		'yeti',
263
+		'yoogliFetchAgent',
264
+		'Zao',
265
+		'ZyBorg',
266
+		'[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
267
+	);
268 268
 
269
-    /**
270
-     * All possible HTTP headers that represent the
271
-     * User-Agent string.
272
-     *
273
-     * @var array
274
-     */
275
-    protected static $uaHttpHeaders = array(
276
-        // The default User-Agent string.
277
-        'HTTP_USER_AGENT',
278
-        // Header can occur on devices using Opera Mini.
279
-        'HTTP_X_OPERAMINI_PHONE_UA',
280
-        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
281
-        'HTTP_X_DEVICE_USER_AGENT',
282
-        'HTTP_X_ORIGINAL_USER_AGENT',
283
-        'HTTP_X_SKYFIRE_PHONE',
284
-        'HTTP_X_BOLT_PHONE_UA',
285
-        'HTTP_DEVICE_STOCK_UA',
286
-        'HTTP_X_UCBROWSER_DEVICE_UA',
287
-    );
269
+	/**
270
+	 * All possible HTTP headers that represent the
271
+	 * User-Agent string.
272
+	 *
273
+	 * @var array
274
+	 */
275
+	protected static $uaHttpHeaders = array(
276
+		// The default User-Agent string.
277
+		'HTTP_USER_AGENT',
278
+		// Header can occur on devices using Opera Mini.
279
+		'HTTP_X_OPERAMINI_PHONE_UA',
280
+		// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
281
+		'HTTP_X_DEVICE_USER_AGENT',
282
+		'HTTP_X_ORIGINAL_USER_AGENT',
283
+		'HTTP_X_SKYFIRE_PHONE',
284
+		'HTTP_X_BOLT_PHONE_UA',
285
+		'HTTP_DEVICE_STOCK_UA',
286
+		'HTTP_X_UCBROWSER_DEVICE_UA',
287
+	);
288 288
 
289
-    /**
290
-     * Class constructor.
291
-     */
292
-    public function __construct(array $headers = null, $userAgent = null)
293
-    {
294
-        $this->setHttpHeaders($headers);
295
-        $this->setUserAgent($userAgent);
296
-    }
289
+	/**
290
+	 * Class constructor.
291
+	 */
292
+	public function __construct(array $headers = null, $userAgent = null)
293
+	{
294
+		$this->setHttpHeaders($headers);
295
+		$this->setUserAgent($userAgent);
296
+	}
297 297
 
298
-    public function setHttpHeaders($httpHeaders = null)
299
-    {
300
-        // use global _SERVER if $httpHeaders aren't defined
301
-        if (!is_array($httpHeaders) || !count($httpHeaders)) {
302
-            $httpHeaders = $_SERVER;
303
-        }
304
-        // clear existing headers
305
-        $this->httpHeaders = array();
306
-        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
307
-        // start with HTTP_.
308
-        foreach ($httpHeaders as $key => $value) {
309
-            if (substr($key, 0, 5) === 'HTTP_') {
310
-                $this->httpHeaders[$key] = $value;
311
-            }
312
-        }
313
-    }
298
+	public function setHttpHeaders($httpHeaders = null)
299
+	{
300
+		// use global _SERVER if $httpHeaders aren't defined
301
+		if (!is_array($httpHeaders) || !count($httpHeaders)) {
302
+			$httpHeaders = $_SERVER;
303
+		}
304
+		// clear existing headers
305
+		$this->httpHeaders = array();
306
+		// Only save HTTP headers. In PHP land, that means only _SERVER vars that
307
+		// start with HTTP_.
308
+		foreach ($httpHeaders as $key => $value) {
309
+			if (substr($key, 0, 5) === 'HTTP_') {
310
+				$this->httpHeaders[$key] = $value;
311
+			}
312
+		}
313
+	}
314 314
 
315
-    public function getUaHttpHeaders()
316
-    {
317
-        return self::$uaHttpHeaders;
318
-    }
315
+	public function getUaHttpHeaders()
316
+	{
317
+		return self::$uaHttpHeaders;
318
+	}
319 319
 
320
-    public function setUserAgent($userAgent = null)
321
-    {
322
-        if (false === empty($userAgent)) {
323
-            return $this->userAgent = $userAgent;
324
-        } else {
325
-            $this->userAgent = null;
326
-            foreach ($this->getUaHttpHeaders() as $altHeader) {
327
-                if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
328
-                    $this->userAgent .= $this->httpHeaders[$altHeader].' ';
329
-                }
330
-            }
320
+	public function setUserAgent($userAgent = null)
321
+	{
322
+		if (false === empty($userAgent)) {
323
+			return $this->userAgent = $userAgent;
324
+		} else {
325
+			$this->userAgent = null;
326
+			foreach ($this->getUaHttpHeaders() as $altHeader) {
327
+				if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
328
+					$this->userAgent .= $this->httpHeaders[$altHeader].' ';
329
+				}
330
+			}
331 331
 
332
-            return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
333
-        }
334
-    }
332
+			return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
333
+		}
334
+	}
335 335
 
336
-    public function getRegex()
337
-    {
338
-        return '('.implode('|', self::$crawlers).')';
339
-    }
336
+	public function getRegex()
337
+	{
338
+		return '('.implode('|', self::$crawlers).')';
339
+	}
340 340
 
341
-    public function getIgnored()
342
-    {
343
-        return '('.implode('|', self::$ignore).')';
344
-    }
341
+	public function getIgnored()
342
+	{
343
+		return '('.implode('|', self::$ignore).')';
344
+	}
345 345
 
346
-    public function isCrawler($userAgent = null)
347
-    {
348
-        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;
346
+	public function isCrawler($userAgent = null)
347
+	{
348
+		$agent = is_null($userAgent) ? $this->userAgent : $userAgent;
349 349
 
350
-        $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
350
+		$agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
351 351
 
352
-        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
352
+		$result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
353 353
 
354
-        if ($matches) {
355
-            $this->matches = $matches;
356
-        }
354
+		if ($matches) {
355
+			$this->matches = $matches;
356
+		}
357 357
 
358
-        return (bool) $result;
359
-    }
358
+		return (bool) $result;
359
+	}
360 360
 
361
-    public function getMatches()
362
-    {
363
-        return $this->matches[0];
364
-    }
361
+	public function getMatches()
362
+	{
363
+		return $this->matches[0];
364
+	}
365 365
 }
Please login to merge, or discard this patch.