Inspection of "Increase performance by over 50%" - JayBizzle/Crawler-Detect - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#42)

by Mark

created 2015-12-09 16:47 UTC

Status

Indentation +346 added lines, -346 removed lines patch added patch discarded remove patch

@@ -4,368 +4,368 @@
 block discarded – undo
 
 class CrawlerDetect
 {
-    protected $userAgent = null;
+	protected $userAgent = null;
 
-    protected $httpHeaders = array();
+	protected $httpHeaders = array();
 
-    protected $matches = array();
+	protected $matches = array();
 
-    /**
-     * List of strings to remove from the user agent before running the crawler regex
-     * Over a large list of user agents, this gives us about a 55% speed increase!
-     * 
-     * @var array
-     */
-    protected static $ignore = array(
-        'Safari.[\d\.]*',
-        'Firefox.[\d\.]*',
-        'Chrome.[\d\.]*',
-        'Chromium.[\d\.]*',
-        'MSIE.[\d\.]',
-        'Opera\/[\d\.]*',
-        'Mozilla.[\d\.]*',
-        'AppleWebKit.[\d\.]*',
-        'Trident.[\d\.]*',
-        'Windows NT.[\d\.]*',
-        'Android.[\d\.]*',
-        'Macintosh.',
-        'Ubuntu',
-        'Linux',
-        'Intel',
-        'Mac OS X',
-        'Gecko.[\d\.]*',
-        'KHTML',
-        'CriOS.[\d\.]*',
-        'CPU iPhone OS ([0-9_])* like Mac OS X',
-        'CPU OS ([0-9_])* like Mac OS X',
-        'iPod',
-        'like Gecko',
-        'compatible',
-        'x86_..',
-        'i686',
-        'x64',
-        'X11',
-        'rv:[\d\.]*',
-        'Version.[\d\.]*',
-        'WOW64',
-        'Win64',
-        'Dalvik.[\d\.]*',
-        '\.NET CLR [\d\.]*',
-        'Presto.[\d\.]*',
-        'Media Center PC',
-    );
+	/**
+	 * List of strings to remove from the user agent before running the crawler regex
+	 * Over a large list of user agents, this gives us about a 55% speed increase!
+	 * 
+	 * @var array
+	 */
+	protected static $ignore = array(
+		'Safari.[\d\.]*',
+		'Firefox.[\d\.]*',
+		'Chrome.[\d\.]*',
+		'Chromium.[\d\.]*',
+		'MSIE.[\d\.]',
+		'Opera\/[\d\.]*',
+		'Mozilla.[\d\.]*',
+		'AppleWebKit.[\d\.]*',
+		'Trident.[\d\.]*',
+		'Windows NT.[\d\.]*',
+		'Android.[\d\.]*',
+		'Macintosh.',
+		'Ubuntu',
+		'Linux',
+		'Intel',
+		'Mac OS X',
+		'Gecko.[\d\.]*',
+		'KHTML',
+		'CriOS.[\d\.]*',
+		'CPU iPhone OS ([0-9_])* like Mac OS X',
+		'CPU OS ([0-9_])* like Mac OS X',
+		'iPod',
+		'like Gecko',
+		'compatible',
+		'x86_..',
+		'i686',
+		'x64',
+		'X11',
+		'rv:[\d\.]*',
+		'Version.[\d\.]*',
+		'WOW64',
+		'Win64',
+		'Dalvik.[\d\.]*',
+		'\.NET CLR [\d\.]*',
+		'Presto.[\d\.]*',
+		'Media Center PC',
+	);
 
-    protected static $crawlers = array(
-        '007ac9 Crawler',
-        '008\\/',
-        'A6-Indexer',
-        'Aboundex',
-        'Accoona-AI-Agent',
-        'acoon',
-        'AddThis',
-        'ADmantX',
-        'AHC',
-        'Airmail',
-        'Anemone',
-        'Arachmo',
-        'archive-com',
-        'B-l-i-t-z-B-O-T',
-        'bibnum\.bnf',
-        'biglotron',
-        'binlar',
-        'boitho\.com-dc',
-        'BUbiNG',
-        'Butterfly\\/',
-        'BuzzSumo',
-        'CC Metadata Scaper',
-        'Cerberian Drtrs',
-        'changedetection',
-        'Charlotte',
-        'clips\.ua\.ac\.be',
-        'CloudFlare-AlwaysOnline',
-        'coccoc',
-        'Commons-HttpClient',
-        'convera',
-        'cosmos',
-        'Covario-IDS',
-        'crawler4j',
-        'curl',
-        'CyberPatrol',
-        'DataparkSearch',
-        'dataprovider',
-        'Digg',
-        'DomainAppender',
-        'drupact',
-        'EARTHCOM',
-        'ec2linkfinder',
-        'ElectricMonk',
-        'Embedly',
-        'europarchive\.org',
-        'EventMachine HttpClient',
-        'ezooms',
-        'eZ Publish Link Validator',
-        'facebookexternalhit',
-        'Feedfetcher-Google',
-        'FeedValidator',
-        'FindLinks',
-        'findlink',
-        'findthatfile',
-        'Flamingo_SearchEngine',
-        'fluffy',
-        'getprismatic\.com',
-        'g00g1e\.net',
-        'GigablastOpenSource',
-        'grub-client',
-        'Genieo',
-        'Go-http-client',
-        'Googlebot-Image',
-        'Googlebot-Mobile',
-        'Google-HTTP-Java-Client',
-        'Google favicon',
-        'heritrix',
-        'Holmes',
-        'htdig',
-        'httpunit',
-        'httrack',
-        'ia_archiver',
-        'ichiro',
-        'igdeSpyder',
-        'InAGist',
-        'InfoWizards Reciprocal Link System PRO',
-        'integromedb',
-        'IODC',
-        'IOI',
-        'ips-agent',
-        'iZSearch',
-        'L\.webis',
-        'Larbin',
-        'libwww',
-        'Link Valet',
-        'linkdex',
-        'LinkExaminer',
-        'LinkWalker',
-        'Lipperhey Link Explorer',
-        'Lipperhey SEO Service',
-        'LongURL API',
-        'ltx71',
-        'lwp-trivial',
-        'MegaIndex\.ru',
-        'mabontland',
-        'MagpieRSS',
-        'Mediapartners-Google',
-        'MetaURI',
-        'Mnogosearch',
-        'mogimogi',
-        'Morning Paper',
-        'Mrcgiguy',
-        'MVAClient',
-        'netresearchserver',
-        'NewsGator',
-        'newsme',
-        'NG-Search',
-        '^NING\\/',
-        'Notifixious',
-        'nutch',
-        'NutchCVS',
-        'Nymesis',
-        'oegp',
-        'online link validator',
-        'Online Website Link Checker',
-        'Orbiter',
-        'ow\.ly',
-        'Ploetz \+ Zeller',
-        'page2rss',
-        'panscient',
-        'Peew',
-        'phpcrawl',
-        'Pizilla',
-        'Plukkie',
-        'Pompos',
-        'postano',
-        'PostPost',
-        'postrank',
-        'proximic',
-        'PycURL',
-        'Python-httplib2',
-        'python-requests',
-        'Python-urllib',
-        'Qseero',
-        'Qwantify',
-        'Radian6',
-        'RebelMouse',
-        'REL Link Checker',
-        'RetrevoPageAnalyzer',
-        'Riddler',
-        'Robosourcer',
-        'Ruby',
-        'SandCrawler',
-        'SBIder',
-        'ScoutJet',
-        'ScoutURLMonitor',
-        'Scrapy',
-        'Scrubby',
-        'SearchSight',
-        'semanticdiscovery',
-        'SEOstats',
-        'Seznam screenshot-generator', 
-        'ShopWiki',
-        'SiteBar',
-        'siteexplorer\.info',
-        'slider\.com',
-        'slurp',
-        'Snappy',
-        'sogou',
-        'speedy',
-        'Sqworm',
-        'StackRambler',
-        'Stratagems Kumo',
-        'summify',
-        'teoma',
-        'theoldreader\.com',
-        'TinEye',
-        'Traackr.com',
-        'truwoGPS',
-        'tweetedtimes\.com',
-        'Twikle',
-        'UnwindFetchor',
-        'updated',
-        'urlresolver',
-        'Validator\.nu\\/LV',
-        'Vagabondo',
-        'Vivante Link Checker',
-        'Vortex',
-        'voyager\\/',
-        'VYU2',
-        'W3C-checklink',
-        'W3C_CSS_Validator_JFouffa',
-        'W3C_I18n-Checker',
-        'W3C-mobileOK',
-        'W3C_Unicorn',
-        'W3C_Validator',
-        'WebIndex',
-        'Websquash\.com',
-        'webcollage',
-        'webmon ',
-        'WeSEE:Search',
-        'wf84',
-        'wget',
-        'WomlpeFactory',
-        'wotbox',
-        'Xenu Link Sleuth',
-        'XML Sitemaps Generator',
-        'Y!J-ASR',
-        'yacy',
-        'yacybot',
-        'Yahoo Link Preview',
-        'Yahoo! Slurp China',
-        'Yahoo! Slurp',
-        'YahooSeeker',
-        'YahooSeeker-Testing',
-        'YandexImages',
-        'YandexMetrika',
-        'yandex',
-        'yanga',
-        'yeti',
-        'yoogliFetchAgent',
-        'Zao',
-        'ZyBorg',
-        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
-    );
+	protected static $crawlers = array(
+		'007ac9 Crawler',
+		'008\\/',
+		'A6-Indexer',
+		'Aboundex',
+		'Accoona-AI-Agent',
+		'acoon',
+		'AddThis',
+		'ADmantX',
+		'AHC',
+		'Airmail',
+		'Anemone',
+		'Arachmo',
+		'archive-com',
+		'B-l-i-t-z-B-O-T',
+		'bibnum\.bnf',
+		'biglotron',
+		'binlar',
+		'boitho\.com-dc',
+		'BUbiNG',
+		'Butterfly\\/',
+		'BuzzSumo',
+		'CC Metadata Scaper',
+		'Cerberian Drtrs',
+		'changedetection',
+		'Charlotte',
+		'clips\.ua\.ac\.be',
+		'CloudFlare-AlwaysOnline',
+		'coccoc',
+		'Commons-HttpClient',
+		'convera',
+		'cosmos',
+		'Covario-IDS',
+		'crawler4j',
+		'curl',
+		'CyberPatrol',
+		'DataparkSearch',
+		'dataprovider',
+		'Digg',
+		'DomainAppender',
+		'drupact',
+		'EARTHCOM',
+		'ec2linkfinder',
+		'ElectricMonk',
+		'Embedly',
+		'europarchive\.org',
+		'EventMachine HttpClient',
+		'ezooms',
+		'eZ Publish Link Validator',
+		'facebookexternalhit',
+		'Feedfetcher-Google',
+		'FeedValidator',
+		'FindLinks',
+		'findlink',
+		'findthatfile',
+		'Flamingo_SearchEngine',
+		'fluffy',
+		'getprismatic\.com',
+		'g00g1e\.net',
+		'GigablastOpenSource',
+		'grub-client',
+		'Genieo',
+		'Go-http-client',
+		'Googlebot-Image',
+		'Googlebot-Mobile',
+		'Google-HTTP-Java-Client',
+		'Google favicon',
+		'heritrix',
+		'Holmes',
+		'htdig',
+		'httpunit',
+		'httrack',
+		'ia_archiver',
+		'ichiro',
+		'igdeSpyder',
+		'InAGist',
+		'InfoWizards Reciprocal Link System PRO',
+		'integromedb',
+		'IODC',
+		'IOI',
+		'ips-agent',
+		'iZSearch',
+		'L\.webis',
+		'Larbin',
+		'libwww',
+		'Link Valet',
+		'linkdex',
+		'LinkExaminer',
+		'LinkWalker',
+		'Lipperhey Link Explorer',
+		'Lipperhey SEO Service',
+		'LongURL API',
+		'ltx71',
+		'lwp-trivial',
+		'MegaIndex\.ru',
+		'mabontland',
+		'MagpieRSS',
+		'Mediapartners-Google',
+		'MetaURI',
+		'Mnogosearch',
+		'mogimogi',
+		'Morning Paper',
+		'Mrcgiguy',
+		'MVAClient',
+		'netresearchserver',
+		'NewsGator',
+		'newsme',
+		'NG-Search',
+		'^NING\\/',
+		'Notifixious',
+		'nutch',
+		'NutchCVS',
+		'Nymesis',
+		'oegp',
+		'online link validator',
+		'Online Website Link Checker',
+		'Orbiter',
+		'ow\.ly',
+		'Ploetz \+ Zeller',
+		'page2rss',
+		'panscient',
+		'Peew',
+		'phpcrawl',
+		'Pizilla',
+		'Plukkie',
+		'Pompos',
+		'postano',
+		'PostPost',
+		'postrank',
+		'proximic',
+		'PycURL',
+		'Python-httplib2',
+		'python-requests',
+		'Python-urllib',
+		'Qseero',
+		'Qwantify',
+		'Radian6',
+		'RebelMouse',
+		'REL Link Checker',
+		'RetrevoPageAnalyzer',
+		'Riddler',
+		'Robosourcer',
+		'Ruby',
+		'SandCrawler',
+		'SBIder',
+		'ScoutJet',
+		'ScoutURLMonitor',
+		'Scrapy',
+		'Scrubby',
+		'SearchSight',
+		'semanticdiscovery',
+		'SEOstats',
+		'Seznam screenshot-generator', 
+		'ShopWiki',
+		'SiteBar',
+		'siteexplorer\.info',
+		'slider\.com',
+		'slurp',
+		'Snappy',
+		'sogou',
+		'speedy',
+		'Sqworm',
+		'StackRambler',
+		'Stratagems Kumo',
+		'summify',
+		'teoma',
+		'theoldreader\.com',
+		'TinEye',
+		'Traackr.com',
+		'truwoGPS',
+		'tweetedtimes\.com',
+		'Twikle',
+		'UnwindFetchor',
+		'updated',
+		'urlresolver',
+		'Validator\.nu\\/LV',
+		'Vagabondo',
+		'Vivante Link Checker',
+		'Vortex',
+		'voyager\\/',
+		'VYU2',
+		'W3C-checklink',
+		'W3C_CSS_Validator_JFouffa',
+		'W3C_I18n-Checker',
+		'W3C-mobileOK',
+		'W3C_Unicorn',
+		'W3C_Validator',
+		'WebIndex',
+		'Websquash\.com',
+		'webcollage',
+		'webmon ',
+		'WeSEE:Search',
+		'wf84',
+		'wget',
+		'WomlpeFactory',
+		'wotbox',
+		'Xenu Link Sleuth',
+		'XML Sitemaps Generator',
+		'Y!J-ASR',
+		'yacy',
+		'yacybot',
+		'Yahoo Link Preview',
+		'Yahoo! Slurp China',
+		'Yahoo! Slurp',
+		'YahooSeeker',
+		'YahooSeeker-Testing',
+		'YandexImages',
+		'YandexMetrika',
+		'yandex',
+		'yanga',
+		'yeti',
+		'yoogliFetchAgent',
+		'Zao',
+		'ZyBorg',
+		'[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
+	);
 
-    /**
-     * All possible HTTP headers that represent the
-     * User-Agent string.
-     *
-     * @var array
-     */
-    protected static $uaHttpHeaders = array(
-        // The default User-Agent string.
-        'HTTP_USER_AGENT',
-        // Header can occur on devices using Opera Mini.
-        'HTTP_X_OPERAMINI_PHONE_UA',
-        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
-        'HTTP_X_DEVICE_USER_AGENT',
-        'HTTP_X_ORIGINAL_USER_AGENT',
-        'HTTP_X_SKYFIRE_PHONE',
-        'HTTP_X_BOLT_PHONE_UA',
-        'HTTP_DEVICE_STOCK_UA',
-        'HTTP_X_UCBROWSER_DEVICE_UA',
-    );
+	/**
+	 * All possible HTTP headers that represent the
+	 * User-Agent string.
+	 *
+	 * @var array
+	 */
+	protected static $uaHttpHeaders = array(
+		// The default User-Agent string.
+		'HTTP_USER_AGENT',
+		// Header can occur on devices using Opera Mini.
+		'HTTP_X_OPERAMINI_PHONE_UA',
+		// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
+		'HTTP_X_DEVICE_USER_AGENT',
+		'HTTP_X_ORIGINAL_USER_AGENT',
+		'HTTP_X_SKYFIRE_PHONE',
+		'HTTP_X_BOLT_PHONE_UA',
+		'HTTP_DEVICE_STOCK_UA',
+		'HTTP_X_UCBROWSER_DEVICE_UA',
+	);
 
-    /**
-     * Class constructor.
-     */
-    public function __construct(array $headers = null, $userAgent = null)
-    {
-        $this->setHttpHeaders($headers);
-        $this->setUserAgent($userAgent);
-    }
+	/**
+	 * Class constructor.
+	 */
+	public function __construct(array $headers = null, $userAgent = null)
+	{
+		$this->setHttpHeaders($headers);
+		$this->setUserAgent($userAgent);
+	}
 
-    public function setHttpHeaders($httpHeaders = null)
-    {
-        // use global _SERVER if $httpHeaders aren't defined
-        if (!is_array($httpHeaders) || !count($httpHeaders)) {
-            $httpHeaders = $_SERVER;
-        }
-        // clear existing headers
-        $this->httpHeaders = array();
-        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
-        // start with HTTP_.
-        foreach ($httpHeaders as $key => $value) {
-            if (substr($key, 0, 5) === 'HTTP_') {
-                $this->httpHeaders[$key] = $value;
-            }
-        }
-    }
+	public function setHttpHeaders($httpHeaders = null)
+	{
+		// use global _SERVER if $httpHeaders aren't defined
+		if (!is_array($httpHeaders) || !count($httpHeaders)) {
+			$httpHeaders = $_SERVER;
+		}
+		// clear existing headers
+		$this->httpHeaders = array();
+		// Only save HTTP headers. In PHP land, that means only _SERVER vars that
+		// start with HTTP_.
+		foreach ($httpHeaders as $key => $value) {
+			if (substr($key, 0, 5) === 'HTTP_') {
+				$this->httpHeaders[$key] = $value;
+			}
+		}
+	}
 
-    public function getUaHttpHeaders()
-    {
-        return self::$uaHttpHeaders;
-    }
+	public function getUaHttpHeaders()
+	{
+		return self::$uaHttpHeaders;
+	}
 
-    public function setUserAgent($userAgent = null)
-    {
-        if (false === empty($userAgent)) {
-            return $this->userAgent = $userAgent;
-        } else {
-            $this->userAgent = null;
-            foreach ($this->getUaHttpHeaders() as $altHeader) {
-                if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
-                    $this->userAgent .= $this->httpHeaders[$altHeader].' ';
-                }
-            }
+	public function setUserAgent($userAgent = null)
+	{
+		if (false === empty($userAgent)) {
+			return $this->userAgent = $userAgent;
+		} else {
+			$this->userAgent = null;
+			foreach ($this->getUaHttpHeaders() as $altHeader) {
+				if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
+					$this->userAgent .= $this->httpHeaders[$altHeader].' ';
+				}
+			}
 
-            return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
-        }
-    }
+			return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
+		}
+	}
 
-    public function getRegex()
-    {
-        return '('.implode('|', self::$crawlers).')';
-    }
+	public function getRegex()
+	{
+		return '('.implode('|', self::$crawlers).')';
+	}
 
-    public function getIgnored()
-    {
-        return '('.implode('|', self::$ignore).')';
-    }
+	public function getIgnored()
+	{
+		return '('.implode('|', self::$ignore).')';
+	}
 
-    public function isCrawler($userAgent = null)
-    {
-        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;
+	public function isCrawler($userAgent = null)
+	{
+		$agent = is_null($userAgent) ? $this->userAgent : $userAgent;
 
-        $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
+		$agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
 
-        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
+		$result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
 
-        if ($matches) {
-            $this->matches = $matches;
-        }
+		if ($matches) {
+			$this->matches = $matches;
+		}
 
-        return (bool) $result;
-    }
+		return (bool) $result;
+	}
 
-    public function getMatches()
-    {
-        return $this->matches[0];
-    }
+	public function getMatches()
+	{
+		return $this->matches[0];
+	}
 }

Please login to merge, or discard this patch.

		@@ -4,368 +4,368 @@
		block discarded – undo
4	4
5	5	class CrawlerDetect
6	6	{
7		- protected $userAgent = null;
	7	+ protected $userAgent = null;
8	8
9		- protected $httpHeaders = array();
	9	+ protected $httpHeaders = array();
10	10
11		- protected $matches = array();
	11	+ protected $matches = array();
12	12
13		- /**
14		- * List of strings to remove from the user agent before running the crawler regex
15		- * Over a large list of user agents, this gives us about a 55% speed increase!
16		- *
17		- * @var array
18		- */
19		- protected static $ignore = array(
20		- 'Safari.[\d\.]*',
21		- 'Firefox.[\d\.]*',
22		- 'Chrome.[\d\.]*',
23		- 'Chromium.[\d\.]*',
24		- 'MSIE.[\d\.]',
25		- 'Opera\/[\d\.]*',
26		- 'Mozilla.[\d\.]*',
27		- 'AppleWebKit.[\d\.]*',
28		- 'Trident.[\d\.]*',
29		- 'Windows NT.[\d\.]*',
30		- 'Android.[\d\.]*',
31		- 'Macintosh.',
32		- 'Ubuntu',
33		- 'Linux',
34		- 'Intel',
35		- 'Mac OS X',
36		- 'Gecko.[\d\.]*',
37		- 'KHTML',
38		- 'CriOS.[\d\.]*',
39		- 'CPU iPhone OS ([0-9_])* like Mac OS X',
40		- 'CPU OS ([0-9_])* like Mac OS X',
41		- 'iPod',
42		- 'like Gecko',
43		- 'compatible',
44		- 'x86_..',
45		- 'i686',
46		- 'x64',
47		- 'X11',
48		- 'rv:[\d\.]*',
49		- 'Version.[\d\.]*',
50		- 'WOW64',
51		- 'Win64',
52		- 'Dalvik.[\d\.]*',
53		- '\.NET CLR [\d\.]*',
54		- 'Presto.[\d\.]*',
55		- 'Media Center PC',
56		- );
	13	+ /**
	14	+ * List of strings to remove from the user agent before running the crawler regex
	15	+ * Over a large list of user agents, this gives us about a 55% speed increase!
	16	+ *
	17	+ * @var array
	18	+ */
	19	+ protected static $ignore = array(
	20	+ 'Safari.[\d\.]*',
	21	+ 'Firefox.[\d\.]*',
	22	+ 'Chrome.[\d\.]*',
	23	+ 'Chromium.[\d\.]*',
	24	+ 'MSIE.[\d\.]',
	25	+ 'Opera\/[\d\.]*',
	26	+ 'Mozilla.[\d\.]*',
	27	+ 'AppleWebKit.[\d\.]*',
	28	+ 'Trident.[\d\.]*',
	29	+ 'Windows NT.[\d\.]*',
	30	+ 'Android.[\d\.]*',
	31	+ 'Macintosh.',
	32	+ 'Ubuntu',
	33	+ 'Linux',
	34	+ 'Intel',
	35	+ 'Mac OS X',
	36	+ 'Gecko.[\d\.]*',
	37	+ 'KHTML',
	38	+ 'CriOS.[\d\.]*',
	39	+ 'CPU iPhone OS ([0-9_])* like Mac OS X',
	40	+ 'CPU OS ([0-9_])* like Mac OS X',
	41	+ 'iPod',
	42	+ 'like Gecko',
	43	+ 'compatible',
	44	+ 'x86_..',
	45	+ 'i686',
	46	+ 'x64',
	47	+ 'X11',
	48	+ 'rv:[\d\.]*',
	49	+ 'Version.[\d\.]*',
	50	+ 'WOW64',
	51	+ 'Win64',
	52	+ 'Dalvik.[\d\.]*',
	53	+ '\.NET CLR [\d\.]*',
	54	+ 'Presto.[\d\.]*',
	55	+ 'Media Center PC',
	56	+ );
57	57
58		- protected static $crawlers = array(
59		- '007ac9 Crawler',
60		- '008\\/',
61		- 'A6-Indexer',
62		- 'Aboundex',
63		- 'Accoona-AI-Agent',
64		- 'acoon',
65		- 'AddThis',
66		- 'ADmantX',
67		- 'AHC',
68		- 'Airmail',
69		- 'Anemone',
70		- 'Arachmo',
71		- 'archive-com',
72		- 'B-l-i-t-z-B-O-T',
73		- 'bibnum\.bnf',
74		- 'biglotron',
75		- 'binlar',
76		- 'boitho\.com-dc',
77		- 'BUbiNG',
78		- 'Butterfly\\/',
79		- 'BuzzSumo',
80		- 'CC Metadata Scaper',
81		- 'Cerberian Drtrs',
82		- 'changedetection',
83		- 'Charlotte',
84		- 'clips\.ua\.ac\.be',
85		- 'CloudFlare-AlwaysOnline',
86		- 'coccoc',
87		- 'Commons-HttpClient',
88		- 'convera',
89		- 'cosmos',
90		- 'Covario-IDS',
91		- 'crawler4j',
92		- 'curl',
93		- 'CyberPatrol',
94		- 'DataparkSearch',
95		- 'dataprovider',
96		- 'Digg',
97		- 'DomainAppender',
98		- 'drupact',
99		- 'EARTHCOM',
100		- 'ec2linkfinder',
101		- 'ElectricMonk',
102		- 'Embedly',
103		- 'europarchive\.org',
104		- 'EventMachine HttpClient',
105		- 'ezooms',
106		- 'eZ Publish Link Validator',
107		- 'facebookexternalhit',
108		- 'Feedfetcher-Google',
109		- 'FeedValidator',
110		- 'FindLinks',
111		- 'findlink',
112		- 'findthatfile',
113		- 'Flamingo_SearchEngine',
114		- 'fluffy',
115		- 'getprismatic\.com',
116		- 'g00g1e\.net',
117		- 'GigablastOpenSource',
118		- 'grub-client',
119		- 'Genieo',
120		- 'Go-http-client',
121		- 'Googlebot-Image',
122		- 'Googlebot-Mobile',
123		- 'Google-HTTP-Java-Client',
124		- 'Google favicon',
125		- 'heritrix',
126		- 'Holmes',
127		- 'htdig',
128		- 'httpunit',
129		- 'httrack',
130		- 'ia_archiver',
131		- 'ichiro',
132		- 'igdeSpyder',
133		- 'InAGist',
134		- 'InfoWizards Reciprocal Link System PRO',
135		- 'integromedb',
136		- 'IODC',
137		- 'IOI',
138		- 'ips-agent',
139		- 'iZSearch',
140		- 'L\.webis',
141		- 'Larbin',
142		- 'libwww',
143		- 'Link Valet',
144		- 'linkdex',
145		- 'LinkExaminer',
146		- 'LinkWalker',
147		- 'Lipperhey Link Explorer',
148		- 'Lipperhey SEO Service',
149		- 'LongURL API',
150		- 'ltx71',
151		- 'lwp-trivial',
152		- 'MegaIndex\.ru',
153		- 'mabontland',
154		- 'MagpieRSS',
155		- 'Mediapartners-Google',
156		- 'MetaURI',
157		- 'Mnogosearch',
158		- 'mogimogi',
159		- 'Morning Paper',
160		- 'Mrcgiguy',
161		- 'MVAClient',
162		- 'netresearchserver',
163		- 'NewsGator',
164		- 'newsme',
165		- 'NG-Search',
166		- '^NING\\/',
167		- 'Notifixious',
168		- 'nutch',
169		- 'NutchCVS',
170		- 'Nymesis',
171		- 'oegp',
172		- 'online link validator',
173		- 'Online Website Link Checker',
174		- 'Orbiter',
175		- 'ow\.ly',
176		- 'Ploetz \+ Zeller',
177		- 'page2rss',
178		- 'panscient',
179		- 'Peew',
180		- 'phpcrawl',
181		- 'Pizilla',
182		- 'Plukkie',
183		- 'Pompos',
184		- 'postano',
185		- 'PostPost',
186		- 'postrank',
187		- 'proximic',
188		- 'PycURL',
189		- 'Python-httplib2',
190		- 'python-requests',
191		- 'Python-urllib',
192		- 'Qseero',
193		- 'Qwantify',
194		- 'Radian6',
195		- 'RebelMouse',
196		- 'REL Link Checker',
197		- 'RetrevoPageAnalyzer',
198		- 'Riddler',
199		- 'Robosourcer',
200		- 'Ruby',
201		- 'SandCrawler',
202		- 'SBIder',
203		- 'ScoutJet',
204		- 'ScoutURLMonitor',
205		- 'Scrapy',
206		- 'Scrubby',
207		- 'SearchSight',
208		- 'semanticdiscovery',
209		- 'SEOstats',
210		- 'Seznam screenshot-generator',
211		- 'ShopWiki',
212		- 'SiteBar',
213		- 'siteexplorer\.info',
214		- 'slider\.com',
215		- 'slurp',
216		- 'Snappy',
217		- 'sogou',
218		- 'speedy',
219		- 'Sqworm',
220		- 'StackRambler',
221		- 'Stratagems Kumo',
222		- 'summify',
223		- 'teoma',
224		- 'theoldreader\.com',
225		- 'TinEye',
226		- 'Traackr.com',
227		- 'truwoGPS',
228		- 'tweetedtimes\.com',
229		- 'Twikle',
230		- 'UnwindFetchor',
231		- 'updated',
232		- 'urlresolver',
233		- 'Validator\.nu\\/LV',
234		- 'Vagabondo',
235		- 'Vivante Link Checker',
236		- 'Vortex',
237		- 'voyager\\/',
238		- 'VYU2',
239		- 'W3C-checklink',
240		- 'W3C_CSS_Validator_JFouffa',
241		- 'W3C_I18n-Checker',
242		- 'W3C-mobileOK',
243		- 'W3C_Unicorn',
244		- 'W3C_Validator',
245		- 'WebIndex',
246		- 'Websquash\.com',
247		- 'webcollage',
248		- 'webmon ',
249		- 'WeSEE:Search',
250		- 'wf84',
251		- 'wget',
252		- 'WomlpeFactory',
253		- 'wotbox',
254		- 'Xenu Link Sleuth',
255		- 'XML Sitemaps Generator',
256		- 'Y!J-ASR',
257		- 'yacy',
258		- 'yacybot',
259		- 'Yahoo Link Preview',
260		- 'Yahoo! Slurp China',
261		- 'Yahoo! Slurp',
262		- 'YahooSeeker',
263		- 'YahooSeeker-Testing',
264		- 'YandexImages',
265		- 'YandexMetrika',
266		- 'yandex',
267		- 'yanga',
268		- 'yeti',
269		- 'yoogliFetchAgent',
270		- 'Zao',
271		- 'ZyBorg',
272		- '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
273		- );
	58	+ protected static $crawlers = array(
	59	+ '007ac9 Crawler',
	60	+ '008\\/',
	61	+ 'A6-Indexer',
	62	+ 'Aboundex',
	63	+ 'Accoona-AI-Agent',
	64	+ 'acoon',
	65	+ 'AddThis',
	66	+ 'ADmantX',
	67	+ 'AHC',
	68	+ 'Airmail',
	69	+ 'Anemone',
	70	+ 'Arachmo',
	71	+ 'archive-com',
	72	+ 'B-l-i-t-z-B-O-T',
	73	+ 'bibnum\.bnf',
	74	+ 'biglotron',
	75	+ 'binlar',
	76	+ 'boitho\.com-dc',
	77	+ 'BUbiNG',
	78	+ 'Butterfly\\/',
	79	+ 'BuzzSumo',
	80	+ 'CC Metadata Scaper',
	81	+ 'Cerberian Drtrs',
	82	+ 'changedetection',
	83	+ 'Charlotte',
	84	+ 'clips\.ua\.ac\.be',
	85	+ 'CloudFlare-AlwaysOnline',
	86	+ 'coccoc',
	87	+ 'Commons-HttpClient',
	88	+ 'convera',
	89	+ 'cosmos',
	90	+ 'Covario-IDS',
	91	+ 'crawler4j',
	92	+ 'curl',
	93	+ 'CyberPatrol',
	94	+ 'DataparkSearch',
	95	+ 'dataprovider',
	96	+ 'Digg',
	97	+ 'DomainAppender',
	98	+ 'drupact',
	99	+ 'EARTHCOM',
	100	+ 'ec2linkfinder',
	101	+ 'ElectricMonk',
	102	+ 'Embedly',
	103	+ 'europarchive\.org',
	104	+ 'EventMachine HttpClient',
	105	+ 'ezooms',
	106	+ 'eZ Publish Link Validator',
	107	+ 'facebookexternalhit',
	108	+ 'Feedfetcher-Google',
	109	+ 'FeedValidator',
	110	+ 'FindLinks',
	111	+ 'findlink',
	112	+ 'findthatfile',
	113	+ 'Flamingo_SearchEngine',
	114	+ 'fluffy',
	115	+ 'getprismatic\.com',
	116	+ 'g00g1e\.net',
	117	+ 'GigablastOpenSource',
	118	+ 'grub-client',
	119	+ 'Genieo',
	120	+ 'Go-http-client',
	121	+ 'Googlebot-Image',
	122	+ 'Googlebot-Mobile',
	123	+ 'Google-HTTP-Java-Client',
	124	+ 'Google favicon',
	125	+ 'heritrix',
	126	+ 'Holmes',
	127	+ 'htdig',
	128	+ 'httpunit',
	129	+ 'httrack',
	130	+ 'ia_archiver',
	131	+ 'ichiro',
	132	+ 'igdeSpyder',
	133	+ 'InAGist',
	134	+ 'InfoWizards Reciprocal Link System PRO',
	135	+ 'integromedb',
	136	+ 'IODC',
	137	+ 'IOI',
	138	+ 'ips-agent',
	139	+ 'iZSearch',
	140	+ 'L\.webis',
	141	+ 'Larbin',
	142	+ 'libwww',
	143	+ 'Link Valet',
	144	+ 'linkdex',
	145	+ 'LinkExaminer',
	146	+ 'LinkWalker',
	147	+ 'Lipperhey Link Explorer',
	148	+ 'Lipperhey SEO Service',
	149	+ 'LongURL API',
	150	+ 'ltx71',
	151	+ 'lwp-trivial',
	152	+ 'MegaIndex\.ru',
	153	+ 'mabontland',
	154	+ 'MagpieRSS',
	155	+ 'Mediapartners-Google',
	156	+ 'MetaURI',
	157	+ 'Mnogosearch',
	158	+ 'mogimogi',
	159	+ 'Morning Paper',
	160	+ 'Mrcgiguy',
	161	+ 'MVAClient',
	162	+ 'netresearchserver',
	163	+ 'NewsGator',
	164	+ 'newsme',
	165	+ 'NG-Search',
	166	+ '^NING\\/',
	167	+ 'Notifixious',
	168	+ 'nutch',
	169	+ 'NutchCVS',
	170	+ 'Nymesis',
	171	+ 'oegp',
	172	+ 'online link validator',
	173	+ 'Online Website Link Checker',
	174	+ 'Orbiter',
	175	+ 'ow\.ly',
	176	+ 'Ploetz \+ Zeller',
	177	+ 'page2rss',
	178	+ 'panscient',
	179	+ 'Peew',
	180	+ 'phpcrawl',
	181	+ 'Pizilla',
	182	+ 'Plukkie',
	183	+ 'Pompos',
	184	+ 'postano',
	185	+ 'PostPost',
	186	+ 'postrank',
	187	+ 'proximic',
	188	+ 'PycURL',
	189	+ 'Python-httplib2',
	190	+ 'python-requests',
	191	+ 'Python-urllib',
	192	+ 'Qseero',
	193	+ 'Qwantify',
	194	+ 'Radian6',
	195	+ 'RebelMouse',
	196	+ 'REL Link Checker',
	197	+ 'RetrevoPageAnalyzer',
	198	+ 'Riddler',
	199	+ 'Robosourcer',
	200	+ 'Ruby',
	201	+ 'SandCrawler',
	202	+ 'SBIder',
	203	+ 'ScoutJet',
	204	+ 'ScoutURLMonitor',
	205	+ 'Scrapy',
	206	+ 'Scrubby',
	207	+ 'SearchSight',
	208	+ 'semanticdiscovery',
	209	+ 'SEOstats',
	210	+ 'Seznam screenshot-generator',
	211	+ 'ShopWiki',
	212	+ 'SiteBar',
	213	+ 'siteexplorer\.info',
	214	+ 'slider\.com',
	215	+ 'slurp',
	216	+ 'Snappy',
	217	+ 'sogou',
	218	+ 'speedy',
	219	+ 'Sqworm',
	220	+ 'StackRambler',
	221	+ 'Stratagems Kumo',
	222	+ 'summify',
	223	+ 'teoma',
	224	+ 'theoldreader\.com',
	225	+ 'TinEye',
	226	+ 'Traackr.com',
	227	+ 'truwoGPS',
	228	+ 'tweetedtimes\.com',
	229	+ 'Twikle',
	230	+ 'UnwindFetchor',
	231	+ 'updated',
	232	+ 'urlresolver',
	233	+ 'Validator\.nu\\/LV',
	234	+ 'Vagabondo',
	235	+ 'Vivante Link Checker',
	236	+ 'Vortex',
	237	+ 'voyager\\/',
	238	+ 'VYU2',
	239	+ 'W3C-checklink',
	240	+ 'W3C_CSS_Validator_JFouffa',
	241	+ 'W3C_I18n-Checker',
	242	+ 'W3C-mobileOK',
	243	+ 'W3C_Unicorn',
	244	+ 'W3C_Validator',
	245	+ 'WebIndex',
	246	+ 'Websquash\.com',
	247	+ 'webcollage',
	248	+ 'webmon ',
	249	+ 'WeSEE:Search',
	250	+ 'wf84',
	251	+ 'wget',
	252	+ 'WomlpeFactory',
	253	+ 'wotbox',
	254	+ 'Xenu Link Sleuth',
	255	+ 'XML Sitemaps Generator',
	256	+ 'Y!J-ASR',
	257	+ 'yacy',
	258	+ 'yacybot',
	259	+ 'Yahoo Link Preview',
	260	+ 'Yahoo! Slurp China',
	261	+ 'Yahoo! Slurp',
	262	+ 'YahooSeeker',
	263	+ 'YahooSeeker-Testing',
	264	+ 'YandexImages',
	265	+ 'YandexMetrika',
	266	+ 'yandex',
	267	+ 'yanga',
	268	+ 'yeti',
	269	+ 'yoogliFetchAgent',
	270	+ 'Zao',
	271	+ 'ZyBorg',
	272	+ '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
	273	+ );
274	274
275		- /**
276		- * All possible HTTP headers that represent the
277		- * User-Agent string.
278		- *
279		- * @var array
280		- */
281		- protected static $uaHttpHeaders = array(
282		- // The default User-Agent string.
283		- 'HTTP_USER_AGENT',
284		- // Header can occur on devices using Opera Mini.
285		- 'HTTP_X_OPERAMINI_PHONE_UA',
286		- // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
287		- 'HTTP_X_DEVICE_USER_AGENT',
288		- 'HTTP_X_ORIGINAL_USER_AGENT',
289		- 'HTTP_X_SKYFIRE_PHONE',
290		- 'HTTP_X_BOLT_PHONE_UA',
291		- 'HTTP_DEVICE_STOCK_UA',
292		- 'HTTP_X_UCBROWSER_DEVICE_UA',
293		- );
	275	+ /**
	276	+ * All possible HTTP headers that represent the
	277	+ * User-Agent string.
	278	+ *
	279	+ * @var array
	280	+ */
	281	+ protected static $uaHttpHeaders = array(
	282	+ // The default User-Agent string.
	283	+ 'HTTP_USER_AGENT',
	284	+ // Header can occur on devices using Opera Mini.
	285	+ 'HTTP_X_OPERAMINI_PHONE_UA',
	286	+ // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
	287	+ 'HTTP_X_DEVICE_USER_AGENT',
	288	+ 'HTTP_X_ORIGINAL_USER_AGENT',
	289	+ 'HTTP_X_SKYFIRE_PHONE',
	290	+ 'HTTP_X_BOLT_PHONE_UA',
	291	+ 'HTTP_DEVICE_STOCK_UA',
	292	+ 'HTTP_X_UCBROWSER_DEVICE_UA',
	293	+ );
294	294
295		- /**
296		- * Class constructor.
297		- */
298		- public function __construct(array $headers = null, $userAgent = null)
299		- {
300		- $this->setHttpHeaders($headers);
301		- $this->setUserAgent($userAgent);
302		- }
	295	+ /**
	296	+ * Class constructor.
	297	+ */
	298	+ public function __construct(array $headers = null, $userAgent = null)
	299	+ {
	300	+ $this->setHttpHeaders($headers);
	301	+ $this->setUserAgent($userAgent);
	302	+ }
303	303
304		- public function setHttpHeaders($httpHeaders = null)
305		- {
306		- // use global _SERVER if $httpHeaders aren't defined
307		- if (!is_array($httpHeaders) || !count($httpHeaders)) {
308		- $httpHeaders = $_SERVER;
309		- }
310		- // clear existing headers
311		- $this->httpHeaders = array();
312		- // Only save HTTP headers. In PHP land, that means only _SERVER vars that
313		- // start with HTTP_.
314		- foreach ($httpHeaders as $key => $value) {
315		- if (substr($key, 0, 5) === 'HTTP_') {
316		- $this->httpHeaders[$key] = $value;
317		- }
318		- }
319		- }
	304	+ public function setHttpHeaders($httpHeaders = null)
	305	+ {
	306	+ // use global _SERVER if $httpHeaders aren't defined
	307	+ if (!is_array($httpHeaders) || !count($httpHeaders)) {
	308	+ $httpHeaders = $_SERVER;
	309	+ }
	310	+ // clear existing headers
	311	+ $this->httpHeaders = array();
	312	+ // Only save HTTP headers. In PHP land, that means only _SERVER vars that
	313	+ // start with HTTP_.
	314	+ foreach ($httpHeaders as $key => $value) {
	315	+ if (substr($key, 0, 5) === 'HTTP_') {
	316	+ $this->httpHeaders[$key] = $value;
	317	+ }
	318	+ }
	319	+ }
320	320
321		- public function getUaHttpHeaders()
322		- {
323		- return self::$uaHttpHeaders;
324		- }
	321	+ public function getUaHttpHeaders()
	322	+ {
	323	+ return self::$uaHttpHeaders;
	324	+ }
325	325
326		- public function setUserAgent($userAgent = null)
327		- {
328		- if (false === empty($userAgent)) {
329		- return $this->userAgent = $userAgent;
330		- } else {
331		- $this->userAgent = null;
332		- foreach ($this->getUaHttpHeaders() as $altHeader) {
333		- if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
334		- $this->userAgent .= $this->httpHeaders[$altHeader].' ';
335		- }
336		- }
	326	+ public function setUserAgent($userAgent = null)
	327	+ {
	328	+ if (false === empty($userAgent)) {
	329	+ return $this->userAgent = $userAgent;
	330	+ } else {
	331	+ $this->userAgent = null;
	332	+ foreach ($this->getUaHttpHeaders() as $altHeader) {
	333	+ if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
	334	+ $this->userAgent .= $this->httpHeaders[$altHeader].' ';
	335	+ }
	336	+ }
337	337
338		- return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
339		- }
340		- }
	338	+ return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
	339	+ }
	340	+ }
341	341
342		- public function getRegex()
343		- {
344		- return '('.implode('|', self::$crawlers).')';
345		- }
	342	+ public function getRegex()
	343	+ {
	344	+ return '('.implode('|', self::$crawlers).')';
	345	+ }
346	346
347		- public function getIgnored()
348		- {
349		- return '('.implode('|', self::$ignore).')';
350		- }
	347	+ public function getIgnored()
	348	+ {
	349	+ return '('.implode('|', self::$ignore).')';
	350	+ }
351	351
352		- public function isCrawler($userAgent = null)
353		- {
354		- $agent = is_null($userAgent) ? $this->userAgent : $userAgent;
	352	+ public function isCrawler($userAgent = null)
	353	+ {
	354	+ $agent = is_null($userAgent) ? $this->userAgent : $userAgent;
355	355
356		- $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
	356	+ $agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
357	357
358		- $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
	358	+ $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
359	359
360		- if ($matches) {
361		- $this->matches = $matches;
362		- }
	360	+ if ($matches) {
	361	+ $this->matches = $matches;
	362	+ }
363	363
364		- return (bool) $result;
365		- }
	364	+ return (bool) $result;
	365	+ }
366	366
367		- public function getMatches()
368		- {
369		- return $this->matches[0];
370		- }
	367	+ public function getMatches()
	368	+ {
	369	+ return $this->matches[0];
	370	+ }
371	371	}

JayBizzle / Crawler-Detect

Pull Request — master (#42)

Status

Category

Indentation +346 added lines, -346 removed lines patch added patch discarded remove patch