CrawlerDetect::getRegex() - Code Metrics - Inspection of "Increase performance by over 50%" - JayBizzle/Crawler-Detect - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#42)

by Mark

created 2015-12-09 15:57 UTC

CrawlerDetect::getRegex() A

↳ Parent: CrawlerDetect

Complexity

Conditions	1
Paths	1

Size

Total Lines	4
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Importance

Changes	2
Bugs	1	Features	0

Metric	Value
c	2
b	1
f	0
dl	0
loc	4
rs	10
cc	1
eloc	2
nc	1
nop	0

<?php

namespace Jaybizzle\CrawlerDetect;

class CrawlerDetect
{
    /**
     * The user agent string assigned by setUserAgent(), or null when none
     * could be determined.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * The HTTP_* entries retained by setHttpHeaders(), keyed by header name.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * The preg_match() result array stored by isCrawler().
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Every entry is a regex fragment; getIgnored() joins them with "|" into a
     * single alternation that isCrawler() applies case-insensitively.
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        // The trailing "*" is required so the whole version ("MSIE 10.0") is
        // stripped, not just its first character.
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'iPhone',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

    /**
     * Regex fragments identifying known crawlers, bots and HTTP libraries.
     *
     * getRegex() joins the entries with "|" into one alternation that
     * isCrawler() runs case-insensitively with "/" as the delimiter, so a
     * literal "/" must be written as '\\/' and a literal "." as '\.'.
     * Alternatives are tried in list order, so more specific names are listed
     * before the shorter names they contain (e.g. 'Googlebot-Image' before
     * 'Googlebot'). The final entry is a generic catch-all.
     *
     * @var array
     */
    protected static $crawlers = array(
        '007ac9 Crawler',
        '008\\/',
        '360Spider',
        'A6-Indexer',
        'ABACHOBot',
        'AbiLogicBot',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddSugarSpiderBot',
        'AddThis',
        'Adidxbot',
        'ADmantX',
        'AdvBot',
        'AHC',
        'ahrefsbot',
        'aihitbot',
        'Airmail',
        'AISearchBot',
        'Anemone',
        'antibot',
        'AnyApexBot',
        'Applebot',
        'arabot',
        'Arachmo',
        'archive-com',
        'archive\.org_bot',
        'B-l-i-t-z-B-O-T',
        'backlinkcrawler',
        'baiduspider',
        'BecomeBot',
        'BeslistBot',
        'bibnum\.bnf',
        'biglotron',
        'BillyBobBot',
        'Bimbot',
        'bingbot',
        'binlar',
        'blekkobot',
        'blexbot',
        'BlitzBOT',
        'bl\.uk_lddc_bot',
        'bnf\.fr_bot',
        'boitho\.com-dc',
        'boitho\.com-robot',
        'brainobot',
        'btbot',
        'BUbiNG',
        'Butterfly\\/',
        'buzzbot',
        'BuzzSumo',
        'careerbot',
        'CatchBot',
        'CC Metadata Scaper',
        'ccbot',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'citeseerxbot',
        'coccoc',
        'classbot',
        'Commons-HttpClient',
        'content crawler spider',
        'Content Crawler',
        'convera',
        'ConveraCrawler',
        'CoPubbot',
        'cosmos',
        'Covario-IDS',
        'CrawlBot',
        'crawler4j',
        'CrystalSemanticsBot',
        'curl',
        'cXensebot',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'DiamondBot',
        'Digg',
        'discobot',
        'DomainAppender',
        'domaincrawler',
        'Domain Re-Animator Bot',
        'dotbot',
        'drupact',
        'DuckDuckBot',
        'EARTHCOM',
        'EasouSpider',
        'ec2linkfinder',
        'edisterbot',
        'ElectricMonk',
        'elisabot',
        'emailmarketingrobot',
        'Embedly',
        'EmeraldShield\.com WebBot',
        'envolk\[ITS\]spider',
        'EsperanzaBot',
        'europarchive\.org',
        'EventMachine HttpClient',
        'exabot',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Facebot',
        'FAST Enteprise Crawler',
        'FAST Enterprise Crawler',
        'FAST-WebCrawler',
        'FDSE robot',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'findxbot',
        'Flamingo_SearchEngine',
        'fluffy',
        'fr-crawler',
        'FRCrawler',
        'FurlBot',
        'FyberSpider',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'g2crawler',
        'Gaisbot',
        'GalaxyBot',
        'genieBot',
        'Genieo',
        'GermCrawler',
        'getprismatic\.com',
        'gigabot',
        'GingerCrawler',
        'Girafabot',
        'Gluten Free Crawler',
        'gnam gnam spider',
        'Go-http-client',
        'Googlebot-Image',
        'Googlebot-Mobile',
        'Googlebot',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'GrapeshotCrawler',
        'gslfbot',
        'GurujiBot',
        'HappyFunBot',
        'Healthbot',
        'heritrix',
        'hl_ftien_spider',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ia_archiver',
        'iaskspider',
        'iCCrawler',
        'ichiro',
        'igdeSpyder',
        'iisbot',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'Insitesbot',
        'integromedb',
        'intelium_bot',
        'InterfaxScanBot',
        'IODC',
        'IOI',
        'ip-web-crawler\.com',
        'ips-agent',
        'IRLbot',
        'IssueCrawler',
        'IstellaBot',
        'it2media-domain-crawler',
        'iZSearch',
        'Jaxified Bot',
        'JOC Web Spider',
        'jyxobot',
        'KoepaBot',
        'L\.webis',
        'LapozzBot',
        'Larbin',
        'lb-spider',
        'LDSpider',
        'LexxeBot',
        'libwww',
        'Linguee Bot',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinksManager\.com_bot',
        'LinkpadBot',
        'LinksCrawler',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'Livelapbot',
        'LongURL API',
        'lmspider',
        'lssbot',
        'lssrocketcrawler',
        'ltx71',
        'lufsbot',
        'lwp-trivial',
        'Mail\.RU_Bot',
        'MegaIndex\.ru',
        'mabontland',
        'magpie-crawler',
        'MagpieRSS',
        'Mediapartners-Google',
        'memorybot',
        'MetaURI',
        'MJ12bot',
        'mlbot',
        'Mnogosearch',
        'mogimogi',
        'MojeekBot',
        'Moreoverbot',
        'Morning Paper',
        'Mrcgiguy',
        'MSIECrawler',
        'msnbot',
        'msrbot',
        'MVAClient',
        'mxbot',
        'NerdByNature\.Bot',
        'NerdyBot',
        'netEstate NE Crawler',
        'netresearchserver',
        'NetSeer Crawler',
        'NewsGator',
        'newsme',
        'NextGenSearchBot',
        'NG-Search',
        'ngbot',
        'nicebot',
        'niki-bot',
        '^NING\\/',
        'Notifixious',
        'noxtrumbot',
        'Nusearch Spider',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'obot',
        'oegp',
        'ocrawler',
        'omgilibot',
        'OmniExplorer_Bot',
        'online link validator',
        'Online Website Link Checker',
        'OOZBOT',
        'openindexspider',
        'OpenWebSpider',
        'OrangeBot',
        'Orbiter',
        'ow\.ly',
        'PaperLiBot',
        'Pingdom\.com_bot',
        'Ploetz \+ Zeller',
        'page2rss',
        'PageBitesHyperBot',
        'panscient',
        'Peew',
        'PercolateCrawler',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'polybot',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'psbot',
        'purebot',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'QuerySeekerSpider',
        'Qwantify',
        'Radian6',
        'RAMPyBot',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'rogerbot',
        'Ruby',
        'RufusBot',
        'SandCrawler',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'ScreenerBot',
        'scribdbot',
        'Scrubby',
        'SearchmetricsBot',
        'SearchSight',
        'seekbot',
        'semanticdiscovery',
        'SemrushBot',
        'Sensis Web Crawler',
        'SEOChat::Bot',
        'seokicks-robot',
        'SEOstats',
        'Seznam screenshot-generator',
        'seznambot',
        'Shim-Crawler',
        'ShopWiki',
        'Shoula robot',
        'ShowyouBot',
        'SimpleCrawler',
        'sistrix crawler',
        'SiteBar',
        'sitebot',
        'siteexplorer\.info',
        'SklikBot',
        'slider\.com',
        'slurp',
        'smtbot',
        'Snappy',
        'sogou spider',
        'sogou',
        'Sosospider',
        'spbot',
        'Speedy Spider',
        'speedy',
        'SpiderMan',
        'Sqworm',
        'SSL-Crawler',
        'StackRambler',
        'Stratagems Kumo',
        'suggybot',
        'summify',
        'SurdotlyBot',
        'SurveyBot',
        'SynooBot',
        'tagoobot',
        'teoma',
        'TerrawizBot',
        'theoldreader\.com',
        'TheSuBot',
        'Thumbnail\.CZ robot',
        'TinEye',
        'toplistbot',
        'Traackr\.com',
        'trendictionbot',
        'TrueBot',
        'truwoGPS',
        'turnitinbot',
        'TweetedTimes Bot',
        'tweetedtimes\.com',
        'TweetmemeBot',
        'twengabot',
        'Twikle',
        'Twitterbot',
        'uMBot',
        'UnisterBot',
        'UnwindFetchor',
        'updated',
        'urlappendbot',
        'Urlfilebot',
        'urlresolver',
        'UsineNouvelleCrawler',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'voilabot',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'web-archive-net\.com\.bot',
        'Websquash\.com',
        'WeSEE:Ads\\/PageBot',
        'wbsearchbot',
        'webcollage',
        'webcompanycrawler',
        'webcrawler',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'wocbot',
        'WoFindeIch Robot',
        'WomlpeFactory',
        'woriobot',
        'wotbox',
        'Xaldon_WebSpider',
        'Xenu Link Sleuth',
        'xintellibot',
        'XML Sitemaps Generator',
        'XoviBot',
        'Y!J-ASR',
        'yacy',
        'yacybot',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexBot',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'Yasaklibot',
        'yeti',
        'YioopBot',
        'YisouSpider',
        'YodaoBot',
        'yoogliFetchAgent',
        'yoozBot',
        'YoudaoBot',
        'Zao',
        'Zealbot',
        'zspider',
        'ZyBorg',
        // Generic catch-all: any token ending in bot/crawler/archiver/
        // transcoder/spider, except "bot" directly preceded by "cu"
        // (so "cubot" alone is not flagged).
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * Names use the $_SERVER key form (HTTP_ prefix, upper case, underscores).
     * setUserAgent() checks them in this order and concatenates every
     * non-empty value it finds.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Build a detector from a set of request headers and/or an explicit
     * user agent string.
     *
     * The headers are stored first because the user agent is derived from
     * them whenever $userAgent is empty.
     *
     * @param array|null  $headers   Header map passed to setHttpHeaders().
     * @param string|null $userAgent User agent passed to setUserAgent().
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Order matters: setUserAgent() reads the headers stored here.
        $this->setHttpHeaders($headers);

        $this->setUserAgent($userAgent);
    }

    /**
     * Store the request headers, keeping only entries whose key starts
     * with "HTTP_".
     *
     * When $httpHeaders is not an array, or is an empty array, the $_SERVER
     * super-global is used instead. Any headers stored by an earlier call are
     * discarded. Passing the headers explicitly keeps the class independent
     * of global state and easier to test.
     *
     * @param array|null $httpHeaders Header map in $_SERVER form (e.g. 'HTTP_USER_AGENT' => '...').
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }
        // clear existing headers
        $this->httpHeaders = array();
        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return the list of header names that may carry a user agent string.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        $headerNames = self::$uaHttpHeaders;

        return $headerNames;
    }

    /**
     * Set the user agent to inspect.
     *
     * A non-empty $userAgent is stored as given. Otherwise the user agent is
     * assembled from the stored HTTP headers: every non-empty header named by
     * getUaHttpHeaders() is appended, separated by single spaces, and the
     * result is trimmed. If no such header is present the user agent is null.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent = null)
    {
        // An explicit user agent always wins over the headers.
        if (!empty($userAgent)) {
            $this->userAgent = $userAgent;

            return $this->userAgent;
        }

        $combined = '';
        foreach ($this->getUaHttpHeaders() as $headerName) {
            if (!empty($this->httpHeaders[$headerName])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
                $combined .= $this->httpHeaders[$headerName].' ';
            }
        }

        // Every appended piece ends with a space, so an empty string means
        // that no user agent header was found at all.
        if ($combined === '') {
            $this->userAgent = null;
        } else {
            $this->userAgent = trim($combined);
        }

        return $this->userAgent;
    }

    /**
     * Build the crawler-matching pattern: every entry of the crawler list
     * joined into one parenthesised alternation, without delimiters or flags.
     *
     * @return string
     */
    public function getRegex()
    {
        $alternatives = implode('|', self::$crawlers);

        return '('.$alternatives.')';
    }

    /**
     * Build the pattern of fragments to strip from a user agent: every entry
     * of the ignore list joined into one parenthesised alternation, without
     * delimiters or flags.
     *
     * @return string
     */
    public function getIgnored()
    {
        $alternatives = implode('|', self::$ignore);

        return '('.$alternatives.')';
    }

    /**
     * Check whether a user agent belongs to a known crawler.
     *
     * The ignore-list fragments are first stripped from the user agent, then
     * the remainder is tested case-insensitively against the crawler regex.
     * The stored match data is reset on every call, so it always describes
     * the most recent check.
     *
     * @param string|null $userAgent User agent to test; when null the stored
     *                               user agent is used.
     *
     * @return bool True when the user agent matches a crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // Drop the previous call's result so a non-match never leaves a
        // stale crawler name behind.
        $this->matches = array();

        // Cast so that a missing user agent (null) is treated as an empty
        // string instead of being passed to preg_replace() as null.
        $agent = preg_replace('/'.$this->getIgnored().'/i', '', (string) $agent);

        // Nothing left to inspect (or the replace failed): not a crawler.
        if ($agent === null || $agent === '') {
            return false;
        }

        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        if (!empty($matches)) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the text matched by the crawler regex.
     *
     * @return string|null The full match stored by isCrawler(), or null when
     *                     no match has been recorded.
     */
    public function getMatches()
    {
        // Guard the offset: the match array is empty until a crawler is found.
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}


1			<?php
2
3			namespace Jaybizzle\CrawlerDetect;
4
5			class CrawlerDetect
6			{
7			protected $userAgent = null;
8
9			protected $httpHeaders = array();
10
11			protected $matches = array();
12
13			/**
14			* List of strings to remove from the user agent before running the crawler regex
15			* Over a large list of user agents, this gives us about a 55% speed increase!
16			*
17			* @var array
18			*/
19			protected static $ignore = array(
20			'Safari.[\d\.]*',
21			'Firefox.[\d\.]*',
22			'Chrome.[\d\.]*',
23			'Chromium.[\d\.]*',
24			'MSIE.[\d\.]',
25			'Opera\/[\d\.]*',
26			'Mozilla.[\d\.]*',
27			'AppleWebKit.[\d\.]*',
28			'Trident.[\d\.]*',
29			'Windows NT.[\d\.]*',
30			'Macintosh.',
31			'Ubuntu',
32			'Linux',
33			'Intel',
34			'Mac OS X',
35			'Gecko.[\d\.]*',
36			'KHTML',
37			'iPhone',
38			'like Gecko',
39			'compatible',
40			'x86_..',
41			'i686',
42			'x64',
43			'X11',
44			'rv:[\d\.]*',
45			'Version.[\d\.]*',
46			'WOW64',
47			'Win64',
48			'\.NET CLR [\d\.]*',
49			'Presto.[\d\.]*',
50			'Media Center PC',
51			);
52
53			protected static $crawlers = array(
54			'007ac9 Crawler',
55			'008\\/',
56			'360Spider',
57			'A6-Indexer',
58			'ABACHOBot',
59			'AbiLogicBot',
60			'Aboundex',
61			'Accoona-AI-Agent',
62			'acoon',
63			'AddSugarSpiderBot',
64			'AddThis',
65			'Adidxbot',
66			'ADmantX',
67			'AdvBot',
68			'AHC',
69			'ahrefsbot',
70			'aihitbot',
71			'Airmail',
72			'AISearchBot',
73			'Anemone',
74			'antibot',
75			'AnyApexBot',
76			'Applebot',
77			'arabot',
78			'Arachmo',
79			'archive-com',
80			'archive\.org_bot',
81			'B-l-i-t-z-B-O-T',
82			'backlinkcrawler',
83			'baiduspider',
84			'BecomeBot',
85			'BeslistBot',
86			'bibnum\.bnf',
87			'biglotron',
88			'BillyBobBot',
89			'Bimbot',
90			'bingbot',
91			'binlar',
92			'blekkobot',
93			'blexbot',
94			'BlitzBOT',
95			'bl\.uk_lddc_bot',
96			'bnf\.fr_bot',
97			'boitho\.com-dc',
98			'boitho\.com-robot',
99			'brainobot',
100			'btbot',
101			'BUbiNG',
102			'Butterfly\\/',
103			'buzzbot',
104			'BuzzSumo',
105			'careerbot',
106			'CatchBot',
107			'CC Metadata Scaper',
108			'ccbot',
109			'Cerberian Drtrs',
110			'changedetection',
111			'Charlotte',
112			'clips\.ua\.ac\.be',
113			'CloudFlare-AlwaysOnline',
114			'citeseerxbot',
115			'coccoc',
116			'classbot',
117			'Commons-HttpClient',
118			'content crawler spider',
119			'Content Crawler',
120			'convera',
121			'ConveraCrawler',
122			'CoPubbot',
123			'cosmos',
124			'Covario-IDS',
125			'CrawlBot',
126			'crawler4j',
127			'CrystalSemanticsBot',
128			'curl',
129			'cXensebot',
130			'CyberPatrol',
131			'DataparkSearch',
132			'dataprovider',
133			'DiamondBot',
134			'Digg',
135			'discobot',
136			'DomainAppender',
137			'domaincrawler',
138			'Domain Re-Animator Bot',
139			'dotbot',
140			'drupact',
141			'DuckDuckBot',
142			'EARTHCOM',
143			'EasouSpider',
144			'ec2linkfinder',
145			'edisterbot',
146			'ElectricMonk',
147			'elisabot',
148			'emailmarketingrobot',
149			'Embedly',
150			'EmeraldShield\.com WebBot',
151			'envolk\[ITS\]spider',
152			'EsperanzaBot',
153			'europarchive\.org',
154			'EventMachine HttpClient',
155			'exabot',
156			'ezooms',
157			'eZ Publish Link Validator',
158			'facebookexternalhit',
159			'Facebot',
160			'FAST Enteprise Crawler',
161			'FAST Enterprise Crawler',
162			'FAST-WebCrawler',
163			'FDSE robot',
164			'Feedfetcher-Google',
165			'FeedValidator',
166			'FindLinks',
167			'findlink',
168			'findthatfile',
169			'findxbot',
170			'Flamingo_SearchEngine',
171			'fluffy',
172			'fr-crawler',
173			'FRCrawler',
174			'FurlBot',
175			'FyberSpider',
176			'g00g1e\.net',
177			'GigablastOpenSource',
178			'grub-client',
179			'g2crawler',
180			'Gaisbot',
181			'GalaxyBot',
182			'genieBot',
183			'Genieo',
184			'GermCrawler',
185			'getprismatic\.com',
186			'gigabot',
187			'GingerCrawler',
188			'Girafabot',
189			'Gluten Free Crawler',
190			'gnam gnam spider',
191			'Go-http-client',
192			'Googlebot-Image',
193			'Googlebot-Mobile',
194			'Googlebot',
195			'Google-HTTP-Java-Client',
196			'Google favicon',
197			'GrapeshotCrawler',
198			'gslfbot',
199			'GurujiBot',
200			'HappyFunBot',
201			'Healthbot',
202			'heritrix',
203			'hl_ftien_spider',
204			'Holmes',
205			'htdig',
206			'httpunit',
207			'httrack',
208			'ia_archiver',
209			'iaskspider',
210			'iCCrawler',
211			'ichiro',
212			'igdeSpyder',
213			'iisbot',
214			'InAGist',
215			'InfoWizards Reciprocal Link System PRO',
216			'Insitesbot',
217			'integromedb',
218			'intelium_bot',
219			'InterfaxScanBot',
220			'IODC',
221			'IOI',
222			'ip-web-crawler\.com',
223			'ips-agent',
224			'IRLbot',
225			'IssueCrawler',
226			'IstellaBot',
227			'it2media-domain-crawler',
228			'iZSearch',
229			'Jaxified Bot',
230			'JOC Web Spider',
231			'jyxobot',
232			'KoepaBot',
233			'L\.webis',
234			'LapozzBot',
235			'Larbin',
236			'lb-spider',
237			'LDSpider',
238			'LexxeBot',
239			'libwww',
240			'Linguee Bot',
241			'Link Valet',
242			'linkdex',
243			'LinkExaminer',
244			'LinksManager\.com_bot',
245			'LinkpadBot',
246			'LinksCrawler',
247			'LinkWalker',
248			'Lipperhey Link Explorer',
249			'Lipperhey SEO Service',
250			'Livelapbot',
251			'LongURL API',
252			'lmspider',
253			'lssbot',
254			'lssrocketcrawler',
255			'ltx71',
256			'lufsbot',
257			'lwp-trivial',
258			'Mail\.RU_Bot',
259			'MegaIndex\.ru',
260			'mabontland',
261			'magpie-crawler',
262			'MagpieRSS',
263			'Mediapartners-Google',
264			'memorybot',
265			'MetaURI',
266			'MJ12bot',
267			'mlbot',
268			'Mnogosearch',
269			'mogimogi',
270			'MojeekBot',
271			'Moreoverbot',
272			'Morning Paper',
273			'Mrcgiguy',
274			'MSIECrawler',
275			'msnbot',
276			'msrbot',
277			'MVAClient',
278			'mxbot',
279			'NerdByNature\.Bot',
280			'NerdyBot',
281			'netEstate NE Crawler',
282			'netresearchserver',
283			'NetSeer Crawler',
284			'NewsGator',
285			'newsme',
286			'NextGenSearchBot',
287			'NG-Search',
288			'ngbot',
289			'nicebot',
290			'niki-bot',
291			'^NING\\/',
292			'Notifixious',
293			'noxtrumbot',
294			'Nusearch Spider',
295			'nutch',
296			'NutchCVS',
297			'Nymesis',
298			'obot',
299			'oegp',
300			'ocrawler',
301			'omgilibot',
302			'OmniExplorer_Bot',
303			'online link validator',
304			'Online Website Link Checker',
305			'OOZBOT',
306			'openindexspider',
307			'OpenWebSpider',
308			'OrangeBot',
309			'Orbiter',
310			'ow\.ly',
311			'PaperLiBot',
312			'Pingdom\.com_bot',
313			'Ploetz \+ Zeller',
314			'page2rss',
315			'PageBitesHyperBot',
316			'panscient',
317			'Peew',
318			'PercolateCrawler',
319			'phpcrawl',
320			'Pizilla',
321			'Plukkie',
322			'polybot',
323			'Pompos',
324			'postano',
325			'PostPost',
326			'postrank',
327			'proximic',
328			'psbot',
329			'purebot',
330			'PycURL',
331			'Python-httplib2',
332			'python-requests',
333			'Python-urllib',
334			'Qseero',
335			'QuerySeekerSpider',
336			'Qwantify',
337			'Radian6',
338			'RAMPyBot',
339			'RebelMouse',
340			'REL Link Checker',
341			'RetrevoPageAnalyzer',
342			'Riddler',
343			'Robosourcer',
344			'rogerbot',
345			'Ruby',
346			'RufusBot',
347			'SandCrawler',
348			'SBIder',
349			'ScoutJet',
350			'ScoutURLMonitor',
351			'Scrapy',
352			'ScreenerBot',
353			'scribdbot',
354			'Scrubby',
355			'SearchmetricsBot',
356			'SearchSight',
357			'seekbot',
358			'semanticdiscovery',
359			'SemrushBot',
360			'Sensis Web Crawler',
361			'SEOChat::Bot',
362			'seokicks-robot',
363			'SEOstats',
364			'Seznam screenshot-generator',
365			'seznambot',
366			'Shim-Crawler',
367			'ShopWiki',
368			'Shoula robot',
369			'ShowyouBot',
370			'SimpleCrawler',
371			'sistrix crawler',
372			'SiteBar',
373			'sitebot',
374			'siteexplorer\.info',
375			'SklikBot',
376			'slider\.com',
377			'slurp',
378			'smtbot',
379			'Snappy',
380			'sogou spider',
381			'sogou',
382			'Sosospider',
383			'spbot',
384			'Speedy Spider',
385			'speedy',
386			'SpiderMan',
387			'Sqworm',
388			'SSL-Crawler',
389			'StackRambler',
390			'Stratagems Kumo',
391			'suggybot',
392			'summify',
393			'SurdotlyBot',
394			'SurveyBot',
395			'SynooBot',
396			'tagoobot',
397			'teoma',
398			'TerrawizBot',
399			'theoldreader.com',
400			'TheSuBot',
401			'Thumbnail\.CZ robot',
402			'TinEye',
403			'toplistbot',
404			'Traackr.com',
405			'trendictionbot',
406			'TrueBot',
407			'truwoGPS',
408			'turnitinbot',
409			'TweetedTimes Bot',
410			'tweetedtimes\.com',
411			'TweetmemeBot',
412			'twengabot',
413			'Twikle',
414			'Twitterbot',
415			'uMBot',
416			'UnisterBot',
417			'UnwindFetchor',
418			'updated',
419			'urlappendbot',
420			'Urlfilebot',
421			'urlresolver',
422			'UsineNouvelleCrawler',
423			'Validator\.nu\\/LV',
424			'Vagabondo',
425			'Vivante Link Checker',
426			'voilabot',
427			'Vortex',
428			'voyager\\/',
429			'VYU2',
430			'W3C-checklink',
431			'W3C_CSS_Validator_JFouffa',
432			'W3C_I18n-Checker',
433			'W3C-mobileOK',
434			'W3C_Unicorn',
435			'W3C_Validator',
436			'WebIndex',
437			'web-archive-net\.com\.bot',
438			'Websquash\.com',
439			'WeSEE:Ads\\/PageBot',
440			'wbsearchbot',
441			'webcollage',
442			'webcompanycrawler',
443			'webcrawler',
444			'webmon ',
445			'WeSEE:Search',
446			'wf84',
447			'wget',
448			'wocbot',
449			'WoFindeIch Robot',
450			'WomlpeFactory',
451			'woriobot',
452			'wotbox',
453			'Xaldon_WebSpider',
454			'Xenu Link Sleuth',
455			'xintellibot',
456			'XML Sitemaps Generator',
457			'XoviBot',
458			'Y!J-ASR',
459			'yacy',
460			'yacybot',
461			'Yahoo Link Preview',
462			'Yahoo! Slurp China',
463			'Yahoo! Slurp',
464			'YahooSeeker',
465			'YahooSeeker-Testing',
466			'YandexBot',
467			'YandexImages',
468			'YandexMetrika',
469			'yandex',
470			'yanga',
471			'Yasaklibot',
472			'yeti',
473			'YioopBot',
474			'YisouSpider',
475			'YodaoBot',
476			'yoogliFetchAgent',
477			'yoozBot',
478			'YoudaoBot',
479			'Zao',
480			'Zealbot',
481			'zspider',
482			'ZyBorg',
483			'[a-z0-9\\-_]*((?<!cu)bot\|crawler\|archiver\|transcoder\|spider)',
484			);
485
486			/**
487			* All possible HTTP headers that represent the
488			* User-Agent string.
489			*
490			* @var array
491			*/
492			protected static $uaHttpHeaders = array(
493			// The default User-Agent string.
494			'HTTP_USER_AGENT',
495			// Header can occur on devices using Opera Mini.
496			'HTTP_X_OPERAMINI_PHONE_UA',
497			// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
			0 ignored issues – show Unused Code Comprehensibility introduced 2015-11-25 13:32 UTC by Report Bug Copy Issue Report `38%` of this comment could be valid code. Did you maybe forget this after debugging? Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it. The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production. This check looks for comments that seem to be mostly valid code and reports them. Loading history...
498			'HTTP_X_DEVICE_USER_AGENT',
499			'HTTP_X_ORIGINAL_USER_AGENT',
500			'HTTP_X_SKYFIRE_PHONE',
501			'HTTP_X_BOLT_PHONE_UA',
502			'HTTP_DEVICE_STOCK_UA',
503			'HTTP_X_UCBROWSER_DEVICE_UA',
504			);
505
506			/**
507			* Class constructor.
508			*/
509			public function __construct(array $headers = null, $userAgent = null)
510			{
511			$this->setHttpHeaders($headers);
512			$this->setUserAgent($userAgent);
513			}
514
515			public function setHttpHeaders($httpHeaders = null)
			0 ignored issues – show Coding Style introduced 2015-11-25 13:32 UTC by Report Bug Copy Issue Report `setHttpHeaders` uses the super-global variable `$_SERVER` which is generally not recommended. Instead of super-globals, we recommend to explicitly inject the dependencies of your class. This makes your code less dependent on global state and it becomes generally more testable: // Bad class Router { public function generate($path) { return $_SERVER['HOST'].$path; } } // Better class Router { private $host; public function __construct($host) { $this->host = $host; } public function generate($path) { return $this->host.$path; } } class Controller { public function myAction(Request $request) { // Instead of $page = isset($_GET['page']) ? intval($_GET['page']) : 1; // Better (assuming you use the Symfony2 request) $page = $request->query->get('page', 1); } } Loading history...
516			{
517			// use global _SERVER if $httpHeaders aren't defined
518			if (!is_array($httpHeaders) \|\| !count($httpHeaders)) {
519			$httpHeaders = $_SERVER;
520			}
521			// clear existing headers
522			$this->httpHeaders = array();
523			// Only save HTTP headers. In PHP land, that means only _SERVER vars that
524			// start with HTTP_.
525			foreach ($httpHeaders as $key => $value) {
526			if (substr($key, 0, 5) === 'HTTP_') {
527			$this->httpHeaders[$key] = $value;
528			}
529			}
530			}
531
532			public function getUaHttpHeaders()
533			{
534			return self::$uaHttpHeaders;
535			}
536
537			public function setUserAgent($userAgent = null)
538			{
539			if (false === empty($userAgent)) {
540			return $this->userAgent = $userAgent;
541			} else {
542			$this->userAgent = null;
543			foreach ($this->getUaHttpHeaders() as $altHeader) {
544			if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. (Serban)
545			$this->userAgent .= $this->httpHeaders[$altHeader].' ';
546			}
547			}
548
549			return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
550			}
551			}
552
553			public function getRegex()
554			{
555			return '('.implode('\|', self::$crawlers).')';
556			}
557
558			public function getIgnored()
559			{
560			return '('.implode('\|', self::$ignore).')';
561			}
562
563			public function isCrawler($userAgent = null)
564			{
565			$agent = is_null($userAgent) ? $this->userAgent : $userAgent;
566
567			$agent = preg_replace('/'.$this->getIgnored().'/i', '', $agent);
568
569			$result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);
570
571			if ($matches) {
			0 ignored issues – show Bug Best Practice introduced 2015-11-25 13:32 UTC by Report Bug Copy Issue Report The expression `$matches` of type `string[]` is implicitly converted to a boolean; are you sure this is intended? If so, consider using `! empty($expr)` instead to make it clear that you intend to check for an array without elements. This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent. Consider making the comparison explicit by using `empty(..)` or `! empty(...)` instead. Loading history...
572			$this->matches = $matches;
573			}
574
575			return (bool) $result;
576			}
577
578			public function getMatches()
579			{
580			return $this->matches[0];
581			}
582			}
583

JayBizzle / Crawler-Detect

Pull Request — master (#42)

CrawlerDetect::getRegex() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like