1
|
|
|
const crawler = {

    // Urls waiting to be crawled (sanitized, relative form).
    que : [],
    // Urls whose crawl + tests have completed.
    tested : [],
    // Urls currently being fetched.
    crawling : [],
    // Urls whose fetch failed or returned unusable content.
    failed : [],
    // Registered tests: {name, title, callback, cont} (see regiser_test()).
    tests : [],
    // Path patterns (regex strings) that should never be crawled.
    ignore_paths : [],
    // Identifier for the current crawl session; must be set before init().
    crawl_id : undefined,
    // Event name -> array of callbacks (see on()/trigger()).
    events : {},
    // Sanitized url -> array of urls that link to it (see fetch_links()).
    linked_from : {},
    // Agent flag sent to the fetch proxy.
    useragent : 'desktop',
13
|
|
|
|
14
|
|
|
    /**
     * Register a test to run.
     * NOTE(review): the method name is misspelled ("regiser") but is kept
     * as-is because external callers may depend on it.
     *
     * @param {string} name - Unique test identifier; must not collide with an already registered test.
     * @param {string} title - Human-readable title passed to the painter.
     * @param {Array} headers - Non-empty array of column headers for the result table.
     * @param {function} callable - Test callback; when not a function, only the
     *     painter container is created and returned (no test is registered).
     * @returns {undefined} when the test is registered; otherwise the painter container.
     * @throws Exception - a plain string when name/title/headers are invalid.
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Non-function callable: create the display container only.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont:crawler_painter.create(name, title, headers)});
        return undefined;
    },
32
|
|
|
|
33
|
|
|
/** |
34
|
|
|
* Return a registered test by name |
35
|
|
|
* |
36
|
|
|
* @param {string} name |
37
|
|
|
* @returns {object|false} |
38
|
|
|
*/ |
39
|
|
|
get_test_by_name: function(name){ |
40
|
|
|
for(var t in this.test) if(this.tests[t]['name'] == name) return this.tests[t]; |
41
|
|
|
return false; |
42
|
|
|
}, |
43
|
|
|
|
44
|
|
|
/** |
45
|
|
|
* Check if the url passed is valid for crawling, if so and it hasn't |
46
|
|
|
* been added or crawled before, add it to the que |
47
|
|
|
* |
48
|
|
|
* Returns true|false if added to que |
49
|
|
|
* |
50
|
|
|
* @param {string} url |
51
|
|
|
* @returns {boolean} |
52
|
|
|
*/ |
53
|
|
|
que_url: function(url){ |
54
|
|
|
var sanitized = this.sanitize(url); |
55
|
|
|
if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false; |
56
|
|
|
this.que.push(sanitized); |
57
|
|
|
return true; |
58
|
|
|
}, |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* Clean up a url so it becomes relative and standardized |
62
|
|
|
* |
63
|
|
|
* @param {string} url |
64
|
|
|
* @returns {string} |
65
|
|
|
*/ |
66
|
|
|
sanitize: function(url){ |
67
|
|
|
if(url == undefined) return ''; |
68
|
|
|
|
69
|
|
|
url = url |
70
|
|
|
.replace(/^\/|\/$/g, '') |
71
|
|
|
.replace(/https?:\/\/[^\/]+/i, '') |
72
|
|
|
.replace(/^\/|\/$/g, '') |
73
|
|
|
.split('#')[0]; |
74
|
|
|
|
75
|
|
|
if( url.slice(-1) == '?' ) url = url.slice(0, -1); |
76
|
|
|
if( url.length < 1 ) url = '/'; |
77
|
|
|
|
78
|
|
|
return url; |
79
|
|
|
}, |
80
|
|
|
|
81
|
|
|
/** |
82
|
|
|
* Get the domain for the passed url |
83
|
|
|
* |
84
|
|
|
* @param {string} url |
85
|
|
|
* @returns {string} |
86
|
|
|
*/ |
87
|
|
|
get_domain: function(url){ |
88
|
|
|
if( !url ) return ''; |
89
|
|
|
if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0]; |
90
|
|
|
else return url.split('/')[0].split(':')[0]; |
91
|
|
|
}, |
92
|
|
|
|
93
|
|
|
/** |
94
|
|
|
* Checks if the passed url should be ignored or not |
95
|
|
|
* |
96
|
|
|
* @param {string} url |
97
|
|
|
* @returns {boolean} |
98
|
|
|
*/ |
99
|
|
|
ignore_url: function( url ){ |
100
|
|
|
for(var regex in this.ignore_paths) { |
101
|
|
|
var reg = new RegExp(this.ignore_paths[regex], 'i'); |
102
|
|
|
if( url.match(reg) != null ) return true; |
103
|
|
|
} |
104
|
|
|
return false; |
105
|
|
|
}, |
106
|
|
|
|
107
|
|
|
/** |
108
|
|
|
* Add a path to ignore when crawler |
109
|
|
|
* Note: Paths can be in regex format |
110
|
|
|
* |
111
|
|
|
* @param {string} path |
112
|
|
|
* @returns {crawler} |
113
|
|
|
*/ |
114
|
|
|
add_ignore_path: function(path){ |
115
|
|
|
this.ignore_paths.push(path); |
116
|
|
|
return this; |
117
|
|
|
}, |
118
|
|
|
|
119
|
|
|
/** |
120
|
|
|
* Update all ignore paths to the paths specified |
121
|
|
|
* Note: Path can be in regex format |
122
|
|
|
* |
123
|
|
|
* @param paths |
124
|
|
|
* @returns {crawler} |
125
|
|
|
*/ |
126
|
|
|
set_ignore_paths: function(paths){ |
127
|
|
|
this.ignore_paths = paths; |
128
|
|
|
return this; |
129
|
|
|
}, |
130
|
|
|
|
131
|
|
|
/** |
132
|
|
|
* Sets the crawl id |
133
|
|
|
* |
134
|
|
|
* @param crawl_id |
135
|
|
|
* @returns {crawler} |
136
|
|
|
*/ |
137
|
|
|
set_crawl_id: function(crawl_id){ |
138
|
|
|
this.crawl_id = crawl_id; |
139
|
|
|
return this; |
140
|
|
|
}, |
141
|
|
|
|
142
|
|
|
/** |
143
|
|
|
* Does some soft checks to determine if url is a valid candidate for crawling |
144
|
|
|
* |
145
|
|
|
* @param {string} url |
146
|
|
|
* @returns {boolean} |
147
|
|
|
*/ |
148
|
|
|
can_crawl: function(url){ |
149
|
|
|
if(url == undefined) return false; |
150
|
|
|
return !(this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 || |
151
|
|
|
this.is_file(url) || this.ignore_url(url) || this.is_external(url)); |
152
|
|
|
}, |
153
|
|
|
|
154
|
|
|
/** |
155
|
|
|
* Does a soft check for the url passed and checks if it's a file |
156
|
|
|
* by checking if it has an extension and if the extension contains 'html' |
157
|
|
|
* |
158
|
|
|
* @param {string} url |
159
|
|
|
* @returns {boolean} |
160
|
|
|
*/ |
161
|
|
|
is_file: function(url){ |
162
|
|
|
var split = this.sanitize( url ).split( '.' ); |
163
|
|
|
return split.length > 1 && split.pop().indexOf( 'html' ) < 0; |
164
|
|
|
}, |
165
|
|
|
|
166
|
|
|
/** |
167
|
|
|
* Does some soft checking for the url passed to see if it's external |
168
|
|
|
* Note: If the url is internal but redirects to an external source, we wown't detect it here |
169
|
|
|
* |
170
|
|
|
* @param {string} url |
171
|
|
|
* @returns {boolean} |
172
|
|
|
*/ |
173
|
|
|
is_external: function(url){ |
174
|
|
|
return !( |
175
|
|
|
url.length < 1 || |
176
|
|
|
url[0] == '/' || |
177
|
|
|
url[0] == '#' || |
178
|
|
|
url.indexOf('://') < 0 || |
179
|
|
|
url == this.sanitize( url ) || |
180
|
|
|
this.get_domain( url ) == location.hostname |
181
|
|
|
); |
182
|
|
|
}, |
183
|
|
|
|
184
|
|
|
/** |
185
|
|
|
* Checks if the href passed is an anchor link for url passed. |
186
|
|
|
* |
187
|
|
|
* @param {string} href |
188
|
|
|
* @param {string} url |
189
|
|
|
* @return {boolean} |
190
|
|
|
*/ |
191
|
|
|
is_anchor: function(href, url){ |
192
|
|
|
return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url); |
193
|
|
|
}, |
194
|
|
|
|
195
|
|
|
/** |
196
|
|
|
* Fetch the next url from the que and run the tests on it |
197
|
|
|
*/ |
198
|
|
|
fetch_and_test: function(){ |
199
|
|
|
if( !this.que || this.que.length < 1 || this.que.length < 1 || $.active > 2 ) return false; |
200
|
|
|
|
201
|
|
|
var url = this.que.pop(); |
202
|
|
|
this.crawling.push(url); |
203
|
|
|
|
204
|
|
|
$.ajax({ |
205
|
|
|
url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json' |
206
|
|
|
}) |
207
|
|
|
.done(function( result ) { |
208
|
|
|
if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) { |
209
|
|
|
if( !crawler.is_external(result['url_fetched']) ) { |
210
|
|
|
url = crawler.sanitize(result['url_fetched']); |
211
|
|
|
if(crawler.tested.indexOf(url) >= 0){ |
212
|
|
|
this.skipped = true; |
213
|
|
|
return true; |
214
|
|
|
} |
215
|
|
|
|
216
|
|
|
var html = $(crawler.strip_img_src(result['body'])); |
217
|
|
|
crawler.trigger('CRAWL_BEFORE_TESTS', [url]); |
218
|
|
|
crawler.fetch_links(html, url); |
219
|
|
|
crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']); |
220
|
|
|
crawler.trigger('CRAWL_AFTER_TESTS', [url]); |
221
|
|
|
return true; |
222
|
|
|
} |
223
|
|
|
} |
224
|
|
|
crawler.failed.push(url); |
225
|
|
|
return crawler.trigger('CRAWL_LOAD_FAILED', [url]); |
226
|
|
|
}) |
227
|
|
|
.fail( function(){ |
228
|
|
|
crawler.failed.push(url); |
229
|
|
|
return crawler.trigger('CRAWL_LOAD_FAILED', [url]); |
230
|
|
|
}) |
231
|
|
|
.always( function(){ |
232
|
|
|
if((this.hasOwnProperty('skipped') && this.skipped) || crawler.tested.indexOf(url) < 0 ) { |
233
|
|
|
crawler.tested.push(url) |
234
|
|
|
} |
235
|
|
|
return crawler.trigger('CRAWL_FINISHED', [url]); |
236
|
|
|
}); |
|
|
|
|
237
|
|
|
}, |
238
|
|
|
|
239
|
|
|
/** |
240
|
|
|
* Check for links in the html of the rendered page so we add them to the que |
241
|
|
|
* and also map how pages are linked to each other |
242
|
|
|
* |
243
|
|
|
* @param {jQuery} html |
244
|
|
|
* @param {string} url |
245
|
|
|
*/ |
246
|
|
|
fetch_links: function(html, url){ |
247
|
|
|
$.each(html.find('a'), function(){ |
248
|
|
|
var href = $(this).attr('href'), |
249
|
|
|
link = crawler.sanitize(href); |
250
|
|
|
|
251
|
|
|
crawler.que_url( href ); |
252
|
|
|
|
253
|
|
|
if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url]; |
254
|
|
|
else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url); |
255
|
|
|
}); |
256
|
|
|
}, |
257
|
|
|
|
258
|
|
|
/** |
259
|
|
|
* Run the registered tests |
260
|
|
|
* |
261
|
|
|
* @param {string} url |
262
|
|
|
* @param {jQuery} html |
263
|
|
|
* @param {Array} headers |
264
|
|
|
* @param {Array} field_data |
265
|
|
|
* @param {Array} phrases |
266
|
|
|
*/ |
267
|
|
|
run_tests: function(url, html, headers, field_data, phrases){ |
268
|
|
|
for(var t in this.tests) { |
269
|
|
|
this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]); |
270
|
|
|
this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]); |
271
|
|
|
this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]); |
272
|
|
|
} |
273
|
|
|
}, |
274
|
|
|
|
275
|
|
|
/** |
276
|
|
|
* Trigger event callback and pass on the data |
277
|
|
|
* |
278
|
|
|
* @param {string} event |
279
|
|
|
* @param {*} data |
280
|
|
|
*/ |
281
|
|
|
trigger: function(event, data){ |
282
|
|
|
if(this.events.hasOwnProperty(event)) |
283
|
|
|
for(var e in this.events[event]) this.events[event][e].apply(this, data); |
284
|
|
|
}, |
285
|
|
|
|
286
|
|
|
/** |
287
|
|
|
* Register callback on action |
288
|
|
|
* |
289
|
|
|
* @param {string} event |
290
|
|
|
* @param {function} callback |
291
|
|
|
* @returns {crawler} |
292
|
|
|
*/ |
293
|
|
|
on: function(event, callback){ |
294
|
|
|
if(!this.events.hasOwnProperty(event)) this.events[event] = []; |
295
|
|
|
this.events[event].push(callback); |
296
|
|
|
}, |
297
|
|
|
|
298
|
|
|
    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages when the body is parsed into a jQuery fragment.
     *
     * NOTE(review): the character class ['|"] also matches a literal '|'
     * — presumably unintended but harmless for normal markup; and the
     * pattern removes src attributes on any tag, not just <img>.
     *
     * @param {string} html - raw page markup
     * @returns {string} markup with src="..."/src='...' attributes removed
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },
308
|
|
|
|
309
|
|
|
/** |
310
|
|
|
* Return the proxy url to test the passed url |
311
|
|
|
* |
312
|
|
|
* @param {$string} url |
313
|
|
|
* @returns {string} |
314
|
|
|
*/ |
315
|
|
|
get_proxy: function(url){ |
316
|
|
|
return location.protocol + '//' + location.hostname + '/seotest/getPageData?u='+url; |
317
|
|
|
}, |
318
|
|
|
|
319
|
|
|
/** |
320
|
|
|
* @see crawler_painter.add_row(name, data) |
321
|
|
|
* @param {string} name |
322
|
|
|
* @param {Array} data |
323
|
|
|
*/ |
324
|
|
|
add_row: function(name, data){ |
325
|
|
|
crawler_painter.add_row(name, data); |
326
|
|
|
}, |
327
|
|
|
|
328
|
|
|
/** |
329
|
|
|
* Returns the word count for a given set of sentences or string |
330
|
|
|
* |
331
|
|
|
* @param {string|array} data |
332
|
|
|
* @returns {number} |
333
|
|
|
*/ |
334
|
|
|
get_word_count: function(data){ |
335
|
|
|
if( typeof data === 'string' ) return data.split(' ').length; |
336
|
|
|
|
337
|
|
|
var count = 0; |
338
|
|
|
for( var d in data ) count += data[d].split(' ').length; |
339
|
|
|
return count; |
340
|
|
|
}, |
341
|
|
|
|
342
|
|
|
/** |
343
|
|
|
* Set an arbitrary property on the crawler object |
344
|
|
|
* |
345
|
|
|
* @param {string} property |
346
|
|
|
* @param {string|int} key |
347
|
|
|
* @param {*} val |
348
|
|
|
* @return undefined |
349
|
|
|
*/ |
350
|
|
|
set_property: function(property, key, val){ |
351
|
|
|
if(!this.hasOwnProperty(property)) this[property] = {}; |
352
|
|
|
if(!this[property].hasOwnProperty(key)) this[property][key] = [val]; |
353
|
|
|
else this[property][key].push(val); |
354
|
|
|
}, |
355
|
|
|
|
356
|
|
|
/** |
357
|
|
|
* Start the crawler |
358
|
|
|
* |
359
|
|
|
* @param {object} settings |
360
|
|
|
* @throws Exception |
361
|
|
|
*/ |
362
|
|
|
init: function(settings){ |
363
|
|
|
this.trigger('BEFORE_INIT', []); |
364
|
|
|
|
365
|
|
|
if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']); |
366
|
|
|
if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']); |
367
|
|
|
|
368
|
|
|
if( !this.crawl_id ) throw "crawl_id must be specified"; |
369
|
|
|
|
370
|
|
|
// When a crawl finishes, start a new one if there are any more urls to go through else stop the auto-restart |
371
|
|
|
this.on('CRAWL_FINISHED', function(){ |
372
|
|
|
if( crawler.que.length > 0 ) crawler.fetch_and_test(); |
373
|
|
|
else window.clearInterval(crawler.interval); |
374
|
|
|
}); |
375
|
|
|
|
376
|
|
|
// Every second try to initialize a new crawl request just in-case something crashes |
377
|
|
|
this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000); |
378
|
|
|
|
379
|
|
|
crawler_painter.init(); |
380
|
|
|
this.trigger('AFTER_INIT', []); |
381
|
|
|
} |
382
|
|
|
}; |
383
|
|
|
|