1
|
|
|
const crawler = {

    // ---- crawl state ------------------------------------------------------
    que : [],              // sanitized urls waiting to be fetched
    tested : [],           // urls that have been fetched and tested
    crawling : [],         // urls currently in-flight
    failed : [],           // urls whose fetch failed or returned unusable html
    tests : [],            // registered tests: {name, title, callback, cont}
    ignore_paths : [],     // regex strings; matching urls are never queued
    crawl_id : undefined,  // identifier for this crawl run (required by init)
    events : {},           // event name -> array of callbacks (see on/trigger)
    linked_from : {},      // sanitized url -> list of urls that link to it
    redirects : {},        // requested url -> url the server actually returned
    useragent : 'desktop', // agent flag forwarded to the proxy endpoint

    /**
     * Register a test to run.
     *
     * NOTE: the method name keeps its historical typo ("regiser") because it
     * is part of the public interface; prefer register_test for new code.
     *
     * @param {string} name     unique test identifier
     * @param {string} title    human readable title for the results container
     * @param {Array} headers   column headers for the results container
     * @param {*} callable      test body; when not a function only the
     *                          results container is created
     * @returns {undefined}
     * @throws Exception (string) when name/title/headers are invalid
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // No callback: nothing to run later, just paint the container.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont:crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Correctly spelled alias for regiser_test (same contract).
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        return this.regiser_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false} the test record, or false when not registered
     */
    get_test_by_name: function(name){
        // BUG FIX: previously iterated "this.test" (undefined), so lookups
        // always returned false and duplicate test names were never detected
        // by regiser_test.
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling, if so and it hasn't
     * been added or crawled before, add it to the que
     *
     * Returns true|false if added to que
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // Check both the raw and the sanitized form: either may already be
        // known, external, a file, or on the ignore list.
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * Strips the scheme+host, leading/trailing slashes, any #fragment and a
     * trailing bare "?". An empty result becomes '/'.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },

    /**
     * Get the domain for the passed url
     *
     * Works with and without a scheme; any :port suffix is stripped.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * Each entry in ignore_paths is treated as a case-insensitive regex.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling
     * Note: Paths can be in regex format
     *
     * @param {string} path
     * @returns {crawler} chainable
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Path can be in regex format
     *
     * @param {Array} paths
     * @returns {crawler} chainable
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param {*} crawl_id
     * @returns {crawler} chainable
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
            !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        // A url is internal when it is empty, root/fragment relative, has no
        // scheme, survives sanitize() unchanged, or points at this hostname.
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

    /**
     * Checks if the href passed is an anchor link for url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if that target we requested matches the response we got.
     * If not mark as a redirect and append the redirect to be crawled
     *
     * @param {string} target   url we asked for
     * @param {string} response url the server actually served
     * @return {boolean} true when no redirect happened
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the que and run the tests on it.
     *
     * Up to 3 requests run concurrently ($.active > 2 guard); init() primes
     * two of them and each completed request pulls in the next url.
     *
     * @returns {boolean|*} false when nothing was fetched
     */
    fetch_and_test: function(){
        // Nothing queued, or the concurrency budget is used up.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            accepts: 'json',
            dataType: 'json'
        })
        .done(function( result ) {
            if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                var fetched = crawler.sanitize(result['url_fetched']);
                if(!crawler.check_fetched_url(url, fetched)){
                    // "this" is the shared ajax settings object, so the
                    // .always() handler below can see this skipped flag.
                    this.skipped = true;
                    return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                }

                var html = $(crawler.strip_img_src(result['body']));
                crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                crawler.fetch_links(html, url);
                crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
            }else{
                // BUG FIX: was crawler.failed(url) — "failed" is the array of
                // failed urls, not a function, so this branch used to throw.
                return crawler.failed_url(url);
            }
        })
        .fail( function(){
            return crawler.failed_url(url);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Redirected urls are skipped, not marked tested (see .done).
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            return crawler.fetch_and_test();
        });
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url  the page the links were found on
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            // linked_from is keyed by the sanitized link; record each
            // referring page only once.
            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests, firing before<name>/after<name> events
     * around each one.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            // Callbacks run with the test record itself as "this".
            this.tests[t]['callback'].apply(this.tests[t], [url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callback and pass on the data
     *
     * @param {string} event
     * @param {*} data  argument list applied to each registered callback
     * @return {undefined}
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register callback on action
     *
     * @param {string} event
     * @param {function} callback
     * @returns {undefined}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages when the body is parsed into a jQuery fragment.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url  path portion, e.g. '/seotest/getPageData?u=...'
     * @returns {string} absolute url on the current host
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string
     *
     * Note: counts space-separated tokens, so the count includes empty or
     * punctuation-only tokens produced by consecutive spaces.
     *
     * @param {string|array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object.
     * Values accumulate: repeated calls with the same property/key append
     * to a list rather than overwrite.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {undefined}
     */
    failed_url: function(url){
        this.failed.push(url);
        return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Start the crawler
     *
     * Applies recognised settings (crawl_id, ignore_paths), validates that a
     * crawl_id was supplied, primes two concurrent fetches and initialises
     * the results painter.
     *
     * @param {object} settings
     * @throws Exception (string) when crawl_id is missing
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // Prime two parallel workers; each completed fetch chains the next.
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
416
|
|
|
|