const crawler = {

    que          : [],        // urls waiting to be crawled
    tested       : [],        // urls that have been crawled and tested
    crawling     : [],        // urls currently being fetched
    failed       : [],        // urls that failed to load
    tests        : [],        // registered tests
    ignore_paths : [],        // path patterns (regex) to skip
    crawl_id     : undefined, // identifier for the current crawl
    events       : {},        // event name => array of callbacks
    linked_from  : {},        // url => array of urls that link to it
    redirects    : {},        // requested url => url actually served
    useragent    : 'desktop', // user agent profile sent to the proxy

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} [callable]
     * @returns {*} undefined, or the painter container when no callback is given
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

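    // Example (illustrative only; the test name and headers are hypothetical):
    // register a test that reports how many links each crawled page contains.
    // The callback receives the container created by crawler_painter, then the
    // page data passed in by run_tests().
    //
    //   crawler.register_test('links', 'Link count', ['URL', 'Links'], function(cont, url, html, headers){
    //       crawler.add_row('links', [url, html.find('a').length]);
    //   });
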
    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check that the passed url is a valid crawl candidate and, if it hasn't
     * been queued or crawled before, add it to the queue.
     *
     * Returns true if the url was added to the queue.
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // can_crawl() already checks the queue, so testing both the raw and
        // the sanitized form covers both ways a url can reach us.
        if( !this.can_crawl(url) || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')           // strip leading/trailing slashes
            .replace(/https?:\/\/[^\/]+/i, '') // strip scheme and host
            .replace(/^\/|\/$/g, '')           // strip slashes again after host removal
            .split('#')[0];                    // drop any fragment

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },

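    // Illustrative sanitize() results (assumed inputs, traced through the
    // replacements above):
    //
    //   crawler.sanitize('https://example.com/about/') -> 'about'
    //   crawler.sanitize('/pricing#plans')             -> 'pricing'
    //   crawler.sanitize('https://example.com/')       -> '/'
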
    /**
     * Get the domain for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

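    // Illustrative get_domain() results (assumed inputs):
    //
    //   crawler.get_domain('https://example.com:8080/a/b') -> 'example.com'
    //   crawler.get_domain('example.com/a/b')              -> 'example.com'
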
    /**
     * Checks if the passed url should be ignored.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling.
     * Note: Paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Replace all ignore paths with the paths specified.
     * Note: Paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

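    // Example (hypothetical patterns): skip logout links and anything under
    // admin/. ignore_url() compiles each entry case-insensitively.
    //
    //   crawler.add_ignore_path('^logout')
    //          .add_ignore_path('^admin/');
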
    /**
     * Sets the crawl id.
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if the url is a valid candidate for crawling.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
               !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Soft check for whether the passed url points to a file: it counts as a
     * file if it has an extension and that extension does not contain 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

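    // Illustrative is_file() results (assumed inputs):
    //
    //   crawler.is_file('assets/logo.png') -> true
    //   crawler.is_file('about.html')      -> false
    //   crawler.is_file('about')           -> false
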
    /**
     * Does some soft checking on the passed url to see if it's external.
     * Note: If the url is internal but redirects to an external source, we
     * won't detect it here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

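    // Illustrative is_external() results, assuming the crawler runs on
    // https://example.com:
    //
    //   crawler.is_external('https://other.com/page')   -> true
    //   crawler.is_external('https://example.com/page') -> false
    //   crawler.is_external('/page')                    -> false
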
    /**
     * Checks if the passed href is an anchor link for the passed url.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check that the target we requested matches the response we got.
     * If not, record it as a redirect and queue the redirect target to
     * be crawled.
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean}
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the queue and run the tests on it.
     */
    fetch_and_test: function(){
        // $.active is jQuery's count of in-flight requests; checking it caps
        // how many fetches run in parallel.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            dataType: 'json'
        })
        .done(function( result ) {
            if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                var fetched = crawler.sanitize(result['url_fetched']);
                if(!crawler.check_fetched_url(url, fetched)){
                    // 'this' is the shared ajax settings object, so the flag
                    // is visible to the .always() handler below.
                    this.skipped = true;
                    return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                }

                var html = $(crawler.strip_img_src(result['body']));
                crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                crawler.fetch_links(html, url);
                crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
            }else{
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            }
        })
        .fail( function(){
            crawler.failed.push(url);
            return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Redirected urls were skipped, not tested, so don't mark them.
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            // Chain straight into the next queued url.
            return crawler.fetch_and_test();
        });
    },

    /**
     * Scan the html of the rendered page for links, queue them for crawling,
     * and record how pages link to each other.
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger an event's callbacks and pass on the data.
     *
     * @param {string} event
     * @param {*} data
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register a callback for an event.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        return this;
    },

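    // Example (illustrative): subscribe to the lifecycle events emitted from
    // fetch_and_test(). trigger() invokes each handler with the crawler as
    // 'this' and the event data spread as arguments.
    //
    //   crawler.on('CRAWL_FOUND_REDIRECT', function(url, fetched){
    //       console.log(url + ' redirected to ' + fetched);
    //   }).on('ALL_CRAWLS_FINISHED', function(){
    //       console.log('done; failed urls:', crawler.failed);
    //   });
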
    /**
     * Strip out src="..." attributes so that parsing the html doesn't make
     * the browser load the pages' images.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        // The back-reference makes sure the closing quote matches the opener.
        return html.replace( /src\s*=\s*(['"]).*?\1/ig, '' );
    },

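    // Illustrative effect (assumed input):
    //
    //   crawler.strip_img_src('<img src="/a.png" alt="x">') -> '<img  alt="x">'
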
    /**
     * Return the proxy url used to test the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given string or set of sentences.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

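    // Illustrative results (assumed inputs; the count is naive, splitting on
    // single spaces only):
    //
    //   crawler.get_word_count('three word phrase')       -> 3
    //   crawler.get_word_count(['one two', 'three four']) -> 4
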
    /**
     * Set an arbitrary keyed property on the crawler object. Values for the
     * same key accumulate in an array.
     *
     * @param {string} property
     * @param {string|number} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

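    // Illustrative behavior (assumed property and key names):
    //
    //   crawler.set_property('notes', 'about', 'missing h1');
    //   crawler.set_property('notes', 'about', 'thin copy');
    //   // crawler.notes -> { about: ['missing h1', 'thin copy'] }
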
    /**
     * Start the crawler.
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // Two calls kick off two parallel fetch workers; each one chains into
        // the next queued url from its .always() handler.
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
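// Putting it together (illustrative sketch; crawler_painter is assumed to be
// loaded first, the crawl_id and ignore pattern are hypothetical, and the
// queue needs a starting url before init() fires the first fetches):
//
//   crawler.que_url('/');
//   crawler.on('ALL_CRAWLS_FINISHED', function(){
//       console.log('crawl complete:', crawler.tested.length, 'pages tested,',
//                   crawler.failed.length, 'failed');
//   });
//   crawler.init({ crawl_id: 42, ignore_paths: ['^logout'] });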