const crawler = {

    que : [],                    // urls waiting to be crawled
    tested : [],                 // urls that have been crawled and tested
    crawling : [],               // urls currently being fetched
    failed : [],                 // urls that failed to load
    tests : [],                  // registered test definitions
    ignore_paths : [],           // regex strings for paths to skip
    crawl_id : undefined,        // identifier for the current crawl
    linked_from : {},            // maps each url to the pages that link to it
    redirects : {},              // maps each requested url to its redirect target
    useragent : 'desktop',       // user agent profile sent to the fetch proxy
    event_handler : crawler_event_handler, // declared elsewhere
    painter : crawler_painter,             // declared elsewhere

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Without a callback there is nothing to run; just create the output container.
        if(typeof callable != 'function') return this.painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: this.painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so, and it hasn't
     * been added or crawled before, add it to the queue.
     *
     * Returns true if the url was added to the queue, false otherwise.
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')           // trim leading/trailing slashes
            .replace(/https?:\/\/[^\/]+/i, '') // drop the protocol and host
            .replace(/^\/|\/$/g, '')           // trim again after removing the host
            .split('#')[0];                    // drop any fragment

        if( url.slice(-1) == '?' ) url = url.slice(0, -1); // drop an empty query string
        if( url.length < 1 ) url = '/';

        return url;
    },

    /**
     * Get the domain for the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Paths may be given as regex strings
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if the url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
            !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check to decide whether the passed url points to a file,
     * by checking if it has an extension and whether that extension contains 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking on the passed url to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if the target we requested matches the response we got.
     * If not, record it as a redirect and queue the redirect target to be crawled.
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean}
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Checks if the string passed is an html page
     *
     * @param {string} html
     * @returns {boolean}
     */
    is_html: function(html){
        return html.indexOf('<head') > 0 && html.indexOf('<body') > 0;
    },

    /**
     * Fetch the next url from the queue and run the tests on it
     */
    fetch_and_test: function(){
        // Throttle: wait if the queue is empty or too many requests are in flight.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u=' + encodeURIComponent(url) ),
            data: { agent: this.useragent },
            dataType: 'json'
        })
        .done(function( result ) {
            var fetched = crawler.sanitize(result['url_fetched']);
            if( !result['headers'] || !result['body'] ) {
                return crawler.failed_url(url);
            }else if(!crawler.check_fetched_url(url, fetched)){
                // 'this' is the ajax settings object, which jQuery shares with .always() below.
                this.skipped = true;
                return crawler.event_handler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
            }else if(crawler.is_html(result['body'])){
                var html = $(crawler.strip_img_src(result['body']));
                crawler.fetch_links(html, url);
                return crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
            }else{
                this.skipped = true;
            }
        })
        .fail( function(){
            return crawler.failed_url(url);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Only count the url as tested if it wasn't skipped as a redirect or non-html page.
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.event_handler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.event_handler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            return crawler.fetch_and_test();
        });
    },

    /**
     * Look for links in the html of the rendered page so we can add them to the queue,
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {undefined}
     */
    run_tests: function(url, html, headers, field_data, phrases){
        this.event_handler.trigger('CRAWL_BEFORE_TESTS', [url]);
        for(var t in this.tests) {
            this.event_handler.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [url, html, headers, field_data, phrases]);
            this.event_handler.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
        return this.event_handler.trigger('CRAWL_AFTER_TESTS', [url]);
    },

    /**
     * Strip out src="<anything>" so that we avoid loading the images
     * on the pages
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /src\s*=\s*(['"]).*?\1/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * Returns the word count for a given string or set of sentences
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object,
     * collecting repeated values for the same key into an array
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {undefined}
     */
    failed_url: function(url){
        this.failed.push(url);
        return this.event_handler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Triggered every second
     *
     * @returns {undefined}
     */
    loop: function(){
        this.event_handler.trigger('CRAWLER_LOOP', [this]);
        this.fetch_and_test();
        return undefined;
    },

    /**
     * Start the crawler
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.event_handler.trigger('BEFORE_INIT', [this]);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        this.interval = setInterval(function(){ crawler.loop(); }, 1000);
        this.event_handler.on('ALL_CRAWLS_FINISHED', function(){ window.clearInterval( crawler.interval ); });

        this.painter.init();
        this.event_handler.trigger('AFTER_INIT', [this]);
    }
};

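For context, here is a minimal usage sketch. The test name, title, headers, and the way results are written into this.cont are hypothetical; they stand in for whatever API crawler_painter actually exposes.

// Hypothetical example: flag pages that don't have exactly one <h1>.
crawler.register_test('h1_count', 'H1 count per page', ['URL', 'H1s'],
    function(url, html, headers, field_data, phrases){
        var count = html.find('h1').length;
        // 'this' is the test object, so this.cont is whatever painter.create()
        // returned (assumed here to be a jQuery element wrapping a table body).
        if( count != 1 ) this.cont.append('<tr><td>' + url + '</td><td>' + count + '</td></tr>');
    });

crawler.que_url('/');                                     // seed the queue with the home page
crawler.init({ crawl_id: 1, ignore_paths: ['^logout'] }); // starts the one-second crawl loop
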
This check looks for references to variables that have not been declared. A warning here usually points to a typographical error, or to a variable that has been renamed. To learn more about declaring variables in JavaScript, see MDN.
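In this file, that warning would fire for $, crawler_event_handler, and crawler_painter, which are defined elsewhere. Assuming a JSHint-style linter, one way to satisfy the check is to declare them as globals at the top of the file:

// Tell the linter these externally-defined dependencies exist.
/* global $, crawler_event_handler, crawler_painter */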