Passed
Push — develop ( 1545da...3f2ac7 )
by Dylan
02:45
created

crawler.set_property   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
nc 4
nop 3
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
1
const crawler = {

    // ---- Crawl state -------------------------------------------------------
    que             : [],           // urls waiting to be fetched (sanitized)
    tested          : [],           // urls whose crawl finished (tests ran or load failed)
    crawling        : [],           // urls currently being fetched
    failed          : [],           // urls whose fetch failed or returned no usable html
    tests           : [],           // registered tests: {name, title, callback, cont}
    ignore_paths    : [],           // regex strings; matching urls are skipped
    crawl_id        : undefined,    // id of the current crawl, required by init()
    events          : {},           // event name -> array of callbacks
    linked_from     : {},           // sanitized url -> array of pages that link to it
    useragent       : 'desktop',    // agent flag forwarded to the fetch proxy

    /**
     * Register a test to run.
     *
     * Note: name intentionally kept misspelled for backward compatibility;
     * prefer register_test() in new code.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined}
     * @throws Exception
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Without a callback we only create the result container, nothing to run.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Correctly spelled alias for regiser_test().
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        return this.regiser_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|boolean} the test object, or false when not found
     */
    get_test_by_name: function(name){
        // BUG FIX: previously iterated `this.test` (undefined), so every
        // lookup silently returned false.
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so and it hasn't
     * been added or crawled before, add it to the que.
     *
     * @param {string} url
     * @returns {boolean} true if the url was added to the que
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // The raw url is checked too: is_external() needs the domain, which
        // sanitize() strips away.
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized:
     * protocol+domain, surrounding slashes and fragment are stripped.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);    // drop empty query
        if( url.length < 1 ) url = '/';                       // site root

        return url;
    },

    /**
     * Get the domain (host without port) for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling.
     * Note: Paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Update all ignore paths to the paths specified.
     * Note: Paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id.
     *
     * @param {string|number} crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return !(this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 ||
                    this.is_file(url) || this.ignore_url(url) || this.is_external(url));
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external.
     * Note: If the url is internal but redirects to an external source, we won't detect it here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        // Starts with / or # or doesn't have :// in it has to be internal
        if( url.length < 1 || url[0] == '/' || url[0] == '#' || url.indexOf('://') < 0 ) return false;

        // If we removed the domain and the url is still the same then it's an internal link without the leading /
        if( url == this.sanitize( url ) ) return false;

        // The domain is the same domain we're running this script on
        if( this.get_domain( url ) == location.hostname ) return false;

        return true;
    },

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Fetch the next url from the que and run the tests on it.
     * No-op (returns false) when the que is empty or too many requests
     * are already in flight.
     *
     * @returns {boolean|undefined}
     */
    fetch_and_test: function(){
        // BUG FIX: the que-length check was accidentally duplicated.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json'
        })
            .done(function( result ) {
                if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                    if( !crawler.is_external(result['url_fetched']) ) {
                        // Track the final (possibly redirected) url from here on.
                        url = crawler.sanitize(result['url_fetched']);
                        if(crawler.tested.indexOf(url) >= 0){
                            this.skipped = true;    // redirect target was already tested
                            return true;
                        }

                        var html = $(crawler.strip_img_src(result['body']));
                        crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                        crawler.fetch_links(html, url);
                        crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                        crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                        return true;
                    }
                }
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .fail( function(){
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .always( function(){
                // BUG FIX: the old condition `skipped || indexOf(url) < 0` also
                // pushed when the url was skipped, i.e. when it was ALREADY in
                // `tested`, creating duplicates. Push only when absent.
                if( crawler.tested.indexOf(url) < 0 ) {
                    crawler.tested.push(url);
                }
                return crawler.trigger('CRAWL_FINISHED', [url]);
            });

        // Explicit return so every branch of this function returns a value.
        return undefined;
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other.
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href    = $(this).attr('href'),
                link    = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests, firing before/after events around each one.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callback and pass on the data.
     *
     * @param {string} event
     * @param {*} data applied as the callback's argument list
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register callback on action.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        // BUG FIX: was documented to return the crawler but returned nothing,
        // breaking chained .on() calls.
        return this;
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        // BUG FIX: the character class was ['|"] which also matched a literal
        // pipe as an attribute delimiter; only quotes are valid here.
        return html.replace( /(src).*?=(['"].*?['"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        // BUG FIX: the url is now encoded; previously any '&' or '#' in it
        // truncated or polluted the proxy's `u` parameter.
        return location.protocol + '//' + location.hostname + '/seotest/getPageData?u=' + encodeURIComponent(url);
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string.
     * Words are separated by single spaces.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        // BUG FIX: ''.split(' ') is [''], so an empty string counted as 1 word.
        if( typeof data === 'string' ) return data.length ? data.split(' ').length : 0;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object; values accumulate
     * into an array per key.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Start the crawler.
     *
     * @param {object} settings may contain `crawl_id` and `ignore_paths`
     * @throws Exception when no crawl_id is configured
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // When a crawl finishes, start a new one if there are any more urls to go through else stop the auto-restart
        this.on('CRAWL_FINISHED', function(){
            if( crawler.que.length > 0 ) crawler.fetch_and_test();
            else window.clearInterval(crawler.interval);
        });

        // Every second try to initialize a new crawl request just in-case something crashes
        this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000);

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
385