Passed
Push — develop ( 1545da...3f2ac7 )
by Dylan
02:45
created

crawler.set_property   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
nc 4
nop 3
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
1
const crawler = {

    // ---- Crawl state -------------------------------------------------------
    que             : [],           // urls waiting to be fetched (sanitized)
    tested          : [],           // urls whose crawl finished (tests ran or load failed)
    crawling        : [],           // urls currently being fetched
    failed          : [],           // urls whose fetch failed or returned no usable html
    tests           : [],           // registered tests: {name, title, callback, cont}
    ignore_paths    : [],           // regex strings; matching urls are skipped
    crawl_id        : undefined,    // id of the current crawl, required by init()
    events          : {},           // event name -> array of callbacks
    linked_from     : {},           // sanitized url -> array of pages that link to it
    useragent       : 'desktop',    // agent flag forwarded to the fetch proxy

    /**
     * Register a test to run.
     *
     * Note: name intentionally kept misspelled for backward compatibility;
     * prefer register_test() in new code.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined}
     * @throws Exception
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Without a callback we only create the result container, nothing to run.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Correctly spelled alias for regiser_test().
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        return this.regiser_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|boolean} the test object, or false when not found
     */
    get_test_by_name: function(name){
        // BUG FIX: previously iterated `this.test` (undefined), so every
        // lookup silently returned false.
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so and it hasn't
     * been added or crawled before, add it to the que.
     *
     * @param {string} url
     * @returns {boolean} true if the url was added to the que
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // The raw url is checked too: is_external() needs the domain, which
        // sanitize() strips away.
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized:
     * protocol+domain, surrounding slashes and fragment are stripped.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);    // drop empty query
        if( url.length < 1 ) url = '/';                       // site root

        return url;
    },

    /**
     * Get the domain (host without port) for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling.
     * Note: Paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Update all ignore paths to the paths specified.
     * Note: Paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id.
     *
     * @param {string|number} crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return !(this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 ||
                    this.is_file(url) || this.ignore_url(url) || this.is_external(url));
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external.
     * Note: If the url is internal but redirects to an external source, we won't detect it here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        // Starts with / or # or doesn't have :// in it has to be internal
        if( url.length < 1 || url[0] == '/' || url[0] == '#' || url.indexOf('://') < 0 ) return false;

        // If we removed the domain and the url is still the same then it's an internal link without the leading /
        if( url == this.sanitize( url ) ) return false;

        // The domain is the same domain we're running this script on
        if( this.get_domain( url ) == location.hostname ) return false;

        return true;
    },

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Fetch the next url from the que and run the tests on it.
     * No-op (returns false) when the que is empty or too many requests
     * are already in flight.
     *
     * @returns {boolean|undefined}
     */
    fetch_and_test: function(){
        // BUG FIX: the que-length check was accidentally duplicated.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json'
        })
            .done(function( result ) {
                if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                    if( !crawler.is_external(result['url_fetched']) ) {
                        // Track the final (possibly redirected) url from here on.
                        url = crawler.sanitize(result['url_fetched']);
                        if(crawler.tested.indexOf(url) >= 0){
                            this.skipped = true;    // redirect target was already tested
                            return true;
                        }

                        var html = $(crawler.strip_img_src(result['body']));
                        crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                        crawler.fetch_links(html, url);
                        crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                        crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                        return true;
                    }
                }
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .fail( function(){
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .always( function(){
                // BUG FIX: the old condition `skipped || indexOf(url) < 0` also
                // pushed when the url was skipped, i.e. when it was ALREADY in
                // `tested`, creating duplicates. Push only when absent.
                if( crawler.tested.indexOf(url) < 0 ) {
                    crawler.tested.push(url);
                }
                return crawler.trigger('CRAWL_FINISHED', [url]);
            });

        // Explicit return so every branch of this function returns a value.
        return undefined;
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other.
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href    = $(this).attr('href'),
                link    = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests, firing before/after events around each one.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callback and pass on the data.
     *
     * @param {string} event
     * @param {*} data applied as the callback's argument list
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register callback on action.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        // BUG FIX: was documented to return the crawler but returned nothing,
        // breaking chained .on() calls.
        return this;
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        // BUG FIX: the character class was ['|"] which also matched a literal
        // pipe as an attribute delimiter; only quotes are valid here.
        return html.replace( /(src).*?=(['"].*?['"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        // BUG FIX: the url is now encoded; previously any '&' or '#' in it
        // truncated or polluted the proxy's `u` parameter.
        return location.protocol + '//' + location.hostname + '/seotest/getPageData?u=' + encodeURIComponent(url);
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string.
     * Words are separated by single spaces.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        // BUG FIX: ''.split(' ') is [''], so an empty string counted as 1 word.
        if( typeof data === 'string' ) return data.length ? data.split(' ').length : 0;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object; values accumulate
     * into an array per key.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Start the crawler.
     *
     * @param {object} settings may contain `crawl_id` and `ignore_paths`
     * @throws Exception when no crawl_id is configured
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // When a crawl finishes, start a new one if there are any more urls to go through else stop the auto-restart
        this.on('CRAWL_FINISHED', function(){
            if( crawler.que.length > 0 ) crawler.fetch_and_test();
            else window.clearInterval(crawler.interval);
        });

        // Every second try to initialize a new crawl request just in-case something crashes
        this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000);

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
385