Completed
Push — master ( 3f2ac7...f065be )
by Dylan
02:41
created

crawler.is_external   A

Complexity

Conditions 1
Paths 6

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
dl 0
loc 10
rs 9.4285
c 1
b 0
f 0
cc 1
nc 6
nop 1
1
const crawler = {

    que             : [],           // sanitized urls waiting to be fetched
    tested          : [],           // urls whose crawl request has finished
    crawling        : [],           // urls that have been taken off the que for fetching
    failed          : [],           // urls that could not be loaded
    tests           : [],           // registered tests: {name, title, callback, cont}
    ignore_paths    : [],           // regex sources; matching urls are never crawled
    crawl_id        : undefined,    // must be set before init() completes
    events          : {},           // event name => array of callbacks
    linked_from     : {},           // sanitized link => array of page urls that link to it
    useragent       : 'desktop',    // sent to the proxy as the `agent` parameter

    /**
     * Register a test to run.
     *
     * The result container is always created through crawler_painter.create().
     * When `callable` is not a function, that container is returned and nothing
     * is registered; otherwise the test is stored and undefined is returned.
     *
     * @param {string} name unique name of the test
     * @param {string} title
     * @param {Array} headers non-empty list of column headers
     * @param {function} callable invoked by run_tests() for every crawled page
     * @returns {*|undefined}
     * @throws {string} when the name is missing/already registered, the title
     *                  is missing or the headers are not a non-empty array
     */
    register_test: function(name, title, headers, callable){
        // NOTE: plain strings are thrown on purpose so existing callers that
        // catch and display the thrown value keep working.
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!Array.isArray(headers) || headers.length < 1) throw 'Headers array is invalid';

        var container = crawler_painter.create(name, title, headers);
        if(typeof callable != 'function') return container;

        this.tests.push({name: name, title: title, callback: callable, cont: container});
        return undefined;
    },

    /**
     * Misspelled alias of register_test(), kept for backward compatibility.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {*|undefined}
     * @throws {string} see register_test()
     */
    regiser_test: function(name, title, headers, callable){
        return this.register_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false} the stored test object, or false when not found
     */
    get_test_by_name: function(name){
        for(var i = 0; i < this.tests.length; i++) {
            if(this.tests[i]['name'] == name) return this.tests[i];
        }
        return false;
    },

    /**
     * Check if the url passed is valid for crawling, if so and it hasn't
     * been added or crawled before, add it (sanitized) to the que
     *
     * Both the raw and the sanitized form must pass can_crawl().
     *
     * @param {string} url
     * @returns {boolean} true when the url was added to the que
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);

        if( !this.can_crawl(url) ) return false;
        if( this.que.indexOf(sanitized) > -1 ) return false;
        if( !this.can_crawl(sanitized) ) return false;

        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * Removes, in order: surrounding whitespace, the #fragment, a trailing
     * '?', a leading http(s)://host and the outer slashes. An empty result
     * becomes '/'.
     *
     * @param {string} url
     * @returns {string} '' when url is undefined/null, else the cleaned url
     */
    sanitize: function(url){
        if(url == undefined) return '';

        // Drop the fragment first so 'page/#top' and 'page/' end up identical.
        var clean = String(url).trim().split('#')[0];

        if( clean.slice(-1) == '?' ) clean = clean.slice(0, -1);

        clean = clean
            .replace(/^\/|\/$/g, '')
            // Anchored: only a leading scheme+host is removed, never one that
            // appears inside a query string (e.g. ?redirect=http://...).
            .replace(/^https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '');

        if( clean.length < 1 ) clean = '/';

        return clean;
    },

    /**
     * Get the domain for the passed url (port stripped)
     *
     * @param {string} url
     * @returns {string} '' when url is empty
     */
    get_domain: function(url){
        if( !url ) return '';

        // With a scheme the host is the third '/'-separated part, without it the first.
        var parts   = url.split('/'),
            host    = url.indexOf("://") > -1 ? parts[2] : parts[0];

        return host.split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * Every entry of ignore_paths is compiled as a case-insensitive regex.
     *
     * @param {string} url
     * @returns {boolean} true when any ignore path matches
     */
    ignore_url: function( url ){
        var paths   = this.ignore_paths || [],
            keys    = Object.keys(paths);   // own keys only; works for arrays and plain objects

        for(var i = 0; i < keys.length; i++) {
            if( new RegExp(paths[keys[i]], 'i').test(url) ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling
     * Note: Paths can be in regex format
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Replace all ignore paths with the paths specified
     * Note: Path can be in regex format
     *
     * @param paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling
     *
     * A url is rejected when it is undefined, currently being crawled, already
     * tested, looks like a file, matches an ignore path or is external.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;

        if( this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 ) return false;

        return !( this.is_file(url) || this.ignore_url(url) || this.is_external(url) );
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'
     *
     * Only the last path segment is inspected, with the query string removed,
     * so dots in directory names or query values are not mistaken for an
     * extension.
     *
     * @param {string} url
     * @returns {boolean} true when there is an extension that does not contain 'html'
     */
    is_file: function(url){
        var path    = this.sanitize( url ).split( '?' )[0],
            segment = path.split( '/' ).pop(),
            dot     = segment.lastIndexOf( '.' );

        return dot >= 0 && segment.slice( dot + 1 ).indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     * Note: Anything starting with '/' is reported as internal, which includes
     *       protocol-relative urls ('//host/path').
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        if( url == undefined || url.length < 1 )    return false;
        if( url[0] == '/' || url[0] == '#' )        return false;
        if( url.indexOf('://') < 0 )                return false;
        if( url == this.sanitize( url ) )           return false;

        return this.get_domain( url ) != location.hostname;
    },

    /**
     * Checks if the href passed is an anchor link for url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Fetch the next url from the que and run the tests on it
     *
     * Nothing is fetched when the que is empty or more than two ajax requests
     * are already active. The page is requested through the proxy; a response
     * is only tested when it has headers, a body containing '<head' and was
     * not fetched from an external url.
     *
     * Events triggered: CRAWL_BEFORE_TESTS, CRAWL_AFTER_TESTS,
     * CRAWL_LOAD_FAILED and (always) CRAWL_FINISHED.
     *
     * @returns {boolean|undefined} false when no request was started, otherwise undefined
     */
    fetch_and_test: function(){
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url     = this.que.pop(),
            skipped = false;    // set when the fetched url turns out to be tested already
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json'
        })
            .done(function( result ) {
                var usable = result && result['headers'] && result['body'] &&
                                result['body'].toLowerCase().indexOf('<head') >= 0;

                if( usable ) {
                    // NOTE(review): falls back to the requested url when the proxy
                    // reports no url_fetched - confirm against the proxy's contract.
                    var fetched = result['url_fetched'] == undefined ? url : result['url_fetched'];

                    if( !crawler.is_external(fetched) ) {
                        // From here on `url` is the url that was really fetched
                        // (it differs from the requested one after a redirect).
                        url = crawler.sanitize(fetched);
                        if(crawler.tested.indexOf(url) >= 0){
                            skipped = true;
                            return true;
                        }

                        var html = $(crawler.strip_img_src(result['body']));
                        crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                        crawler.fetch_links(html, url);
                        crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                        crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                        return true;
                    }
                }

                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .fail( function(){
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .always( function(){
                // A skipped url is pushed again even though it is already listed.
                if( skipped || crawler.tested.indexOf(url) < 0 ) {
                    crawler.tested.push(url);
                }
                return crawler.trigger('CRAWL_FINISHED', [url]);
            });

        return undefined;
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other
     *
     * Anchors without an href attribute are skipped.
     *
     * @param {jQuery} html
     * @param {string} url url of the page the html belongs to
     */
    fetch_links: function(html, url){
        var has_own = Object.prototype.hasOwnProperty;

        $.each(html.find('a'), function(){
            var href = $(this).attr('href');
            if( href == undefined ) return;

            var link = crawler.sanitize(href);

            crawler.que_url( href );

            if( !has_own.call(crawler.linked_from, link) ) {
                crawler.linked_from[link] = [url];
            } else if( crawler.linked_from[link].indexOf(url) < 0 ) {
                crawler.linked_from[link].push(url);
            }
        });
    },

    /**
     * Run the registered tests
     *
     * Every test callback is invoked with the test object as `this` and
     * (cont, url, html, headers, field_data, phrases) as arguments, wrapped
     * in 'before<name>' and 'after<name>' events.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        var page = [url, html, headers, field_data, phrases];

        for(var i = 0; i < this.tests.length; i++) {
            var test = this.tests[i];

            this.trigger('before'+test['name'], page);
            test['callback'].apply(test, [test['cont']].concat(page));
            this.trigger('after'+test['name'], page);
        }
    },

    /**
     * Trigger event callback and pass on the data
     *
     * Callbacks run in registration order with the crawler as `this`.
     *
     * @param {string} event
     * @param {Array} data arguments handed to every callback
     */
    trigger: function(event, data){
        if( !Object.prototype.hasOwnProperty.call(this.events, event) ) return;

        var callbacks = this.events[event];
        for(var i = 0; i < callbacks.length; i++) callbacks[i].apply(this, data);
    },

    /**
     * Register callback on action
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if( !Object.prototype.hasOwnProperty.call(this.events, event) ) this.events[event] = [];
        this.events[event].push(callback);
        return this;
    },

    /**
     * Strip out src=<anything> (and srcset=<anything>) so that we avoid
     * loading the images on the pages
     *
     * Only the attribute itself is removed; quoted and unquoted values are
     * both handled.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /\bsrc(?:set)?\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]+)/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        // Encode so '&', '?' and '#' inside the crawled url stay part of `u`.
        return location.protocol + '//' + location.hostname + '/seotest/getPageData?u=' + encodeURIComponent(url);
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string
     *
     * Words are separated by any run of whitespace; blank input counts 0.
     *
     * @param {string|array} data
     * @returns {number}
     */
    get_word_count: function(data){
        var count_words = function(text){
            var trimmed = String(text).trim();
            return trimmed.length < 1 ? 0 : trimmed.split(/\s+/).length;
        };

        if( typeof data === 'string' ) return count_words(data);

        var count = 0;
        if( data == undefined ) return count;

        var keys = Object.keys(data);
        for(var i = 0; i < keys.length; i++) count += count_words(data[keys[i]]);
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object
     *
     * The property is created as an object when missing; values are collected
     * in an array per key.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};

        var bucket = this[property];
        if(bucket.hasOwnProperty(key)) bucket[key].push(val);
        else bucket[key] = [val];
    },

    /**
     * Start the crawler
     *
     * Triggers BEFORE_INIT, applies the `crawl_id` and `ignore_paths`
     * settings, wires the auto-restart, initialises crawler_painter and
     * triggers AFTER_INIT.
     *
     * @param {object} settings
     * @throws {string} when no crawl_id has been set
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        // A missing settings object must end in the crawl_id error below, not a TypeError.
        var options = settings || {},
            has_own = Object.prototype.hasOwnProperty;

        if(has_own.call(options, 'crawl_id')) this.set_crawl_id(options['crawl_id']);
        if(has_own.call(options, 'ignore_paths')) this.set_ignore_paths(options['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // When a crawl finishes, start a new one if there are any more urls to go through else stop the auto-restart
        this.on('CRAWL_FINISHED', function(){
            if( crawler.que.length > 0 ) crawler.fetch_and_test();
            else window.clearInterval(crawler.interval);
        });

        // Every second try to initialize a new crawl request just in-case something crashes
        this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000);

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
383