Completed
Push — develop ( 8024f0...d911c7 )
by Dylan
03:13
created

crawler.failed_url   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 4
rs 10
const crawler = {

    que             : [],
    tested          : [],
    crawling        : [],
    failed          : [],
    tests           : [],
    ignore_paths    : [],
    crawl_id        : undefined,
    events          : {},
    linked_from     : {},
    redirects       : {},
    useragent       : 'desktop',

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },
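
    // Example sketch of registering a test: count <h1> tags on each crawled page.
    // Assumes crawler_painter (defined elsewhere) renders the per-test results table;
    // the test name, title and headers here are illustrative only.
    //
    //   crawler.register_test('h1_count', 'H1 tags per page', ['URL', 'H1 count'],
    //       function(url, html, headers, field_data, phrases){
    //           crawler.add_row('h1_count', [url, html.find('h1').length]);
    //       });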

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the passed url is valid for crawling; if so, and it hasn't
     * already been added or crawled, add it to the que
     *
     * Returns true if added to the que, false otherwise
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },
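
    // Example results for sanitize() (a sketch; the urls are illustrative):
    //   sanitize('https://example.com/about#team')  -> 'about'
    //   sanitize('https://example.com/')            -> '/'
    //   sanitize('/blog/post-1/?page=2')            -> 'blog/post-1/?page=2'
    //   sanitize(undefined)                         -> ''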

    /**
     * Get the domain for the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling
     * Note: Paths can be in regex format
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Paths can be in regex format
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
                !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check for whether the url points to a file: true when it has
     * an extension and that extension does not contain 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1              ||
            url[0] == '/'               ||
            url[0] == '#'               ||
            url.indexOf('://') < 0      ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },
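
    // Example (sketch), assuming the crawl runs on a page served from example.com:
    //   is_external('https://other.com/pricing')  -> true
    //   is_external('/pricing')                   -> false  (relative url)
    //   is_external('https://example.com/blog')   -> false  (matches location.hostname)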

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if the target we requested matches the response we got.
     * If not, mark it as a redirect and append the redirect to be crawled
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean}
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the que and run the tests on it
     */
    fetch_and_test: function(){
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            accepts: 'json',
            dataType: 'json'
        })
            .done(function( result ) {
                if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                    var fetched = crawler.sanitize(result['url_fetched']);
                    if(!crawler.check_fetched_url(url, fetched)){
                        this.skipped = true;
                        return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                    }

                    var html = $(crawler.strip_img_src(result['body']));
                    crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                    crawler.fetch_links(html, url);
                    crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                    return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                }else{
                    return crawler.failed_url(url);
                }
            })
            .fail( function(){
                return crawler.failed_url(url);
            })
            .always( function(){
                crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

                if(!this.hasOwnProperty('skipped')){
                    crawler.tested.push(url);
                }

                crawler.trigger('CRAWL_FINISHED', [url]);

                if( crawler.que.length < 1 && crawler.crawling.length < 1){
                    crawler.trigger('ALL_CRAWLS_FINISHED', []);
                }

                return crawler.fetch_and_test();
            });
    },

    /**
     * Check for links in the html of the rendered page so we can add them to the que
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href    = $(this).attr('href'),
                link    = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callback and pass on the data
     *
     * @param {string} event
     * @param {*} data
     * @return {undefined}
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register callback on action
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        return this;
    },
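
    // Example sketch of reacting to crawl lifecycle events; trigger() applies each
    // callback with the event's data array, so handlers receive the url directly.
    //
    //   crawler.on('CRAWL_LOAD_FAILED', function(url){ console.warn('failed to load ' + url); });
    //   crawler.on('ALL_CRAWLS_FINISHED', function(){ console.log('crawl complete'); });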

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given string or set of sentences
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },
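
    // Example (sketch): counts are based on simple space splitting.
    //   get_word_count('quick brown fox')       -> 3
    //   get_word_count(['quick brown', 'fox'])  -> 3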

    /**
     * Set an arbitrary property on the crawler object, collecting values per key
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },
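
    // Example (sketch, illustrative property and key): values accumulate per key.
    //   crawler.set_property('word_counts', '/about', 120);
    //   crawler.set_property('word_counts', '/about', 95);
    //   // crawler.word_counts['/about'] is now [120, 95]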

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {undefined}
     */
    failed_url: function(url){
        this.failed.push(url);
        return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Start the crawler
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // start two concurrent fetch/test chains; each continues from its always() handler
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
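
A minimal startup sketch follows; it assumes crawler_painter and the /seotest/getPageData proxy endpoint exist as referenced in the source, and the test name, crawl_id, seed url and ignore paths are illustrative values only.

// Startup sketch (illustrative values; crawler_painter and the proxy endpoint are assumed to exist)
crawler.register_test('word_count', 'Words per page', ['URL', 'Words'],
    function(url, html, headers, field_data, phrases){
        crawler.add_row('word_count', [url, crawler.get_word_count(phrases)]);
    });

crawler.on('ALL_CRAWLS_FINISHED', function(){
    console.log('crawled ' + crawler.tested.length + ' urls, ' + crawler.failed.length + ' failed');
});

crawler.que_url('/');                                               // seed the que with the homepage
crawler.init({ crawl_id: 1, ignore_paths: ['^admin', '^logout'] }); // init throws without a crawl_id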