Completed
Push — develop ( 7852a2...d7f552 )
by Dylan
03:02
created

crawler.check_fetched_url   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
cc 2
nc 2
nop 2
const crawler = {

    // --- Crawl state: urls move through que -> crawling -> tested | failed ---
    que             : [],        // urls waiting to be fetched
    tested          : [],        // urls whose tests completed
    crawling        : [],        // urls currently being fetched
    failed          : [],        // urls that could not be loaded
    tests           : [],        // registered tests: {name, title, callback, cont}
    ignore_paths    : [],        // regex strings for paths to skip
    crawl_id        : undefined, // identifier of the current crawl run
    events          : {},        // event name -> array of listener callbacks
    linked_from     : {},        // sanitized url -> pages linking to it
    redirects       : {},        // requested url -> url that was actually served
    useragent       : 'desktop', // agent profile passed to the fetch proxy

    /**
     * Register a test to run on every crawled page.
     *
     * NOTE: the method name keeps its historical misspelling ("regiser") for
     * backward compatibility; new code should use the register_test alias.
     *
     * @param {string} name       Unique test identifier.
     * @param {string} title      Human readable title for the results table.
     * @param {Array}  headers    Non-empty list of column headers.
     * @param {function} callable Test callback; when not a function, only the
     *                            result container is created and returned.
     * @returns {undefined|*} undefined on success, or the container produced
     *                        by crawler_painter when no callback was supplied.
     * @throws {string} If name is missing/already taken, title is missing,
     *                  or headers is not a non-empty array.
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // No callback given: just build the (static) results container.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Correctly spelled, backward-compatible alias for regiser_test.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined|*}
     * @throws {string}
     */
    register_test: function(name, title, headers, callable){
        return this.regiser_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|false} The test record, or false when not registered.
     */
    get_test_by_name: function(name){
        // BUG FIX: previously iterated `this.test` (undefined), so a match was
        // never found and duplicate test names were silently accepted.
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so and it hasn't
     * been added or crawled before, add it to the que.
     *
     * @param {string} url
     * @returns {boolean} True when the url was added to the que.
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // Both the raw and the sanitized form must pass the soft checks;
        // can_crawl(sanitized) already covers the already-in-que case.
        if( !this.can_crawl(url) || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized: strips the
     * protocol + host, outer slashes, the #fragment and a trailing '?'.
     * An empty result collapses to '/'.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')            // outer slashes
            .replace(/https?:\/\/[^\/]+/i, '')  // protocol + host
            .replace(/^\/|\/$/g, '')            // slashes exposed by the above
            .split('#')[0];                     // drop the fragment

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },

    /**
     * Get the domain (host without port) for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url matches any of the configured ignore paths.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore while crawling.
     * Note: paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Replace all ignore paths with the paths specified.
     * Note: paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id.
     *
     * @param {string|number} crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Soft checks to determine if a url is a valid crawl candidate: not seen
     * before (que / crawling / tested), not a file, not ignored, not external.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
                !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Soft check whether the url points at a file rather than a page: it has
     * an extension and that extension does not contain 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Soft check whether the url is external to the current host.
     * Note: an internal url that redirects to an external source is not
     * detected here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1              ||
            url[0] == '/'               ||
            url[0] == '#'               ||
            url.indexOf('://') < 0      ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if the target we requested matches the response url we got.
     * If not, record the redirect and queue the response url for crawling.
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean} True when no redirect happened.
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the que (LIFO) through the proxy and run the
     * registered tests on it. Re-schedules itself from the ajax `always`
     * handler until the que is drained.
     *
     * @returns {boolean|undefined} False when nothing was fetched.
     */
    fetch_and_test: function(){
        // Throttle: skip when the que is empty or too many requests are live.
        // (The original condition tested `this.que.length < 1` twice.)
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            accepts: 'json',
            dataType: 'json'
        })
            .done(function( result ) {
                if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                    var fetched = crawler.sanitize(result['url_fetched']);
                    if(!crawler.check_fetched_url(url, fetched)){
                        // jQuery invokes done/fail/always with the ajax
                        // settings object as `this` by default, so this flag
                        // is visible to the always() handler below.
                        this.skipped = true;
                        return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                    }

                    var html = $(crawler.strip_img_src(result['body']));
                    crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                    crawler.fetch_links(html, url);
                    crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                    return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                }else{
                    crawler.failed.push(url);
                    return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
                }
            })
            .fail( function(){
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            })
            .always( function(){
                crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

                // Redirected urls were re-queued, so don't mark them tested.
                if(!this.hasOwnProperty('skipped')){
                    crawler.tested.push(url);
                }

                crawler.trigger('CRAWL_FINISHED', [url]);

                if( crawler.que.length < 1 && crawler.crawling.length < 1){
                    crawler.trigger('ALL_CRAWLS_FINISHED', []);
                }

                return crawler.fetch_and_test();
            });

        // Explicit return so every branch returns a value.
        return undefined;
    },

    /**
     * Collect links from the rendered page html so we can add them to the
     * que, and map how pages are linked to each other.
     *
     * @param {jQuery} html
     * @param {string} url The page the links were found on.
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href    = $(this).attr('href'),
                link    = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests against a fetched page, firing
     * before<name>/after<name> events around each one.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger all callbacks registered for an event and pass on the data.
     *
     * @param {string} event
     * @param {Array} data Arguments applied to each callback.
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register a callback for an event.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        // BUG FIX: was documented as chainable but returned undefined.
        return this;
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images on the
     * pages when the fetched html is parsed into a jQuery tree.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },

    /**
     * Return the absolute proxy url used to test the passed path.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given string or set of sentences.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Append a value to a keyed list stored under an arbitrary property of
     * the crawler object, creating the property and key as needed.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Start the crawler.
     *
     * @param {object} settings May contain crawl_id (required) and ignore_paths.
     * @throws {string} When no crawl_id ends up configured.
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // Kick off two workers; each re-schedules itself when a fetch ends.
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};