Completed
Push — develop ( 322e3f...d82e9f )
by Dylan
02:40
created

crawler.init   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
dl 0
loc 1
rs 10
c 2
b 1
f 0
cc 1
nc 1
nop 0
1
/** global: crawler_event_handler */
/** global: crawler_painter */
/** global: $ */

/**
 * Client side site crawler.
 *
 * Keeps a queue of relative urls, fetches each one through the server side
 * proxy endpoint, collects the links found on the page and runs every
 * registered test against the fetched document.
 *
 * Relies on three globals that are defined outside of this file:
 * `crawler_event_handler` (needs `trigger` and `on`), `crawler_painter`
 * (needs `create` and `init`) and jQuery (`$`).
 */
const crawler = {

    que             : [],           // sanitized urls waiting to be fetched
    tested          : [],           // urls whose fetch completed and was not skipped
    crawling        : [],           // urls with a request currently in flight
    failed          : [],           // urls passed to failed_url()
    tests           : [],           // registered tests: {name, title, callback, cont}
    ignore_paths    : [],           // regex sources of urls that must not be crawled
    crawl_id        : undefined,
    linked_from     : {},           // sanitized link => list of pages linking to it
    redirects       : {},           // requested url => url that was actually fetched
    useragent       : 'desktop',
    event_handler   : crawler_event_handler,
    painter         : crawler_painter,

    /**
     * Register a test to run.
     *
     * Note: the method name is misspelled, it is kept as is because callers
     * outside of this file use it.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {*} result of painter.create() when callable is not a function,
     *              undefined otherwise
     * @throws {string} when the name is missing or already registered, the
     *                  title is missing or headers is not a non empty array
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';

        // Without a callback there is nothing to run, only the container is painted
        if(typeof callable != 'function') return this.painter.create(name, title, headers);

        this.tests.push({
            name: name,
            title: title,
            callback: callable,
            cont: this.painter.create(name, title, headers)
        });
        return undefined;
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(let i = 0; i < this.tests.length; i++){
            if(this.tests[i]['name'] == name) return this.tests[i];
        }
        return false;
    },

    /**
     * Check if the url passed is valid for crawling, if so and it hasn't
     * been added or crawled before, add it to the que
     *
     * Returns true|false if added to que
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        const sanitized = this.sanitize(url);

        // The raw url is checked as well because sanitize() strips the domain,
        // which is_external() needs in order to spot external links.
        // can_crawl(sanitized) already covers "is it in the que".
        if( !this.can_crawl(url) || !this.can_crawl(sanitized) ) return false;

        this.que.push(sanitized);
        return true;
    },

    /**
     *  Clean up a url so it becomes relative and standardized
     *
     * The fragment is dropped first so that a trailing '?' or '/' in front of
     * it is normalized too ('/foo/#bar' and '/foo' both become 'foo').
     *
     * @param {string} url
     * @returns {string} '' for undefined/null, '/' for the site root
     */
    sanitize: function(url){
        if(url == undefined) return '';

        let clean = url
            .split('#')[0]
            .replace(/^\/|\/$/g, '')
            // Anchored so urls embedded in a query string are left alone
            .replace(/^https?:\/\/[^\/?#]+/i, '');

        if( clean.slice(-1) == '?' ) clean = clean.slice(0, -1);
        clean = clean.replace(/^\/|\/$/g, '');

        return clean.length < 1 ? '/' : clean;
    },

    /**
     * Get the domain for the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';

        // With a scheme the host is the third '/' separated part, otherwise the first
        const host_index = url.indexOf('://') > -1 ? 2 : 0;
        return url.split('/')[host_index].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(const key in this.ignore_paths) {
            const pattern = new RegExp(this.ignore_paths[key], 'i');
            if( url.match(pattern) != null ) return true;
        }
        return false;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Path can be in regex format
     *
     * @param paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;

        const is_known = this.crawling.indexOf(url) > -1 ||
                         this.tested.indexOf(url) > -1 ||
                         this.que.indexOf(url) > -1;

        return !is_known && !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        const parts = this.sanitize( url ).split( '.' );
        return parts.length > 1 && parts.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        const is_internal =
            url.length < 1              ||
            url[0] == '/'               ||
            url[0] == '#'               ||
            url.indexOf('://') < 0      ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname;

        return !is_internal;
    },

    /**
     * Checks if the href passed is an anchor link for url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if that target we requested matches the response we got.
     * If not mark as a redirect and append the redirect to be crawled
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean} true when target and response match
     */
    check_fetched_url: function(target, response){
        if(target == response) return true;

        this.redirects[target] = response;
        this.que_url(response);
        return false;
    },

    /**
     * Checks if the string passed is an html page
     *
     * @param {string} html
     * @returns {boolean}
     */
    is_html: function(html){
        // "> -1" instead of "> 0" so markup starting at index 0 is accepted too
        return html.indexOf('<head') > -1 && html.indexOf('<body') > -1;
    },

    /**
     * Fetch the next url from the que and run the tests on it
     *
     * @returns {boolean|undefined} false when nothing was fetched (empty que
     *                              or more than 2 requests already active),
     *                              undefined when a request was started
     */
    fetch_and_test: function(){
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        const url = this.que.pop();
        // Set by the done handler when the url must not be marked as tested
        let skipped = false;

        this.crawling.push(url);

        $.ajax({
            // The url is encoded so its own '?' and '&' stay inside the u parameter
            url: this.get_proxy( '/seotest/getPageData?u=' + encodeURIComponent(url) ),
            data: { agent: this.useragent },
            // dataType alone makes jQuery send the json Accept header
            dataType: 'json'
        })
            .done(function( result ) {
                if( !result || !result['headers'] || !result['body'] ) {
                    return crawler.failed_url(url);
                }

                const fetched = crawler.sanitize(result['url_fetched']);
                if( !crawler.check_fetched_url(url, fetched) ){
                    skipped = true;
                    return crawler.event_handler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                }

                if( !crawler.is_html(result['body']) ){
                    skipped = true;
                    return undefined;
                }

                const html = $(crawler.strip_img_src(result['body']));
                crawler.fetch_links(html, url);
                return crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
            })
            .fail( function(){
                return crawler.failed_url(url);
            })
            .always( function(){
                // Guarded because splice(-1, 1) would remove an unrelated url
                const position = crawler.crawling.indexOf(url);
                if( position > -1 ) crawler.crawling.splice(position, 1);

                if( !skipped ) crawler.tested.push(url);

                crawler.event_handler.trigger('CRAWL_FINISHED', [url]);

                if( crawler.que.length < 1 && crawler.crawling.length < 1 ){
                    crawler.event_handler.trigger('ALL_CRAWLS_FINISHED', []);
                }

                return crawler.fetch_and_test();
            });

        return undefined;
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url
     * @returns {undefined}
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            const href = $(this).attr('href');

            // An <a> without href links nowhere, so there is nothing to que or map
            if( href == undefined ) return;

            const link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests
     *
     * Triggers 'before<name>' and 'after<name>' around every test callback.
     * The callback is invoked with the test entry itself as `this`.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {*} whatever the event handler returns for CRAWL_AFTER_TESTS
     */
    run_tests: function(url, html, headers, field_data, phrases){
        const args = [url, html, headers, field_data, phrases];

        this.event_handler.trigger('CRAWL_BEFORE_TESTS', [url]);
        for(let i = 0; i < this.tests.length; i++) {
            const test = this.tests[i];
            // Every call gets its own copy so a handler can't alter the next call's arguments
            this.event_handler.trigger('before' + test['name'], args.slice());
            test['callback'].apply(test, args.slice());
            this.event_handler.trigger('after' + test['name'], args.slice());
        }
        return this.event_handler.trigger('CRAWL_AFTER_TESTS', [url]);
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages
     *
     * Only whole src / srcset attributes are removed (quoted or unquoted).
     * The attribute has to be preceded by whitespace so that other attributes
     * or values merely containing "src" are left untouched.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(\s)src(?:set)?\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/ig, '$1' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        // location.host keeps a non default port, location.hostname would drop it
        return location.protocol + '//' + location.host + url;
    },

    /**
     * Returns the word count for a given set of sentences or string
     *
     * Words are separated by any run of whitespace, an empty or blank
     * string counts as 0 words.
     *
     * @param {string|array} data
     * @returns {number}
     */
    get_word_count: function(data){
        const count_words = function(text){
            if( text == undefined ) return 0;
            const trimmed = String(text).trim();
            return trimmed.length < 1 ? 0 : trimmed.split(/\s+/).length;
        };

        if( typeof data === 'string' ) return count_words(data);

        let count = 0;
        for( const key in data ) count += count_words(data[key]);
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object
     *
     * The property is created as an object when missing and val is appended
     * to the list stored under key.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};

        const store = this[property];
        if(store.hasOwnProperty(key)) store[key].push(val);
        else store[key] = [val];
    },

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {*} whatever the event handler returns for CRAWL_LOAD_FAILED
     */
    failed_url: function(url){
        this.failed.push(url);
        return this.event_handler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Triggered every second
     *
     * @returns {undefined}
     */
    loop: function(){
        this.event_handler.trigger('CRAWLER_LOOP', [this]);
        this.fetch_and_test();
        return undefined;
    },

    /**
     * Start the crawler
     *
     * @param {object} settings may contain crawl_id and ignore_paths
     * @throws {string} when no crawl_id is set
     */
    init: function(settings){
        // A missing settings object should end in the crawl_id error below, not a TypeError
        const options = settings || {};
        const has = function(key){ return Object.prototype.hasOwnProperty.call(options, key); };

        this.event_handler.trigger('BEFORE_INIT', [this]);

        if(has('crawl_id')) this.set_crawl_id(options['crawl_id']);
        if(has('ignore_paths')) this.set_ignore_paths(options['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        this.interval = setInterval(function(){ crawler.loop(); }, 1000);
        this.event_handler.on('ALL_CRAWLS_FINISHED', function(){ window.clearInterval( crawler.interval ); });

        this.painter.init();
        this.event_handler.trigger('AFTER_INIT', [this]);
    }
};
394