1
|
|
|
const crawler = {

    // Urls waiting to be crawled (sanitized, relative form).
    que : [],
    // Urls whose crawl + tests have completed.
    tested : [],
    // Urls currently being fetched.
    crawling : [],
    // Urls whose fetch failed or returned unusable content.
    failed : [],
    // Registered tests: {name, title, callback, cont} (see regiser_test()).
    tests : [],
    // Path patterns (regex strings) that should never be crawled.
    ignore_paths : [],
    // Identifier for the current crawl session; must be set before init().
    crawl_id : undefined,
    // Event name -> array of callbacks (see on()/trigger()).
    events : {},
    // Sanitized url -> array of urls that link to it (see fetch_links()).
    linked_from : {},
    // Agent flag sent to the fetch proxy.
    useragent : 'desktop',
13
|
|
|
|
14
|
|
|
    /**
     * Register a test to run.
     * NOTE(review): the method name is misspelled ("regiser") but is kept
     * as-is because external callers may depend on it.
     *
     * @param {string} name - Unique test identifier; must not collide with an already registered test.
     * @param {string} title - Human-readable title passed to the painter.
     * @param {Array} headers - Non-empty array of column headers for the result table.
     * @param {function} callable - Test callback; when not a function, only the
     *     painter container is created and returned (no test is registered).
     * @returns {undefined} when the test is registered; otherwise the painter container.
     * @throws Exception - a plain string when name/title/headers are invalid.
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Non-function callable: create the display container only.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont:crawler_painter.create(name, title, headers)});
        return undefined;
    },
32
|
|
|
|
33
|
|
|
/** |
34
|
|
|
* Return a registered test by name |
35
|
|
|
* |
36
|
|
|
* @param {string} name |
37
|
|
|
* @returns {object|false} |
38
|
|
|
*/ |
39
|
|
|
get_test_by_name: function(name){ |
40
|
|
|
for(var t in this.test) if(this.tests[t]['name'] == name) return this.tests[t]; |
41
|
|
|
return false; |
42
|
|
|
}, |
43
|
|
|
|
44
|
|
|
/** |
45
|
|
|
* Check if the url passed is valid for crawling, if so and it hasn't |
46
|
|
|
* been added or crawled before, add it to the que |
47
|
|
|
* |
48
|
|
|
* Returns true|false if added to que |
49
|
|
|
* |
50
|
|
|
* @param {string} url |
51
|
|
|
* @returns {boolean} |
52
|
|
|
*/ |
53
|
|
|
que_url: function(url){ |
54
|
|
|
var sanitized = this.sanitize(url); |
55
|
|
|
if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false; |
56
|
|
|
this.que.push(sanitized); |
57
|
|
|
return true; |
58
|
|
|
}, |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* Clean up a url so it becomes relative and standardized |
62
|
|
|
* |
63
|
|
|
* @param {string} url |
64
|
|
|
* @returns {string} |
65
|
|
|
*/ |
66
|
|
|
sanitize: function(url){ |
67
|
|
|
if(url == undefined) return ''; |
68
|
|
|
|
69
|
|
|
url = url |
70
|
|
|
.replace(/^\/|\/$/g, '') |
71
|
|
|
.replace(/https?:\/\/[^\/]+/i, '') |
72
|
|
|
.replace(/^\/|\/$/g, '') |
73
|
|
|
.split('#')[0]; |
74
|
|
|
|
75
|
|
|
if( url.slice(-1) == '?' ) url = url.slice(0, -1); |
76
|
|
|
if( url.length < 1 ) url = '/'; |
77
|
|
|
|
78
|
|
|
return url; |
79
|
|
|
}, |
80
|
|
|
|
81
|
|
|
/** |
82
|
|
|
* Get the domain for the passed url |
83
|
|
|
* |
84
|
|
|
* @param {string} url |
85
|
|
|
* @returns {string} |
86
|
|
|
*/ |
87
|
|
|
get_domain: function(url){ |
88
|
|
|
if( !url ) return ''; |
89
|
|
|
if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0]; |
90
|
|
|
else return url.split('/')[0].split(':')[0]; |
91
|
|
|
}, |
92
|
|
|
|
93
|
|
|
/** |
94
|
|
|
* Checks if the passed url should be ignored or not |
95
|
|
|
* |
96
|
|
|
* @param {string} url |
97
|
|
|
* @returns {boolean} |
98
|
|
|
*/ |
99
|
|
|
ignore_url: function( url ){ |
100
|
|
|
for(var regex in this.ignore_paths) { |
101
|
|
|
var reg = new RegExp(this.ignore_paths[regex], 'i'); |
102
|
|
|
if( url.match(reg) != null ) return true; |
103
|
|
|
} |
104
|
|
|
return false; |
105
|
|
|
}, |
106
|
|
|
|
107
|
|
|
/** |
108
|
|
|
* Add a path to ignore when crawler |
109
|
|
|
* Note: Paths can be in regex format |
110
|
|
|
* |
111
|
|
|
* @param {string} path |
112
|
|
|
* @returns {crawler} |
113
|
|
|
*/ |
114
|
|
|
add_ignore_path: function(path){ |
115
|
|
|
this.ignore_paths.push(path); |
116
|
|
|
return this; |
117
|
|
|
}, |
118
|
|
|
|
119
|
|
|
/** |
120
|
|
|
* Update all ignore paths to the paths specified |
121
|
|
|
* Note: Path can be in regex format |
122
|
|
|
* |
123
|
|
|
* @param paths |
124
|
|
|
* @returns {crawler} |
125
|
|
|
*/ |
126
|
|
|
set_ignore_paths: function(paths){ |
127
|
|
|
this.ignore_paths = paths; |
128
|
|
|
return this; |
129
|
|
|
}, |
130
|
|
|
|
131
|
|
|
/** |
132
|
|
|
* Sets the crawl id |
133
|
|
|
* |
134
|
|
|
* @param crawl_id |
135
|
|
|
* @returns {crawler} |
136
|
|
|
*/ |
137
|
|
|
set_crawl_id: function(crawl_id){ |
138
|
|
|
this.crawl_id = crawl_id; |
139
|
|
|
return this; |
140
|
|
|
}, |
141
|
|
|
|
142
|
|
|
/** |
143
|
|
|
* Does some soft checks to determine if url is a valid candidate for crawling |
144
|
|
|
* |
145
|
|
|
* @param {string} url |
146
|
|
|
* @returns {boolean} |
147
|
|
|
*/ |
148
|
|
|
can_crawl: function(url){ |
149
|
|
|
if(url == undefined) return false; |
150
|
|
|
return !(this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 || |
151
|
|
|
this.is_file(url) || this.ignore_url(url) || this.is_external(url)); |
152
|
|
|
}, |
153
|
|
|
|
154
|
|
|
/** |
155
|
|
|
* Does a soft check for the url passed and checks if it's a file |
156
|
|
|
* by checking if it has an extension and if the extension contains 'html' |
157
|
|
|
* |
158
|
|
|
* @param {string} url |
159
|
|
|
* @returns {boolean} |
160
|
|
|
*/ |
161
|
|
|
is_file: function(url){ |
162
|
|
|
var split = this.sanitize( url ).split( '.' ); |
163
|
|
|
return split.length > 1 && split.pop().indexOf( 'html' ) < 0; |
164
|
|
|
}, |
165
|
|
|
|
166
|
|
|
/** |
167
|
|
|
* Does some soft checking for the url passed to see if it's external |
168
|
|
|
* Note: If the url is internal but redirects to an external source, we wown't detect it here |
169
|
|
|
* |
170
|
|
|
* @param {string} url |
171
|
|
|
* @returns {boolean} |
172
|
|
|
*/ |
173
|
|
|
is_external: function(url){ |
174
|
|
|
return !( |
175
|
|
|
url.length < 1 || |
176
|
|
|
url[0] == '/' || |
177
|
|
|
url[0] == '#' || |
178
|
|
|
url.indexOf('://') < 0 || |
179
|
|
|
url == this.sanitize( url ) || |
180
|
|
|
this.get_domain( url ) == location.hostname |
181
|
|
|
); |
182
|
|
|
}, |
183
|
|
|
|
184
|
|
|
/** |
185
|
|
|
* Checks if the href passed is an anchor link for url passed. |
186
|
|
|
* |
187
|
|
|
* @param {string} href |
188
|
|
|
* @param {string} url |
189
|
|
|
* @return {boolean} |
190
|
|
|
*/ |
191
|
|
|
is_anchor: function(href, url){ |
192
|
|
|
return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url); |
193
|
|
|
}, |
194
|
|
|
|
195
|
|
|
/** |
196
|
|
|
* Fetch the next url from the que and run the tests on it |
197
|
|
|
*/ |
198
|
|
|
fetch_and_test: function(){ |
199
|
|
|
if( !this.que || this.que.length < 1 || this.que.length < 1 || $.active > 2 ) return false; |
200
|
|
|
|
201
|
|
|
var url = this.que.pop(); |
202
|
|
|
this.crawling.push(url); |
203
|
|
|
|
204
|
|
|
$.ajax({ |
205
|
|
|
url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json' |
206
|
|
|
}) |
207
|
|
|
.done(function( result ) { |
208
|
|
|
if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) { |
209
|
|
|
if( !crawler.is_external(result['url_fetched']) ) { |
210
|
|
|
url = crawler.sanitize(result['url_fetched']); |
211
|
|
|
if(crawler.tested.indexOf(url) >= 0){ |
212
|
|
|
this.skipped = true; |
213
|
|
|
return true; |
214
|
|
|
} |
215
|
|
|
|
216
|
|
|
var html = $(crawler.strip_img_src(result['body'])); |
217
|
|
|
crawler.trigger('CRAWL_BEFORE_TESTS', [url]); |
218
|
|
|
crawler.fetch_links(html, url); |
219
|
|
|
crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']); |
220
|
|
|
crawler.trigger('CRAWL_AFTER_TESTS', [url]); |
221
|
|
|
return true; |
222
|
|
|
} |
223
|
|
|
} |
224
|
|
|
crawler.failed.push(url); |
225
|
|
|
return crawler.trigger('CRAWL_LOAD_FAILED', [url]); |
226
|
|
|
}) |
227
|
|
|
.fail( function(){ |
228
|
|
|
crawler.failed.push(url); |
229
|
|
|
return crawler.trigger('CRAWL_LOAD_FAILED', [url]); |
230
|
|
|
}) |
231
|
|
|
.always( function(){ |
232
|
|
|
if((this.hasOwnProperty('skipped') && this.skipped) || crawler.tested.indexOf(url) < 0 ) { |
233
|
|
|
crawler.tested.push(url) |
234
|
|
|
} |
235
|
|
|
return crawler.trigger('CRAWL_FINISHED', [url]); |
236
|
|
|
}); |
|
|
|
|
237
|
|
|
}, |
238
|
|
|
|
239
|
|
|
/** |
240
|
|
|
* Check for links in the html of the rendered page so we add them to the que |
241
|
|
|
* and also map how pages are linked to each other |
242
|
|
|
* |
243
|
|
|
* @param {jQuery} html |
244
|
|
|
* @param {string} url |
245
|
|
|
*/ |
246
|
|
|
fetch_links: function(html, url){ |
247
|
|
|
$.each(html.find('a'), function(){ |
248
|
|
|
var href = $(this).attr('href'), |
249
|
|
|
link = crawler.sanitize(href); |
250
|
|
|
|
251
|
|
|
crawler.que_url( href ); |
252
|
|
|
|
253
|
|
|
if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url]; |
254
|
|
|
else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url); |
255
|
|
|
}); |
256
|
|
|
}, |
257
|
|
|
|
258
|
|
|
/** |
259
|
|
|
* Run the registered tests |
260
|
|
|
* |
261
|
|
|
* @param {string} url |
262
|
|
|
* @param {jQuery} html |
263
|
|
|
* @param {Array} headers |
264
|
|
|
* @param {Array} field_data |
265
|
|
|
* @param {Array} phrases |
266
|
|
|
*/ |
267
|
|
|
run_tests: function(url, html, headers, field_data, phrases){ |
268
|
|
|
for(var t in this.tests) { |
269
|
|
|
this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]); |
270
|
|
|
this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]); |
271
|
|
|
this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]); |
272
|
|
|
} |
273
|
|
|
}, |
274
|
|
|
|
275
|
|
|
/** |
276
|
|
|
* Trigger event callback and pass on the data |
277
|
|
|
* |
278
|
|
|
* @param {string} event |
279
|
|
|
* @param {*} data |
280
|
|
|
*/ |
281
|
|
|
trigger: function(event, data){ |
282
|
|
|
if(this.events.hasOwnProperty(event)) |
283
|
|
|
for(var e in this.events[event]) this.events[event][e].apply(this, data); |
284
|
|
|
}, |
285
|
|
|
|
286
|
|
|
/** |
287
|
|
|
* Register callback on action |
288
|
|
|
* |
289
|
|
|
* @param {string} event |
290
|
|
|
* @param {function} callback |
291
|
|
|
* @returns {crawler} |
292
|
|
|
*/ |
293
|
|
|
on: function(event, callback){ |
294
|
|
|
if(!this.events.hasOwnProperty(event)) this.events[event] = []; |
295
|
|
|
this.events[event].push(callback); |
296
|
|
|
}, |
297
|
|
|
|
298
|
|
|
    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages when the body is parsed into a jQuery fragment.
     *
     * NOTE(review): the character class ['|"] also matches a literal '|'
     * — presumably unintended but harmless for normal markup; and the
     * pattern removes src attributes on any tag, not just <img>.
     *
     * @param {string} html - raw page markup
     * @returns {string} markup with src="..."/src='...' attributes removed
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },
308
|
|
|
|
309
|
|
|
/** |
310
|
|
|
* Return the proxy url to test the passed url |
311
|
|
|
* |
312
|
|
|
* @param {$string} url |
313
|
|
|
* @returns {string} |
314
|
|
|
*/ |
315
|
|
|
get_proxy: function(url){ |
316
|
|
|
return location.protocol + '//' + location.hostname + '/seotest/getPageData?u='+url; |
317
|
|
|
}, |
318
|
|
|
|
319
|
|
|
/** |
320
|
|
|
* @see crawler_painter.add_row(name, data) |
321
|
|
|
* @param {string} name |
322
|
|
|
* @param {Array} data |
323
|
|
|
*/ |
324
|
|
|
add_row: function(name, data){ |
325
|
|
|
crawler_painter.add_row(name, data); |
326
|
|
|
}, |
327
|
|
|
|
328
|
|
|
/** |
329
|
|
|
* Returns the word count for a given set of sentences or string |
330
|
|
|
* |
331
|
|
|
* @param {string|array} data |
332
|
|
|
* @returns {number} |
333
|
|
|
*/ |
334
|
|
|
get_word_count: function(data){ |
335
|
|
|
if( typeof data === 'string' ) return data.split(' ').length; |
336
|
|
|
|
337
|
|
|
var count = 0; |
338
|
|
|
for( var d in data ) count += data[d].split(' ').length; |
339
|
|
|
return count; |
340
|
|
|
}, |
341
|
|
|
|
342
|
|
|
/** |
343
|
|
|
* Set an arbitrary property on the crawler object |
344
|
|
|
* |
345
|
|
|
* @param {string} property |
346
|
|
|
* @param {string|int} key |
347
|
|
|
* @param {*} val |
348
|
|
|
* @return undefined |
349
|
|
|
*/ |
350
|
|
|
set_property: function(property, key, val){ |
351
|
|
|
if(!this.hasOwnProperty(property)) this[property] = {}; |
352
|
|
|
if(!this[property].hasOwnProperty(key)) this[property][key] = [val]; |
353
|
|
|
else this[property][key].push(val); |
354
|
|
|
}, |
355
|
|
|
|
356
|
|
|
/** |
357
|
|
|
* Start the crawler |
358
|
|
|
* |
359
|
|
|
* @param {object} settings |
360
|
|
|
* @throws Exception |
361
|
|
|
*/ |
362
|
|
|
init: function(settings){ |
363
|
|
|
this.trigger('BEFORE_INIT', []); |
364
|
|
|
|
365
|
|
|
if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']); |
366
|
|
|
if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']); |
367
|
|
|
|
368
|
|
|
if( !this.crawl_id ) throw "crawl_id must be specified"; |
369
|
|
|
|
370
|
|
|
// When a crawl finishes, start a new one if there are any more urls to go through else stop the auto-restart |
371
|
|
|
this.on('CRAWL_FINISHED', function(){ |
372
|
|
|
if( crawler.que.length > 0 ) crawler.fetch_and_test(); |
373
|
|
|
else window.clearInterval(crawler.interval); |
374
|
|
|
}); |
375
|
|
|
|
376
|
|
|
// Every second try to initialize a new crawl request just in-case something crashes |
377
|
|
|
this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000); |
378
|
|
|
|
379
|
|
|
crawler_painter.init(); |
380
|
|
|
this.trigger('AFTER_INIT', []); |
381
|
|
|
} |
382
|
|
|
}; |
383
|
|
|
|