const crawler = {

    que : [],
    tested : [],
    crawling : [],
    failed : [],
    tests : [],
    ignore_paths : [],
    crawl_id : undefined,
    events : {},
    linked_from : {},
    useragent : 'desktop',

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);

        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so, and it hasn't
     * been added or crawled before, add it to the queue.
     *
     * @param {string} url
     * @returns {boolean} true if the url was added to the queue
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },
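
    // A rough illustration of que_url's contract (hypothetical urls):
    //   crawler.que_url('/pricing')  -> true   (new url, queued as 'pricing')
    //   crawler.que_url('/pricing/') -> false  (sanitizes to the same queue entry)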

    /**
     * Clean up a url so it becomes relative and standardized.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },
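
    // Sample input/output pairs (hypothetical urls):
    //   sanitize('https://example.com/about/') -> 'about'
    //   sanitize('/pricing#features')          -> 'pricing'
    //   sanitize('https://example.com/?')      -> '/'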

    /**
     * Get the domain for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },
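
    // For example (hypothetical hosts):
    //   get_domain('https://example.com:8080/a/b') -> 'example.com'
    //   get_domain('example.com/a/b')              -> 'example.com'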

    /**
     * Checks if the passed url should be ignored or not.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling.
     * Note: Paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Replace all ignore paths with the paths specified.
     * Note: Paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id.
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if the url is a valid candidate for crawling.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return !(this.crawling.indexOf(url) >= 0 || this.tested.indexOf(url) >= 0 ||
            this.is_file(url) || this.ignore_url(url) || this.is_external(url));
    },

    /**
     * Does a soft check for the url passed and checks if it's a file,
     * by checking if it has an extension and if the extension contains 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },
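
    // The extension heuristic in practice (hypothetical paths):
    //   is_file('assets/logo.png') -> true   ('png' does not contain 'html')
    //   is_file('about.html')      -> false
    //   is_file('blog/post')       -> false  (no extension at all)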

    /**
     * Does some soft checking for the url passed to see if it's external.
     * Note: If the url is internal but redirects to an external source, we won't detect it here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },
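
    // Assuming the crawl is running on https://example.com:
    //   is_external('https://other.com/x')   -> true   (different host)
    //   is_external('/contact')              -> false  (relative url)
    //   is_external('https://example.com/a') -> false  (same host)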

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @returns {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Fetch the next url from the queue and run the tests on it.
     */
    fetch_and_test: function(){
        // Bail out if the queue is empty or too many requests are already in
        // flight ($.active is jQuery's count of active ajax requests).
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( url ), data: { agent: this.useragent }, accepts: 'json', dataType: 'json'
        })
        .done(function( result ) {
            if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                if( !crawler.is_external(result['url_fetched']) ) {
                    url = crawler.sanitize(result['url_fetched']);
                    if(crawler.tested.indexOf(url) >= 0){
                        // Already tested (e.g. the fetch redirected to a known page);
                        // flag it on the ajax context so .always() can see it.
                        this.skipped = true;
                        return true;
                    }

                    var html = $(crawler.strip_img_src(result['body']));
                    crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                    crawler.fetch_links(html, url);
                    crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                    crawler.trigger('CRAWL_AFTER_TESTS', [url]);
                    return true;
                }
            }
            crawler.failed.push(url);
            return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
        })
        .fail( function(){
            crawler.failed.push(url);
            return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
        })
        .always( function(){
            if((this.hasOwnProperty('skipped') && this.skipped) || crawler.tested.indexOf(url) < 0 ) {
                crawler.tested.push(url);
            }
            return crawler.trigger('CRAWL_FINISHED', [url]);
        });
    },

    /**
     * Check for links in the html of the rendered page so we can add them to the queue,
     * and also map how pages are linked to each other.
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callbacks and pass on the data.
     *
     * @param {string} event
     * @param {*} data
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register a callback on an event.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        return this;
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['"].*?['"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + '/seotest/getPageData?u='+url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },
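
    // For example:
    //   get_word_count('lorem ipsum dolor') -> 3
    //   get_word_count(['a b', 'c d e'])    -> 5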

    /**
     * Set an arbitrary property on the crawler object.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @returns {undefined}
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },
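
    // Values accumulate per key; e.g. with a hypothetical 'notes' property:
    //   crawler.set_property('notes', 'missing_alt', '/about');
    //   crawler.set_property('notes', 'missing_alt', '/team');
    //   crawler.notes -> { missing_alt: ['/about', '/team'] }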

    /**
     * Start the crawler.
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // When a crawl finishes, start a new one if there are any more urls
        // to go through; otherwise stop the auto-restart.
        this.on('CRAWL_FINISHED', function(){
            if( crawler.que.length > 0 ) crawler.fetch_and_test();
            else window.clearInterval(crawler.interval);
        });

        // Every second, try to initialize a new crawl request in case something crashes.
        this.interval = setInterval(function(){ crawler.fetch_and_test(); }, 1000);

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
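
// A minimal end-to-end sketch of how these pieces fit together. This example is
// not from the original source: it assumes the crawler_painter helper and the
// /seotest/getPageData proxy endpoint (see get_proxy above) are available, and
// the test body itself is hypothetical.
crawler.register_test('links', 'Link counts', ['URL', 'Links found'], function(cont, url, html, headers, field_data, phrases){
    // Hypothetical test: report how many <a> tags each crawled page contains,
    // using the same html.find('a') lookup that fetch_links relies on.
    crawler.add_row('links', [url, html.find('a').length]);
});

crawler.on('CRAWL_LOAD_FAILED', function(url){
    console.log('Failed to load: ' + url);
});

crawler.que_url('/'); // seed the queue with the homepage
crawler.init({ crawl_id: 1, ignore_paths: ['^admin'] });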