1
|
|
|
const crawler = {

    // ---- crawl state ------------------------------------------------------
    que : [],              // sanitized urls waiting to be fetched
    tested : [],           // urls that have been fetched and tested
    crawling : [],         // urls currently in-flight
    failed : [],           // urls whose fetch failed or returned unusable html
    tests : [],            // registered tests: {name, title, callback, cont}
    ignore_paths : [],     // regex strings; matching urls are never queued
    crawl_id : undefined,  // identifier for this crawl run (required by init)
    events : {},           // event name -> array of callbacks (see on/trigger)
    linked_from : {},      // sanitized url -> list of urls that link to it
    redirects : {},        // requested url -> url the server actually returned
    useragent : 'desktop', // agent flag forwarded to the proxy endpoint

    /**
     * Register a test to run.
     *
     * NOTE: the method name keeps its historical typo ("regiser") because it
     * is part of the public interface; prefer register_test for new code.
     *
     * @param {string} name     unique test identifier
     * @param {string} title    human readable title for the results container
     * @param {Array} headers   column headers for the results container
     * @param {*} callable      test body; when not a function only the
     *                          results container is created
     * @returns {undefined}
     * @throws Exception (string) when name/title/headers are invalid
     */
    regiser_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // No callback: nothing to run later, just paint the container.
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont:crawler_painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Correctly spelled alias for regiser_test (same contract).
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        return this.regiser_test(name, title, headers, callable);
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false} the test record, or false when not registered
     */
    get_test_by_name: function(name){
        // BUG FIX: previously iterated "this.test" (undefined), so lookups
        // always returned false and duplicate test names were never detected
        // by regiser_test.
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling, if so and it hasn't
     * been added or crawled before, add it to the que
     *
     * Returns true|false if added to que
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // Check both the raw and the sanitized form: either may already be
        // known, external, a file, or on the ignore list.
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized)) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * Strips the scheme+host, leading/trailing slashes, any #fragment and a
     * trailing bare "?". An empty result becomes '/'.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')
            .replace(/https?:\/\/[^\/]+/i, '')
            .replace(/^\/|\/$/g, '')
            .split('#')[0];

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },

    /**
     * Get the domain for the passed url
     *
     * Works with and without a scheme; any :port suffix is stripped.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * Each entry in ignore_paths is treated as a case-insensitive regex.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling
     * Note: Paths can be in regex format
     *
     * @param {string} path
     * @returns {crawler} chainable
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Path can be in regex format
     *
     * @param {Array} paths
     * @returns {crawler} chainable
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param {*} crawl_id
     * @returns {crawler} chainable
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
            !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check for the url passed and checks if it's a file
     * by checking if it has an extension and if the extension contains 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking for the url passed to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        // A url is internal when it is empty, root/fragment relative, has no
        // scheme, survives sanitize() unchanged, or points at this hostname.
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

    /**
     * Checks if the href passed is an anchor link for url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if that target we requested matches the response we got.
     * If not mark as a redirect and append the redirect to be crawled
     *
     * @param {string} target   url we asked for
     * @param {string} response url the server actually served
     * @return {boolean} true when no redirect happened
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the que and run the tests on it.
     *
     * Up to 3 requests run concurrently ($.active > 2 guard); init() primes
     * two of them and each completed request pulls in the next url.
     *
     * @returns {boolean|*} false when nothing was fetched
     */
    fetch_and_test: function(){
        // Nothing queued, or the concurrency budget is used up.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            accepts: 'json',
            dataType: 'json'
        })
        .done(function( result ) {
            if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                var fetched = crawler.sanitize(result['url_fetched']);
                if(!crawler.check_fetched_url(url, fetched)){
                    // "this" is the shared ajax settings object, so the
                    // .always() handler below can see this skipped flag.
                    this.skipped = true;
                    return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                }

                var html = $(crawler.strip_img_src(result['body']));
                crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                crawler.fetch_links(html, url);
                crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
            }else{
                // BUG FIX: was crawler.failed(url) — "failed" is the array of
                // failed urls, not a function, so this branch used to throw.
                return crawler.failed_url(url);
            }
        })
        .fail( function(){
            return crawler.failed_url(url);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Redirected urls are skipped, not marked tested (see .done).
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            return crawler.fetch_and_test();
        });
    },

    /**
     * Check for links in the html of the rendered page so we add them to the que
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url  the page the links were found on
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            // linked_from is keyed by the sanitized link; record each
            // referring page only once.
            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests, firing before<name>/after<name> events
     * around each one.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            // Callbacks run with the test record itself as "this".
            this.tests[t]['callback'].apply(this.tests[t], [url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger event callback and pass on the data
     *
     * @param {string} event
     * @param {*} data  argument list applied to each registered callback
     * @return {undefined}
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register callback on action
     *
     * @param {string} event
     * @param {function} callback
     * @returns {undefined}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
    },

    /**
     * Strip out src=<anything> so that we avoid loading the images
     * on the pages when the body is parsed into a jQuery fragment.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /(src).*?=(['|"].*?['|"])/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url  path portion, e.g. '/seotest/getPageData?u=...'
     * @returns {string} absolute url on the current host
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given set of sentences or string
     *
     * Note: counts space-separated tokens, so the count includes empty or
     * punctuation-only tokens produced by consecutive spaces.
     *
     * @param {string|array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object.
     * Values accumulate: repeated calls with the same property/key append
     * to a list rather than overwrite.
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {undefined}
     */
    failed_url: function(url){
        this.failed.push(url);
        return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Start the crawler
     *
     * Applies recognised settings (crawl_id, ignore_paths), validates that a
     * crawl_id was supplied, primes two concurrent fetches and initialises
     * the results painter.
     *
     * @param {object} settings
     * @throws Exception (string) when crawl_id is missing
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // Prime two parallel workers; each completed fetch chains the next.
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
416
|
|
|
|