const crawler = {

    que          : [],        // urls waiting to be crawled
    tested       : [],        // urls that have been crawled and tested
    crawling     : [],        // urls currently being fetched
    failed       : [],        // urls that failed to load
    tests        : [],        // registered tests
    ignore_paths : [],        // path patterns (regex) to skip
    crawl_id     : undefined, // identifier for the current crawl
    events       : {},        // event name => array of callbacks
    linked_from  : {},        // url => array of urls that link to it
    redirects    : {},        // requested url => url actually served
    useragent    : 'desktop', // user agent profile sent to the proxy

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {function} [callable]
     * @returns {*} undefined, or the painter container when no callback is given
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        if(typeof callable != 'function') return crawler_painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: crawler_painter.create(name, title, headers)});
        return undefined;
    },

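    // Example (illustrative only; the test name and headers are hypothetical):
    // register a test that reports how many links each crawled page contains.
    // The callback receives the container created by crawler_painter, then the
    // page data passed in by run_tests().
    //
    //   crawler.register_test('links', 'Link count', ['URL', 'Links'], function(cont, url, html, headers){
    //       crawler.add_row('links', [url, html.find('a').length]);
    //   });
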
    /**
     * Return a registered test by name.
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check that the passed url is a valid crawl candidate and, if it hasn't
     * been queued or crawled before, add it to the queue.
     *
     * Returns true if the url was added to the queue.
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        // can_crawl() already checks the queue, so testing both the raw and
        // the sanitized form covers both ways a url can reach us.
        if( !this.can_crawl(url) || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized.
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')           // strip leading/trailing slashes
            .replace(/https?:\/\/[^\/]+/i, '') // strip scheme and host
            .replace(/^\/|\/$/g, '')           // strip slashes again after host removal
            .split('#')[0];                    // drop any fragment

        if( url.slice(-1) == '?' ) url = url.slice(0, -1);
        if( url.length < 1 ) url = '/';

        return url;
    },

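    // Illustrative sanitize() results (assumed inputs, traced through the
    // replacements above):
    //
    //   crawler.sanitize('https://example.com/about/') -> 'about'
    //   crawler.sanitize('/pricing#plans')             -> 'pricing'
    //   crawler.sanitize('https://example.com/')       -> '/'
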
    /**
     * Get the domain for the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

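    // Illustrative get_domain() results (assumed inputs):
    //
    //   crawler.get_domain('https://example.com:8080/a/b') -> 'example.com'
    //   crawler.get_domain('example.com/a/b')              -> 'example.com'
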
    /**
     * Checks if the passed url should be ignored.
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Add a path to ignore when crawling.
     * Note: Paths can be in regex format.
     *
     * @param {string} path
     * @returns {crawler}
     */
    add_ignore_path: function(path){
        this.ignore_paths.push(path);
        return this;
    },

    /**
     * Replace all ignore paths with the paths specified.
     * Note: Paths can be in regex format.
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

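    // Example (hypothetical patterns): skip logout links and anything under
    // admin/. ignore_url() compiles each entry case-insensitively.
    //
    //   crawler.add_ignore_path('^logout')
    //          .add_ignore_path('^admin/');
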
    /**
     * Sets the crawl id.
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if the url is a valid candidate for crawling.
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
               !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Soft check for whether the passed url points to a file: it counts as a
     * file if it has an extension and that extension does not contain 'html'.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

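    // Illustrative is_file() results (assumed inputs):
    //
    //   crawler.is_file('assets/logo.png') -> true
    //   crawler.is_file('about.html')      -> false
    //   crawler.is_file('about')           -> false
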
    /**
     * Does some soft checking on the passed url to see if it's external.
     * Note: If the url is internal but redirects to an external source, we
     * won't detect it here.
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

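    // Illustrative is_external() results, assuming the crawler runs on
    // https://example.com:
    //
    //   crawler.is_external('https://other.com/page')   -> true
    //   crawler.is_external('https://example.com/page') -> false
    //   crawler.is_external('/page')                    -> false
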
    /**
     * Checks if the passed href is an anchor link for the passed url.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check that the target we requested matches the response we got.
     * If not, record it as a redirect and queue the redirect target to
     * be crawled.
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean}
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Fetch the next url from the queue and run the tests on it.
     */
    fetch_and_test: function(){
        // $.active is jQuery's count of in-flight requests; checking it caps
        // how many fetches run in parallel.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u='+url ),
            data: { agent: this.useragent },
            dataType: 'json'
        })
        .done(function( result ) {
            if(result['headers'] && result['body'] && result['body'].toLowerCase().indexOf('<head') >= 0) {
                var fetched = crawler.sanitize(result['url_fetched']);
                if(!crawler.check_fetched_url(url, fetched)){
                    // 'this' is the shared ajax settings object, so the flag
                    // is visible to the .always() handler below.
                    this.skipped = true;
                    return crawler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
                }

                var html = $(crawler.strip_img_src(result['body']));
                crawler.trigger('CRAWL_BEFORE_TESTS', [url]);
                crawler.fetch_links(html, url);
                crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
                return crawler.trigger('CRAWL_AFTER_TESTS', [url]);
            }else{
                crawler.failed.push(url);
                return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
            }
        })
        .fail( function(){
            crawler.failed.push(url);
            return crawler.trigger('CRAWL_LOAD_FAILED', [url]);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Redirected urls were skipped, not tested, so don't mark them.
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            // Chain straight into the next queued url.
            return crawler.fetch_and_test();
        });
    },

    /**
     * Scan the html of the rendered page for links, queue them for crawling,
     * and record how pages link to each other.
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     */
    run_tests: function(url, html, headers, field_data, phrases){
        for(var t in this.tests) {
            this.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [this.tests[t]['cont'], url, html, headers, field_data, phrases]);
            this.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
    },

    /**
     * Trigger an event's callbacks and pass on the data.
     *
     * @param {string} event
     * @param {*} data
     */
    trigger: function(event, data){
        if(this.events.hasOwnProperty(event))
            for(var e in this.events[event]) this.events[event][e].apply(this, data);
    },

    /**
     * Register a callback for an event.
     *
     * @param {string} event
     * @param {function} callback
     * @returns {crawler}
     */
    on: function(event, callback){
        if(!this.events.hasOwnProperty(event)) this.events[event] = [];
        this.events[event].push(callback);
        return this;
    },

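    // Example (illustrative): subscribe to the lifecycle events emitted from
    // fetch_and_test(). trigger() invokes each handler with the crawler as
    // 'this' and the event data spread as arguments.
    //
    //   crawler.on('CRAWL_FOUND_REDIRECT', function(url, fetched){
    //       console.log(url + ' redirected to ' + fetched);
    //   }).on('ALL_CRAWLS_FINISHED', function(){
    //       console.log('done; failed urls:', crawler.failed);
    //   });
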
    /**
     * Strip out src="..." attributes so that parsing the html doesn't make
     * the browser load the pages' images.
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        // The back-reference makes sure the closing quote matches the opener.
        return html.replace( /src\s*=\s*(['"]).*?\1/ig, '' );
    },

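    // Illustrative effect (assumed input):
    //
    //   crawler.strip_img_src('<img src="/a.png" alt="x">') -> '<img  alt="x">'
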
    /**
     * Return the proxy url used to test the passed url.
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * @see crawler_painter.add_row(name, data)
     * @param {string} name
     * @param {Array} data
     */
    add_row: function(name, data){
        crawler_painter.add_row(name, data);
    },

    /**
     * Returns the word count for a given string or set of sentences.
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

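    // Illustrative results (assumed inputs; the count is naive, splitting on
    // single spaces only):
    //
    //   crawler.get_word_count('three word phrase')       -> 3
    //   crawler.get_word_count(['one two', 'three four']) -> 4
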
    /**
     * Set an arbitrary keyed property on the crawler object. Values for the
     * same key accumulate in an array.
     *
     * @param {string} property
     * @param {string|number} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

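    // Illustrative behavior (assumed property and key names):
    //
    //   crawler.set_property('notes', 'about', 'missing h1');
    //   crawler.set_property('notes', 'about', 'thin copy');
    //   // crawler.notes -> { about: ['missing h1', 'thin copy'] }
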
    /**
     * Start the crawler.
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.trigger('BEFORE_INIT', []);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        // Two calls kick off two parallel fetch workers; each one chains into
        // the next queued url from its .always() handler.
        crawler.fetch_and_test();
        crawler.fetch_and_test();

        crawler_painter.init();
        this.trigger('AFTER_INIT', []);
    }
};
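// Putting it together (illustrative sketch; crawler_painter is assumed to be
// loaded first, the crawl_id and ignore pattern are hypothetical, and the
// queue needs a starting url before init() fires the first fetches):
//
//   crawler.que_url('/');
//   crawler.on('ALL_CRAWLS_FINISHED', function(){
//       console.log('crawl complete:', crawler.tested.length, 'pages tested,',
//                   crawler.failed.length, 'failed');
//   });
//   crawler.init({ crawl_id: 42, ignore_paths: ['^logout'] });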