const crawler = {

    que : [],                    // urls waiting to be crawled
    tested : [],                 // urls that have been crawled and tested
    crawling : [],               // urls currently being fetched
    failed : [],                 // urls that failed to load
    tests : [],                  // registered test definitions
    ignore_paths : [],           // regex strings for paths to skip
    crawl_id : undefined,        // identifier for the current crawl
    linked_from : {},            // maps each url to the pages that link to it
    redirects : {},              // maps each requested url to its redirect target
    useragent : 'desktop',       // user agent profile sent to the fetch proxy
    event_handler : crawler_event_handler, // declared elsewhere
    painter : crawler_painter,             // declared elsewhere

    /**
     * Register a test to run.
     *
     * @param {string} name
     * @param {string} title
     * @param {Array} headers
     * @param {*} callable
     * @returns {undefined}
     * @throws Exception
     */
    register_test: function(name, title, headers, callable){
        if(name == undefined || this.get_test_by_name(name)) throw 'Invalid name specified for your test';
        if(title == undefined) throw 'Title not specified';
        if(!(headers instanceof Array) || headers.length < 1) throw 'Headers array is invalid';
        // Without a callback there is nothing to run; just create the output container.
        if(typeof callable != 'function') return this.painter.create(name, title, headers);
        this.tests.push({name: name, title: title, callback: callable, cont: this.painter.create(name, title, headers)});
        return undefined;
    },

    /**
     * Return a registered test by name
     *
     * @param {string} name
     * @returns {object|false}
     */
    get_test_by_name: function(name){
        for(var t in this.tests) if(this.tests[t]['name'] == name) return this.tests[t];
        return false;
    },

    /**
     * Check if the url passed is valid for crawling; if so, and it hasn't
     * been added or crawled before, add it to the queue.
     *
     * Returns true if the url was added to the queue, false otherwise.
     *
     * @param {string} url
     * @returns {boolean}
     */
    que_url: function(url){
        var sanitized = this.sanitize(url);
        if( !this.can_crawl(url) || this.que.indexOf(sanitized) > -1 || !this.can_crawl(sanitized) ) return false;
        this.que.push(sanitized);
        return true;
    },

    /**
     * Clean up a url so it becomes relative and standardized
     *
     * @param {string} url
     * @returns {string}
     */
    sanitize: function(url){
        if(url == undefined) return '';

        url = url
            .replace(/^\/|\/$/g, '')           // trim leading/trailing slashes
            .replace(/https?:\/\/[^\/]+/i, '') // drop the protocol and host
            .replace(/^\/|\/$/g, '')           // trim again after removing the host
            .split('#')[0];                    // drop any fragment

        if( url.slice(-1) == '?' ) url = url.slice(0, -1); // drop an empty query string
        if( url.length < 1 ) url = '/';

        return url;
    },

    /**
     * Get the domain for the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_domain: function(url){
        if( !url ) return '';
        if( url.indexOf("://") > -1 ) return url.split('/')[2].split(':')[0];
        else return url.split('/')[0].split(':')[0];
    },

    /**
     * Checks if the passed url should be ignored or not
     *
     * @param {string} url
     * @returns {boolean}
     */
    ignore_url: function( url ){
        for(var regex in this.ignore_paths) {
            var reg = new RegExp(this.ignore_paths[regex], 'i');
            if( url.match(reg) != null ) return true;
        }
        return false;
    },

    /**
     * Update all ignore paths to the paths specified
     * Note: Paths may be given as regex strings
     *
     * @param {Array} paths
     * @returns {crawler}
     */
    set_ignore_paths: function(paths){
        this.ignore_paths = paths;
        return this;
    },

    /**
     * Sets the crawl id
     *
     * @param crawl_id
     * @returns {crawler}
     */
    set_crawl_id: function(crawl_id){
        this.crawl_id = crawl_id;
        return this;
    },

    /**
     * Does some soft checks to determine if the url is a valid candidate for crawling
     *
     * @param {string} url
     * @returns {boolean}
     */
    can_crawl: function(url){
        if(url == undefined) return false;
        return this.crawling.indexOf(url) < 0 && this.tested.indexOf(url) < 0 && this.que.indexOf(url) < 0 &&
            !this.is_file(url) && !this.ignore_url(url) && !this.is_external(url);
    },

    /**
     * Does a soft check to decide whether the passed url points to a file,
     * by checking if it has an extension and whether that extension contains 'html'
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_file: function(url){
        var split = this.sanitize( url ).split( '.' );
        return split.length > 1 && split.pop().indexOf( 'html' ) < 0;
    },

    /**
     * Does some soft checking on the passed url to see if it's external
     * Note: If the url is internal but redirects to an external source, we won't detect it here
     *
     * @param {string} url
     * @returns {boolean}
     */
    is_external: function(url){
        return !(
            url.length < 1 ||
            url[0] == '/' ||
            url[0] == '#' ||
            url.indexOf('://') < 0 ||
            url == this.sanitize( url ) ||
            this.get_domain( url ) == location.hostname
        );
    },

    /**
     * Checks if the href passed is an anchor link for the url passed.
     *
     * @param {string} href
     * @param {string} url
     * @return {boolean}
     */
    is_anchor: function(href, url){
        return href.indexOf('#') >= 0 && this.sanitize(href) == this.sanitize(url);
    },

    /**
     * Check if the target we requested matches the response we got.
     * If not, record it as a redirect and queue the redirect target to be crawled.
     *
     * @param {string} target
     * @param {string} response
     * @return {boolean}
     */
    check_fetched_url: function(target, response){
        if(target != response){
            this.redirects[target] = response;
            this.que_url(response);
            return false;
        }

        return true;
    },

    /**
     * Checks if the string passed is an html page
     *
     * @param {string} html
     * @returns {boolean}
     */
    is_html: function(html){
        return html.indexOf('<head') > 0 && html.indexOf('<body') > 0;
    },

    /**
     * Fetch the next url from the queue and run the tests on it
     */
    fetch_and_test: function(){
        // Throttle: wait if the queue is empty or too many requests are in flight.
        if( !this.que || this.que.length < 1 || $.active > 2 ) return false;

        var url = this.que.pop();
        this.crawling.push(url);

        $.ajax({
            url: this.get_proxy( '/seotest/getPageData?u=' + encodeURIComponent(url) ),
            data: { agent: this.useragent },
            dataType: 'json'
        })
        .done(function( result ) {
            var fetched = crawler.sanitize(result['url_fetched']);
            if( !result['headers'] || !result['body'] ) {
                return crawler.failed_url(url);
            }else if(!crawler.check_fetched_url(url, fetched)){
                // 'this' is the ajax settings object, which jQuery shares with .always() below.
                this.skipped = true;
                return crawler.event_handler.trigger('CRAWL_FOUND_REDIRECT', [url, fetched]);
            }else if(crawler.is_html(result['body'])){
                var html = $(crawler.strip_img_src(result['body']));
                crawler.fetch_links(html, url);
                return crawler.run_tests(url, html, result['headers'], result['field_data'], result['phrases']);
            }else{
                this.skipped = true;
            }
        })
        .fail( function(){
            return crawler.failed_url(url);
        })
        .always( function(){
            crawler.crawling.splice(crawler.crawling.indexOf(url), 1);

            // Only count the url as tested if it wasn't skipped as a redirect or non-html page.
            if(!this.hasOwnProperty('skipped')){
                crawler.tested.push(url);
            }

            crawler.event_handler.trigger('CRAWL_FINISHED', [url]);

            if( crawler.que.length < 1 && crawler.crawling.length < 1){
                crawler.event_handler.trigger('ALL_CRAWLS_FINISHED', []);
            }

            return crawler.fetch_and_test();
        });
    },

    /**
     * Look for links in the html of the rendered page so we can add them to the queue,
     * and also map how pages are linked to each other
     *
     * @param {jQuery} html
     * @param {string} url
     */
    fetch_links: function(html, url){
        $.each(html.find('a'), function(){
            var href = $(this).attr('href'),
                link = crawler.sanitize(href);

            crawler.que_url( href );

            if(!crawler.linked_from.hasOwnProperty(link)) crawler.linked_from[link] = [url];
            else if( crawler.linked_from[link].indexOf(url) < 0 ) crawler.linked_from[link].push(url);
        });
    },

    /**
     * Run the registered tests
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {Array} headers
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {undefined}
     */
    run_tests: function(url, html, headers, field_data, phrases){
        this.event_handler.trigger('CRAWL_BEFORE_TESTS', [url]);
        for(var t in this.tests) {
            this.event_handler.trigger('before'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
            this.tests[t]['callback'].apply(this.tests[t], [url, html, headers, field_data, phrases]);
            this.event_handler.trigger('after'+this.tests[t]['name'], [url, html, headers, field_data, phrases]);
        }
        return this.event_handler.trigger('CRAWL_AFTER_TESTS', [url]);
    },

    /**
     * Strip out src="<anything>" so that we avoid loading the images
     * on the pages
     *
     * @param {string} html
     * @returns {string}
     */
    strip_img_src: function(html){
        return html.replace( /src\s*=\s*(['"]).*?\1/ig, '' );
    },

    /**
     * Return the proxy url to test the passed url
     *
     * @param {string} url
     * @returns {string}
     */
    get_proxy: function(url){
        return location.protocol + '//' + location.hostname + url;
    },

    /**
     * Returns the word count for a given string or set of sentences
     *
     * @param {string|Array} data
     * @returns {number}
     */
    get_word_count: function(data){
        if( typeof data === 'string' ) return data.split(' ').length;

        var count = 0;
        for( var d in data ) count += data[d].split(' ').length;
        return count;
    },

    /**
     * Set an arbitrary property on the crawler object,
     * collecting repeated values for the same key into an array
     *
     * @param {string} property
     * @param {string|int} key
     * @param {*} val
     * @return undefined
     */
    set_property: function(property, key, val){
        if(!this.hasOwnProperty(property)) this[property] = {};
        if(!this[property].hasOwnProperty(key)) this[property][key] = [val];
        else this[property][key].push(val);
    },

    /**
     * Add the failed url to the failed list and trigger the failed event
     *
     * @param {string} url
     * @returns {undefined}
     */
    failed_url: function(url){
        this.failed.push(url);
        return this.event_handler.trigger('CRAWL_LOAD_FAILED', [url]);
    },

    /**
     * Triggered every second
     *
     * @returns {undefined}
     */
    loop: function(){
        this.event_handler.trigger('CRAWLER_LOOP', [this]);
        this.fetch_and_test();
        return undefined;
    },

    /**
     * Start the crawler
     *
     * @param {object} settings
     * @throws Exception
     */
    init: function(settings){
        this.event_handler.trigger('BEFORE_INIT', [this]);

        if(settings.hasOwnProperty('crawl_id')) this.set_crawl_id(settings['crawl_id']);
        if(settings.hasOwnProperty('ignore_paths')) this.set_ignore_paths(settings['ignore_paths']);

        if( !this.crawl_id ) throw "crawl_id must be specified";

        this.interval = setInterval(function(){ crawler.loop(); }, 1000);
        this.event_handler.on('ALL_CRAWLS_FINISHED', function(){ window.clearInterval( crawler.interval ); });

        this.painter.init();
        this.event_handler.trigger('AFTER_INIT', [this]);
    }
};

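For context, here is a minimal usage sketch. The test name, title, headers, and the way results are written into this.cont are hypothetical; they stand in for whatever API crawler_painter actually exposes.

// Hypothetical example: flag pages that don't have exactly one <h1>.
crawler.register_test('h1_count', 'H1 count per page', ['URL', 'H1s'],
    function(url, html, headers, field_data, phrases){
        var count = html.find('h1').length;
        // 'this' is the test object, so this.cont is whatever painter.create()
        // returned (assumed here to be a jQuery element wrapping a table body).
        if( count != 1 ) this.cont.append('<tr><td>' + url + '</td><td>' + count + '</td></tr>');
    });

crawler.que_url('/');                                     // seed the queue with the home page
crawler.init({ crawl_id: 1, ignore_paths: ['^logout'] }); // starts the one-second crawl loop
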
This check looks for references to variables that have not been declared. A warning here usually points to a typographical error, or to a variable that has been renamed. To learn more about declaring variables in JavaScript, see MDN.
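In this file, that warning would fire for $, crawler_event_handler, and crawler_painter, which are defined elsewhere. Assuming a JSHint-style linter, one way to satisfy the check is to declare them as globals at the top of the file:

// Tell the linter these externally-defined dependencies exist.
/* global $, crawler_event_handler, crawler_painter */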