const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {string} result The raw contents of the robots.txt file
     * @throws {Error}
     */
    parse_robots_file: function(result){
        var rules = result.split("\n"),
            status = crawler_painter.create_status('success', 'Robots file loaded');

        crawler_painter.add_row('robots_page', [ status, result.replace(/(?:\r\n|\r|\n)/g, '<br />') ]);

        var agent = '*';
        for(var r in rules){
            // Trim each line so stray whitespace and \r line endings don't confuse the checks below
            var line = rules[r].trim();

            if( line.length < 1 || line.toLowerCase().indexOf('sitemap:') >= 0 ){
                // Skip blank lines and sitemap declarations
                continue;
            }else if( line.toLowerCase().indexOf('user-agent:') >= 0 ){
                // Remember the current agent so the rules that follow are attributed to it
                agent = line.replace(/user-agent:/gi, '').trim();
            }else if( line.toLowerCase().indexOf('disallow:') >= 0 ){
                var rule = line.replace(/disallow:/gi, '').trim();
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent });
            }else{
                console.log(line);
                throw new Error("Found a rule which we don't understand. Report it to the developer");
            }
        }

        console.log(crawler_file_tester.robot_rules);
    },

    /**
     * Fetch a remote file and hand the response to the success or failure callback
     */
    get_file_contents: function(url, type, callback, failed_callback){
        $.ajax({ 'url': url, 'dataType': type }).done(callback).fail(failed_callback);
    },

    init: function(){
        // Robots
        crawler.regiser_test('robots_page', 'ROBOTS PAGE', ['Status', 'Content'], false);
        crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL'], this.test_blocked_pages);

        // Fetch the robots file, render its contents and parse the rules out of it
        crawler_file_tester.get_file_contents(
            crawler.robots_url,
            'text',
            crawler_file_tester.parse_robots_file,
            function(){
                var status = crawler_painter.create_status('error', 'Failed to load robots file');
                crawler_painter.add_row('robots_page', [ status ]);
            }
        );

        // Sitemap
    }
};

/**
 * Start up the file tester
 */
(function($){
    crawler.on('BEFORE_INIT', crawler_file_tester.init);
}(jQuery));
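
// Illustration only, not part of the module above: one way to exercise
// parse_robots_file on its own. The crawler_painter stub and the sample
// robots.txt below are assumptions made purely for this sketch; the real
// crawler_painter is defined elsewhere in the crawler.
var crawler_painter = {
    create_status: function(type, message){ return type + ': ' + message; },
    add_row: function(table, cells){ /* the real painter renders a table row */ }
};

var sample_robots = [
    'User-agent: *',
    'Disallow: /admin/',
    'Disallow: /tmp/',
    'Sitemap: https://example.com/sitemap.xml'
].join('\n');

crawler_file_tester.parse_robots_file(sample_robots);
// crawler_file_tester.robot_rules now contains:
// [ { rule: '/admin/', agent: '*' }, { rule: '/tmp/', agent: '*' } ]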