const crawler_file_tester = {

    // Entries look like { rule: <regex source>, agent: <user-agent>, original: <raw line> }
    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {string} result
     * @throws {Error}
     */
    parse_robots_file: function(result){
        var rules = result.split("\n");
        crawler_painter.add_row('file_tests', [ crawler_painter.create_status('success', 'Robots file loaded') ]);

        var agent = '*';
        for(var r in rules){
            var line = rules[r].replace(/^\s+|\s+$/g, ''); // trim, including any trailing \r
            var low = line.toLowerCase();
            // Skip blanks, comments and directives we don't test (Sitemap, Allow, Crawl-delay)
            if( line.length < 1 || line.indexOf('#') === 0 || low.indexOf('sitemap:') >= 0
                    || low.indexOf('allow:') === 0 || low.indexOf('crawl-delay:') >= 0 ){
                continue;
            }else if( low.indexOf('user-agent:') >= 0 ){
                agent = line.replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( low.indexOf('disallow:') >= 0 ){
                var path = line
                    .replace(/disallow:/gi, '')           // remove the directive name
                    .replace(/^\s+|\s+$|\s+(?=\s)/g, ''); // remove white space
                if( path.length < 1 ){ continue; }        // an empty Disallow blocks nothing
                var rule = '^' + path
                    .replace(/\./g, '\\.')    // escape dots so they match literally
                    .replace(/\?/g, '\\?')    // escape query string markers
                    .replace(/\|/g, '\\|')    // escape pipes
                    .replace(/\//g, '\\/')    // escape slashes
                    .replace(/\*/g, '(.*?)'); // replace robots wildcards with a non-greedy match-anything
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': line });
            }else{
                console.log(line);
                throw new Error("Found a rule which we don't understand. Report it to the developer");
            }
        }
    },
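
    // A minimal sketch of the transformation above, using a hypothetical robots
    // file (not part of this module). Given the lines:
    //
    //   User-agent: *
    //   Disallow: /private/*.html
    //
    // parse_robots_file would push roughly:
    //
    //   { rule: '^\\/private\\/(.*?)\\.html', agent: '*', original: 'Disallow: /private/*.html' }
    //
    // and new RegExp('^\\/private\\/(.*?)\\.html').test('/private/old.html')
    // would then return true.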

    /**
     * Check every tested URL and flag any that are blocked by a rule in the robots file
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    // No 'g' flag: test() on a global regex keeps state via lastIndex
                    var regex = new RegExp(this.robot_rules[r]['rule']);
                    if (regex.test('/' + url)) {
                        var link = crawler_painter.create_link(url, url),
                            status = crawler_painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler_painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }
    },
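
    // Example (sketch) of the check above, with hypothetical data: if robot_rules
    // holds { rule: '^\\/admin' } and crawler.tested contains 'admin/login',
    // then new RegExp('^\\/admin').test('/admin/login') is true and the URL is
    // reported in the 'blocked_pages' table along with the pages linking to it.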

    /**
     * Set up an AJAX call to fetch a URL through the proxy
     *
     * @param {string} url
     * @param {function} callback
     * @param {function} failed_callback
     */
    get_file_contents: function(url, callback, failed_callback){
        $.ajax({
            'url': crawler.get_proxy('/seotest/getPage?u='+encodeURIComponent(url)+'&agent='+encodeURIComponent(crawler.agent))
        }).done(callback).fail(failed_callback);
    },
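
    // Usage sketch (the URL here is hypothetical):
    //
    //   crawler_file_tester.get_file_contents(
    //       'https://example.com/robots.txt',
    //       function(body){ console.log(body.length + ' bytes'); },
    //       function(){ console.log('fetch failed'); }
    //   );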

    /**
     * Start testing the robots page
     */
    init_robots_tester: function(){
        // Stub: the robots fetch is currently kicked off by the AFTER_INIT handler below.
    }

};

// Register the tests
crawler.on('BEFORE_INIT', function(){
    crawler.regiser_test('file_tests', 'FILE TESTS', ['Status'], false);
    crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);

    crawler_painter.set_type('blocked_pages', 'success');
});

// Start up the file testers
crawler.on('AFTER_INIT', function(){
    crawler_file_tester.get_file_contents(
        crawler.robots_url,
        crawler_file_tester.parse_robots_file,
        function(){ crawler_painter.add_status_row('file_tests', 'error', 'Failed to load robots file'); }
    );
    //crawler_file_tester.init_sitemap_tester();
});

// Test for blocked pages when the crawler finishes
crawler.on('ALL_CRAWLS_FINISHED', function(){
    crawler_file_tester.test_blocked_pages();
});