const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {string} result
     * @throws {string}
     */
    parse_robots_file: function(result){
        var rules = result.split("\n");
        $('#robots-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle"> </span>');

        var agent = '*';
        for(var r in rules){
            if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
                continue;
            }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
                var rule = ('^' + rules[r]
                    .replace(/disallow:/gi, '')          // strip the disallow prefix
                    .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // trim and collapse whitespace
                    .replace(/\?/g, '\\?')               // escape question marks (query string start)
                    .replace(/\|/g, '\\|')               // escape pipes
                    .replace(/\//g, '\\/')               // escape slashes
                    .replace(/^(\*)/g, '(.*?)'))         // replace a leading star with a non-greedy wildcard
                    .replace(/^\^\^/, '^');              // collapse a duplicated leading caret
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
            }else{
                console.log(rules[r]);
                throw "Found a rule which we don't understand. Report it to the developer";
            }
        }
    },
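
    // A quick trace of parse_robots_file on a hypothetical robots.txt
    // (the file content below is invented for illustration):
    //
    //   User-agent: *
    //   Disallow: /private
    //
    // The disallow line has its prefix and whitespace stripped, its slash
    // escaped, and a caret prepended, so robot_rules ends up holding
    // { 'rule': '^\\/private', 'agent': '*', 'original': 'Disallow: /private' }.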

    /**
     * Check all tested urls and see if they are blocked by any rule in the robots file
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    var regex = new RegExp(this.robot_rules[r]['rule'], 'g');
                    if (regex.test('/' + url)) {
                        var link = crawler.painter.create_link(url, url),
                            status = crawler.painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler.painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },
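
    // For example, with the hypothetical rule '^\\/private' from the trace
    // above and a tested url of 'private/index.html' (the '/' + url above
    // suggests tested urls are stored without a leading slash), the regex
    // matches '/private/index.html' and the page is reported as blocked
    // for 'ALL BOTS'.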

    /**
     * Parse the content of the sitemap file
     *
     * @param {string} result
     * @returns {undefined}
     */
    parse_sitemap_file: function(result){
        crawler.sitemap = [];
        var ruleset = $($(result).filter('urlset')[0]);
        $.each(ruleset.children(), function() {
            crawler.sitemap.push( crawler.sanitize($(this).find('loc')[0].innerHTML) );
        });

        $('#sitemap-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle"> </span>');

        return undefined;
    },
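
    // For a hypothetical sitemap such as:
    //
    //   <urlset>
    //     <url><loc>http://example.com/about</loc></url>
    //     <url><loc>http://example.com/contact</loc></url>
    //   </urlset>
    //
    // the loop above pushes the sanitized text of each <loc> node onto
    // crawler.sitemap, one entry per <url> child.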

    /**
     * Test the urls in the sitemap
     *
     * @returns {undefined}
     */
    test_sitemap: function(){
        var sitemap = crawler.sitemap;

        for(var u in sitemap){
            var link = crawler.painter.create_link(sitemap[u], sitemap[u]);

            if( crawler.failed.indexOf(sitemap[u]) >= 0 ) {
                var status = crawler.painter.create_status('error', 'Page found in sitemap but is broken');
                crawler.painter.add_row('sitemap', [link, status]);
                continue;
            }

            if( crawler.tested.indexOf(sitemap[u]) < 0 ){
                var status = crawler.painter.create_status('warning', 'Page found in sitemap but not found by crawler');
                crawler.painter.add_row('sitemap', [link, status]);
                continue;
            }

            if( !crawler.linked_from.hasOwnProperty(sitemap[u]) ){
                var status = crawler.painter.create_status('info', 'Page found in sitemap but has no links on the site');
                crawler.painter.add_row('sitemap', [link, status]);
            }
        }

        return undefined;
    },

    /**
     * Set up an ajax call to fetch a url
     *
     * @param {string} url
     * @param {function} callback
     * @param {function} failed_callback
     *
     * @returns {undefined}
     */
    get_file_contents: function(url, callback, failed_callback){
        $.ajax({
            // encode the url so query characters in it survive the proxy round trip
            'url': crawler.get_proxy('/seotest/getPage?u='+encodeURIComponent(url)+'&agent='+crawler.agent)
        }).done(callback).fail(failed_callback);
        return undefined;
    }
};
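
// Example usage (the url is invented): fetch a file through the proxy and
// hand its body to a callback.
//
//   crawler_file_tester.get_file_contents(
//       'http://example.com/robots.txt',
//       function(body){ console.log(body.length); },
//       function(){ console.log('fetch failed'); }
//   );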

// Register the tests
crawler.event_handler.on('BEFORE_INIT', function(){
    crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);
    crawler.painter.set_type('blocked_pages', 'default');
    crawler.regiser_test('sitemap', 'SITEMAP', ['URL', 'Status'], false);
    crawler.painter.set_type('sitemap', 'default');
});

// Start up the file testers
crawler.event_handler.on('AFTER_INIT', function(){
    crawler_file_tester.get_file_contents(
        crawler.robots_url,
        crawler_file_tester.parse_robots_file,
        function(){ $('#robots-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle"> </span>'); }
    );
    crawler_file_tester.get_file_contents(
        crawler.sitemap_url,
        crawler_file_tester.parse_sitemap_file,
        function(){ $('#sitemap-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle"> </span>'); }
    );
});

// Test for blocked pages when the crawler finishes
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler_file_tester.test_blocked_pages();
    crawler_file_tester.test_sitemap();
});

This check looks for variables that are declared on multiple lines. There may be several reasons for this.
In the simplest case the variable name was reused by mistake, which can lead to bugs that are very hard to locate.
In this file, for example, 'status' is declared on three separate lines inside test_sitemap.
If you want to reuse a variable for another purpose, consider declaring it at or near the top of your function and simply assigning to it afterwards, so it is only declared once.
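
One way to quiet the warning in test_sitemap is to hoist the declarations. The sketch below (one possible fix, not the only one) declares 'link' and 'status' once and only assigns to them in each branch; the rows produced are unchanged:

    test_sitemap: function(){
        var sitemap = crawler.sitemap;
        var link, status;

        for(var u in sitemap){
            link = crawler.painter.create_link(sitemap[u], sitemap[u]);

            if( crawler.failed.indexOf(sitemap[u]) >= 0 ) {
                status = crawler.painter.create_status('error', 'Page found in sitemap but is broken');
            }else if( crawler.tested.indexOf(sitemap[u]) < 0 ){
                status = crawler.painter.create_status('warning', 'Page found in sitemap but not found by crawler');
            }else if( !crawler.linked_from.hasOwnProperty(sitemap[u]) ){
                status = crawler.painter.create_status('info', 'Page found in sitemap but has no links on the site');
            }else{
                continue; // page is fine, nothing to report
            }

            crawler.painter.add_row('sitemap', [link, status]);
        }

        return undefined;
    },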