Completed
Push — master ( 090c72...ff33da )
by Dylan
04:20 queued 01:54

crawler_file_tester.test_sitemap   B

Complexity

Conditions 5
Paths 5

Size

Total Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric   Value
cc       5
nc       5
nop      0
dl       0
loc      26
rs       8.439
c        0
b        0
f        0
  1  const crawler_file_tester = {
  2
  3      robot_rules: [],
  4
  5      /**
  6       * Parse the content of the robots file
  7       *
  8       * @param {*} result
  9       * @throws {Exception}
 10       */
 11      parse_robots_file: function(result){
 12          var rules = result.split("\n");
 13          $('#robots-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle">&nbsp;</span>');
 14
 15          var agent = '*';
 16          for(var r in rules){
 17              if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
 18                  continue;
 19              }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
 20                  agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
 21              }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
 22                  var rule =
 23                      '^'+rules[r]
 24                      .replace(/disallow:/gi, '') // remove disallow
 25                      .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // remove white space
 26                      .replace('?', '\\?') // escape query string start
 27                      .replace('|', '\\|') // escape pipe
 28                      .replace('/', '\\/') // escape slashes
 29                      .replace(/^\^\^/g, '^') // If it already had a caret remove it
 30                      .replace(/^(\*)/g, '(.*?)'); // Replace star with match anything modifier
 31                  crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
 32              }else{
 33                  console.log(rules[r]);
 34                  throw "Found a rule which we don't understand. Report it to the developer";
 35              }
 36          }
 37      },
 38
 39      /**
 40       * Check all tested urls and see if they are blocked by any rule in the robots file
 41       *
 42       * @returns {undefined}
 43       */
 44      test_blocked_pages: function(){
 45          for(var t in crawler.tested){
 46              var url = crawler.tested[t];
 47
 48              if( crawler.linked_from.hasOwnProperty(url) ) {
 49                  for (var r in this.robot_rules) {
 50                      var regex = new RegExp(this.robot_rules[r]['rule'], 'g');
 51                      if (regex.test('/' + url)) {
 52                          var link    = crawler.painter.create_link(url, url),
 53                              status  = crawler.painter.create_status('error', 'Page has links and is blocked in robots'),
 54                              agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
 55                          crawler.painter.add_row(
 56                              'blocked_pages',
 57                              [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
 58                      }
 59                  }
 60              }
 61          }
 62
 63          return undefined;
 64      },
 65
 66      /**
 67       * Parse the content of the sitemap file
 68       *
 69       * @returns undefined
 70       */
 71      parse_sitemap_file: function(result){
 72          crawler.sitemap = [];
 73          var ruleset = $($(result).filter('urlset')[0]);
 74          $.each(ruleset.children(), function() {
 75              crawler.sitemap.push( crawler.sanitize($(this).find('loc')[0].innerHTML) );
 76          });
 77
 78          $('#sitemap-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle">&nbsp;</span>');
 79
 80          return undefined;
 81      },
 82
 83      /**
 84       * Test the urls in the sitemap
 85       *
 86       * @returns {undefined}
 87       */
 88      test_sitemap: function(){
 89          var sitemap = crawler.sitemap;
 90
 91          for(var u in sitemap){
 92              var link = crawler.painter.create_link(sitemap[u], sitemap[u]);
 93
 94              if( crawler.failed.indexOf(sitemap[u]) >= 0 ) {
 95                  var status = crawler.painter.create_status('error', 'Page found in sitemap but is broken');
 96                  crawler.painter.add_row('sitemap', [link, status]);
 97                  continue;
 98              }
 99
100              if( crawler.tested.indexOf(sitemap[u]) < 0 ){
101                  var status = crawler.painter.create_status('warning', 'Page found in sitemap but not found by crawler');
Issue (Comprehensibility / Naming Best Practice): The variable status already seems to be declared on line 95. Consider using another variable name or omitting the var keyword.

This check looks for variables that are declared on multiple lines. In the simplest case the variable name was reused by mistake, which can lead to bugs that are very hard to locate. If you want to reuse a variable for another purpose, consider declaring it at or near the top of your function and just assigning to it subsequently, so it is always declared.
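Purely as an illustration of what this check suggests (not part of the reviewed source), a minimal sketch of test_sitemap with status declared once per loop iteration and assigned in each branch; the same shape also resolves the identical finding on the 'info' branch further down. It assumes the crawler and crawler.painter objects used elsewhere in this file:

    test_sitemap: function(){
        var sitemap = crawler.sitemap;

        for(var u in sitemap){
            var link   = crawler.painter.create_link(sitemap[u], sitemap[u]);
            var status = null; // single declaration; assigned below

            if( crawler.failed.indexOf(sitemap[u]) >= 0 ) {
                status = crawler.painter.create_status('error', 'Page found in sitemap but is broken');
            }else if( crawler.tested.indexOf(sitemap[u]) < 0 ){
                status = crawler.painter.create_status('warning', 'Page found in sitemap but not found by crawler');
            }else if( !crawler.linked_from.hasOwnProperty(sitemap[u]) ){
                status = crawler.painter.create_status('info', 'Page found in sitemap but has no links on the site');
            }

            // Only add a row when one of the branches produced a status
            if( status !== null ){
                crawler.painter.add_row('sitemap', [link, status]);
            }
        }

        return undefined;
    },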
102                  crawler.painter.add_row('sitemap', [link, status]);
103                  continue;
104              }
105
106              if( !crawler.linked_from.hasOwnProperty(sitemap[u]) ){
107                  var status = crawler.painter.create_status('info', 'Page found in sitemap but has no links on the site');
Issue (Comprehensibility / Naming Best Practice): The variable status already seems to be declared on line 95. Consider using another variable name or omitting the var keyword (same check as above).

108                  crawler.painter.add_row('sitemap', [link, status]);
109              }
110          }
111
112          return undefined;
113      },
114
115      /**
116       * Set up an ajax call to fetch a url
117       *
118       * @param {string} url
119       * @param {function} callback
120       * @param {function} failed_callback
121       *
122       * @returns {undefined}
123       */
124      get_file_contents: function(url, callback, failed_callback){
125          var t = $.ajax({
Issue (Unused Code): The variable t seems to be never used. Consider removing it.
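Again purely as an illustration of the suggested cleanup (not part of the reviewed source), the same call can simply drop the unused binding, since the jqXHR object returned by $.ajax is never used again:

    get_file_contents: function(url, callback, failed_callback){
        // No variable needed: only the done/fail callbacks matter here
        $.ajax({
            'url': crawler.get_proxy('/seotest/getPage?u='+url+'&agent='+crawler.agent)
        }).done(callback).fail(failed_callback);
        return undefined;
    }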
126              'url': crawler.get_proxy('/seotest/getPage?u='+url+'&agent='+crawler.agent)
127          }).done(callback).fail(failed_callback);
128          return undefined;
129      }
130  };
131
132  // Register the tests
133  crawler.event_handler.on('BEFORE_INIT', function(){
134      crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);
135      crawler.painter.set_type('blocked_pages', 'default');
136      crawler.regiser_test('sitemap', 'SITEMAP', ['URL', 'Status'], false);
137      crawler.painter.set_type('sitemap', 'default');
138  });
139
140  // Start up the file testers
141  crawler.event_handler.on('AFTER_INIT', function(){
142      crawler_file_tester.get_file_contents(
143          crawler.robots_url,
144          crawler_file_tester.parse_robots_file,
145          function(){ $('#robots-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle">&nbsp;</span>'); }
146      );
147      crawler_file_tester.get_file_contents(
148          crawler.sitemap_url,
149          crawler_file_tester.parse_sitemap_file,
150          function(){ $('#sitemap-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle">&nbsp;</span>'); }
151      );
152  });
153
154  // Test blocked pages and the sitemap once the crawler finishes
155  crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
156      crawler_file_tester.test_blocked_pages();
157      crawler_file_tester.test_sitemap();
158  });
159
160