Completed
Push — develop ( 321035...7852a2 ) by Dylan
created 02:53

crawler_file_tester.init_robots_tester (Rating: A)

Complexity

Conditions 1
Paths 1

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric   Value
cc       1
c        0
b        0
f        0
nc       1
nop      0
dl       0
loc      14
rs       9.4285

2 Functions

Rating   Name   Duplication   Size   Complexity
A               0             5      2
A               0             3      1
const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {*} result
     * @throws {Exception}
     */
    parse_robots_file: function(result){
        var rules   = result.split("\n"),
            status  = crawler_painter.create_status('success', 'Robots file loaded');
        crawler_painter.add_row('file_tests', [ status, result.replace(/(?:\r\n|\r|\n)/g, '<br />') ]);

        var agent = '*';
        for(var r in rules){
            if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
                continue;
            }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
                var rule =
                    '^'+rules[r]
                    .replace(/disallow:/gi, '') // remove disallow
                    .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // remove white space
                    .replace('?', '\\?') // escape query string start
                    .replace('|', '\\|') // escape pipe
                    .replace('/', '\\/') // escape slashes
                    .replace(/^\^\^/g, '^') // If it already had a caret remove it
                    .replace(/^(\*)/g, '(.*?)'); // Replace star with match anything modifier
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
            }else{
                console.log(rules[r]);
                throw "Found a rule which we don't understand. Report it to the developer";
            }
        }
    },
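    // Worked example (illustrative, not part of the committed file):
    // feeding parse_robots_file the two lines
    //
    //     User-agent: *
    //     Disallow: /private
    //
    // leaves agent as '*' and pushes
    //
    //     { 'rule': '^\/private', 'agent': '*', 'original': 'Disallow: /private' }
    //
    // onto robot_rules: the slash is escaped and a caret is prepended,
    // so the pattern only matches paths that start with /private.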
    /**
     * Check if the given url is blocked by the robot rules we have
     *
     * @param {string} url
     * @returns {boolean|string}
     */
    is_blocked_in: function(url){
Unused Code introduced by this push:
The parameter url is not used and could be removed.

This check looks for parameters in functions that are not used in the function body and are not followed by other parameters which are used inside the function.

        return false;
    },
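    // The review flags url as unused: as committed, the function is a
    // stub that always returns false. A possible completion (a sketch
    // only, not the committed code) could reuse the rules collected by
    // parse_robots_file and return the matching agent, which would
    // also satisfy the documented {boolean|string} return type:
    //
    //     is_blocked_in: function(url){
    //         for(var r in this.robot_rules){
    //             if( new RegExp(this.robot_rules[r]['rule']).test('/' + url) ){
    //                 return this.robot_rules[r]['agent'];
    //             }
    //         }
    //         return false;
    //     },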
    /**
     * Check all tested urls and see if they are blocked by any rule in the robots file
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    var regex = new RegExp(this.robot_rules[r]['rule'], 'g');
                    if (regex.test('/' + url)) {
                        var link    = crawler_painter.create_link(url, url),
                            status  = crawler_painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler_painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },
    // Load a file over AJAX and hand the response to the done/fail callbacks
    get_file_contents: function(url, type, callback, failed_callback){
        $.ajax({ 'url' : url, 'dataType' : type }).done(callback).fail(failed_callback);
    },
    /**
     * Start testing the robots page
     */
    init_robots_tester: function(){
        crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);
        crawler_painter.set_type('blocked_pages', 'success');

        this.get_file_contents( crawler.robots_url, 'text', this.parse_robots_file, function(){
            crawler_painter.add_status_row('file_tests', 'error', 'Failed to load robots file');
        });

        crawler.on('CRAWL_FINISHED', function(){
           if( crawler.que.length < 1 ){
               crawler_file_tester.test_blocked_pages();
           }
        });
    },

};
// Start up the file tester
crawler.on('BEFORE_INIT', function(){
    crawler.regiser_test('file_tests', 'FILE TESTS', ['Status', 'Content'], false);
    crawler_file_tester.init_robots_tester();
    //crawler_file_tester.init_sitemap_tester();
});
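For reference, the module can be exercised outside the crawler by stubbing the handful of crawler and crawler_painter members it touches. The harness below is hypothetical (only the member names are taken from the code above; none of it exists in the repository), and the stubs would need to be defined before the file above is loaded, since crawler.on('BEFORE_INIT', ...) runs at load time:

// Minimal stand-alone harness (illustrative stubs, not repository code).
var crawler = {
    on: function(event, handler){},      // event hooks become no-ops
    regiser_test: function(){},          // [sic] spelling matches the code above
    robots_url: '/robots.txt',
    tested: [], linked_from: {}, que: []
};
var crawler_painter = {
    create_status:  function(type, text){ return text; },
    create_link:    function(url, text){ return text; },
    add_row:        function(table, cells){ console.log(table, cells); },
    add_status_row: function(table, type, text){ console.log(table, type, text); },
    set_type:       function(table, type){}
};

// Feed the parser a small robots.txt body and inspect the stored rules.
crawler_file_tester.parse_robots_file('User-agent: *\nDisallow: /private\n');
console.log(crawler_file_tester.robot_rules);
// → [ { rule: '^\/private', agent: '*', original: 'Disallow: /private' } ]

// The same pattern string drives the blocked-page check.
var regex = new RegExp(crawler_file_tester.robot_rules[0]['rule']);
console.log(regex.test('/private/page.html')); // true: reported as blocked
console.log(regex.test('/public/page.html'));  // false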