Completed
Push — develop ( 322e3f...d82e9f )
by Dylan
02:40
created

crawler.event_handler.ALL_CRAWLS_FINISHED   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 3
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
const crawler_file_tester = {

    // Parsed Disallow rules: { rule: <regex source>, agent: <user-agent>, original: <raw line> }
    robot_rules: [],

    /**
     * Parse the content of the robots.txt file into regex-based block rules.
     *
     * Recognises User-agent / Disallow / Sitemap directives. Blank lines and
     * "#" comment lines are ignored; both \n and \r\n line endings are handled.
     * Any other directive is reported as an error so it can be added later.
     *
     * @param {string} result - raw robots.txt body
     * @throws {Error} when an unrecognised directive is found
     */
    parse_robots_file: function(result){
        // Split on \r?\n: with plain split("\n") a Windows-style file leaves a
        // trailing "\r" on every line, which defeated the blank-line check.
        var lines = result.split(/\r?\n/);
        $('#robots-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle">&nbsp;</span>');

        var agent = '*';
        for(var i = 0; i < lines.length; i++){
            var line = lines[i].trim();

            // Skip blank lines, "#" comments and sitemap declarations.
            if( line.length < 1 || line.charAt(0) === '#' || line.toLowerCase().indexOf('sitemap:') >= 0 ){
                continue;
            }else if( line.toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = line.replace(/user-agent:/gi, '').trim();
            }else if( line.toLowerCase().indexOf('disallow:') >= 0 ){
                var path = line.replace(/disallow:/gi, '').trim();
                // Escape EVERY regex metacharacter (the previous code used
                // string-pattern replace, which only escaped the first '?',
                // '|' and '/' and missed '.', '+', '$', '(' etc.), then
                // restore robots.txt wildcard semantics for a leading '*'.
                var rule = '^' + path
                    .replace(/[.*+?^${}()|[\]\\\/]/g, '\\$&') // escape regex metachars
                    .replace(/^\\\*/, '(.*?)');               // leading '*' matches anything
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': lines[i] });
            }else{
                console.log(line);
                // Throw a real Error (not a string) so stack traces survive.
                throw new Error("Found a rule which we don't understand. Report it to the developer");
            }
        }
    },

    /**
     * Check all tested urls and see if they are blocked by any rule in the
     * robots file; report pages that are both linked to and blocked.
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            // Only pages something actually links to are worth reporting.
            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    // No 'g' flag: a single .test() call does not need it and
                    // the flag makes RegExp objects stateful via lastIndex.
                    var regex = new RegExp(this.robot_rules[r]['rule']);
                    if (regex.test('/' + url)) {
                        var link    = crawler.painter.create_link(url, url),
                            status  = crawler.painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler.painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },

    /**
     * Setup an ajax call to fetch a url through the crawler proxy.
     *
     * @param {string} url - target url to fetch
     * @param {function} callback - invoked with the response body on success
     * @param {function} failed_callback - invoked when the request fails
     */
    get_file_contents: function(url, callback, failed_callback){
        $.ajax({
            // encodeURIComponent: the target URL is embedded in a query string
            // and may itself contain '?', '&' or '#', which would otherwise
            // corrupt the proxy request.
            'url': crawler.get_proxy('/seotest/getPage?u='+encodeURIComponent(url)+'&agent='+crawler.agent)
        }).done(callback).fail(failed_callback);
    }
};
// Register the tests
crawler.event_handler.on('BEFORE_INIT', function(){
    var test_id = 'blocked_pages';
    var columns = ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'];
    // NB: "regiser_test" is the (misspelled) name the crawler API exposes.
    crawler.regiser_test(test_id, 'BLOCKED PAGES', columns, false);
    crawler.painter.set_type(test_id, 'default');
});
// Start up the file testers
crawler.event_handler.on('AFTER_INIT', function(){
    // On failure, flag the robots check in the UI with a red remove icon.
    var on_failure = function(){
        $('#robots-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle">&nbsp;</span>');
    };
    // Fetch robots.txt and hand the body straight to the parser.
    crawler_file_tester.get_file_contents(
        crawler.robots_url,
        crawler_file_tester.parse_robots_file,
        on_failure
    );
    //crawler_file_tester.init_sitemap_tester();
});
// Once every crawl has completed, run the blocked-pages check.
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler_file_tester.test_blocked_pages();
});