Completed
Push — develop (7852a2...d7f552)
by Dylan
created 03:02

crawler.ALL_CRAWLS_FINISHED   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3

Duplication

Lines 0
Ratio 0%

Importance

Changes 0
Metric Value
dl 0
loc 3
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {string} result
     * @throws {Error}
     */
    parse_robots_file: function(result){
        var rules = result.split("\n");
        crawler_painter.add_row('file_tests', [ crawler_painter.create_status('success', 'Robots file loaded') ]);

        var agent = '*';
        for(var r in rules){
            if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
                continue;
            }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
                var rule =
                    '^'+rules[r]
                    .replace(/disallow:/gi, '') // remove the disallow prefix
                    .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // trim white space
                    .replace('?', '\\?') // escape the query string marker
                    .replace('|', '\\|') // escape pipe
                    .replace('/', '\\/') // escape the leading slash
                    .replace(/^\^\^/g, '^') // collapse a doubled leading caret
                    .replace(/^(\*)/g, '(.*?)'); // replace a leading star with a non-greedy wildcard
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
            }else{
                console.log(rules[r]);
                throw new Error("Found a rule which we don't understand. Report it to the developer");
            }
        }
    },
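
    // Illustrative example: with the current agent left at '*', a robots line such as
    //   "Disallow: /admin"
    // is stored above as { 'rule': '^\/admin', 'agent': '*', 'original': 'Disallow: /admin' };
    // test_blocked_pages() below turns 'rule' into a RegExp and tests it against '/' + url.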

    /**
     * Check every tested URL and see if it is blocked by any rule in the robots file
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    var regex = new RegExp(this.robot_rules[r]['rule'], 'g');
                    if (regex.test('/' + url)) {
                        var link    = crawler_painter.create_link(url, url),
                            status  = crawler_painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler_painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },
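
    // Illustrative example: with the stored rule '^\/admin' from the sketch above,
    // new RegExp('^\\/admin', 'g').test('/admin/login') returns true, so that page
    // would be added to the 'blocked_pages' table.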

    /**
     * Set up an AJAX call to fetch a URL through the crawler proxy
     *
     * @param {string} url
     * @param {function} callback
     * @param {function} failed_callback
     */
    get_file_contents: function(url, callback, failed_callback){
        $.ajax({
            'url': crawler.get_proxy('/seotest/getPage?u='+encodeURIComponent(url)+'&agent='+encodeURIComponent(crawler.agent))
        }).done(callback).fail(failed_callback);
    },
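
    // Illustrative example: get_file_contents('https://example.com/robots.txt', ok, fail)
    // requests the proxied endpoint
    //   /seotest/getPage?u=https%3A%2F%2Fexample.com%2Frobots.txt&agent=<crawler.agent>
    // and hands the response body to ok(), or calls fail() if the request errors.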

    /**
     * Start testing the robots page
     */
    init_robots_tester: function(){


    },

};

// Register the tests
crawler.on('BEFORE_INIT', function(){
    crawler.regiser_test('file_tests', 'FILE TESTS', ['Status'], false);
    crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);

    crawler_painter.set_type('blocked_pages', 'success');
});
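
// Note: the five columns registered above for 'blocked_pages' line up with the five values
// pushed per row in test_blocked_pages(): link, linked-from list, agent, original rule and status.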

// Start up the file testers
crawler.on('AFTER_INIT', function(){
    crawler_file_tester.get_file_contents(
        crawler.robots_url,
        crawler_file_tester.parse_robots_file,
        function(){ crawler_painter.add_status_row('file_tests', 'error', 'Failed to load robots file'); }
    );
    //crawler_file_tester.init_sitemap_tester();
});

// Test for blocked pages when the crawler finishes
crawler.on('ALL_CRAWLS_FINISHED', function(){
    crawler_file_tester.test_blocked_pages();
});