Completed
Push — develop ( 321035...7852a2 ) by Dylan
created 02:53

crawler_file_tester.init_robots_tester (Rating: A)

Complexity

Conditions 1
Paths 1

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric   Value
cc       1
c        0
b        0
f        0
nc       1
nop      0
dl       0
loc      14
rs       9.4285

2 Functions

Rating   Name   Duplication   Size   Complexity
A               0             5      2
A               0             3      1
const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the content of the robots file
     *
     * @param {*} result
     * @throws {Exception}
     */
    parse_robots_file: function(result){
        var rules   = result.split("\n"),
            status  = crawler_painter.create_status('success', 'Robots file loaded');
        crawler_painter.add_row('file_tests', [ status, result.replace(/(?:\r\n|\r|\n)/g, '<br />') ]);

        var agent = '*';
        for(var r in rules){
            if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
                continue;
            }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
                var rule =
                    '^'+rules[r]
                    .replace(/disallow:/gi, '') // remove disallow
                    .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // remove white space
                    .replace('?', '\\?') // escape query string start
                    .replace('|', '\\|') // escape pipe
                    .replace('/', '\\/') // escape slashes
                    .replace(/^\^\^/g, '^') // If it already had a caret remove it
                    .replace(/^(\*)/g, '(.*?)'); // Replace star with match anything modifier
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
            }else{
                console.log(rules[r]);
                throw "Found a rule which we don't understand. Report it to the developer";
            }
        }
    },
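    // Worked example (illustrative, not part of the committed file):
    // feeding parse_robots_file the two lines
    //
    //     User-agent: *
    //     Disallow: /private
    //
    // leaves agent as '*' and pushes
    //
    //     { 'rule': '^\/private', 'agent': '*', 'original': 'Disallow: /private' }
    //
    // onto robot_rules: the slash is escaped and a caret is prepended,
    // so the pattern only matches paths that start with /private.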
    /**
     * Check if the given url is blocked by the robot rules we have
     *
     * @param {string} url
     * @returns {boolean|string}
     */
    is_blocked_in: function(url){
Unused Code introduced by this push:
The parameter url is not used and could be removed.

This check looks for parameters in functions that are not used in the function body and are not followed by other parameters which are used inside the function.

        return false;
    },
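    // The review flags url as unused: as committed, the function is a
    // stub that always returns false. A possible completion (a sketch
    // only, not the committed code) could reuse the rules collected by
    // parse_robots_file and return the matching agent, which would
    // also satisfy the documented {boolean|string} return type:
    //
    //     is_blocked_in: function(url){
    //         for(var r in this.robot_rules){
    //             if( new RegExp(this.robot_rules[r]['rule']).test('/' + url) ){
    //                 return this.robot_rules[r]['agent'];
    //             }
    //         }
    //         return false;
    //     },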
    /**
     * Check all tested urls and see if they are blocked by any rule in the robots file
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    var regex = new RegExp(this.robot_rules[r]['rule'], 'g');
                    if (regex.test('/' + url)) {
                        var link    = crawler_painter.create_link(url, url),
                            status  = crawler_painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        crawler_painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },
    // Load a file over AJAX and hand the response to the done/fail callbacks
    get_file_contents: function(url, type, callback, failed_callback){
        $.ajax({ 'url' : url, 'dataType' : type }).done(callback).fail(failed_callback);
    },
    /**
     * Start testing the robots page
     */
    init_robots_tester: function(){
        crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);
        crawler_painter.set_type('blocked_pages', 'success');

        this.get_file_contents( crawler.robots_url, 'text', this.parse_robots_file, function(){
            crawler_painter.add_status_row('file_tests', 'error', 'Failed to load robots file');
        });

        crawler.on('CRAWL_FINISHED', function(){
           if( crawler.que.length < 1 ){
               crawler_file_tester.test_blocked_pages();
           }
        });
    },

};
// Start up the file tester
crawler.on('BEFORE_INIT', function(){
    crawler.regiser_test('file_tests', 'FILE TESTS', ['Status', 'Content'], false);
    crawler_file_tester.init_robots_tester();
    //crawler_file_tester.init_sitemap_tester();
});
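For reference, the module can be exercised outside the crawler by stubbing the handful of crawler and crawler_painter members it touches. The harness below is hypothetical (only the member names are taken from the code above; none of it exists in the repository), and the stubs would need to be defined before the file above is loaded, since crawler.on('BEFORE_INIT', ...) runs at load time:

// Minimal stand-alone harness (illustrative stubs, not repository code).
var crawler = {
    on: function(event, handler){},      // event hooks become no-ops
    regiser_test: function(){},          // [sic] spelling matches the code above
    robots_url: '/robots.txt',
    tested: [], linked_from: {}, que: []
};
var crawler_painter = {
    create_status:  function(type, text){ return text; },
    create_link:    function(url, text){ return text; },
    add_row:        function(table, cells){ console.log(table, cells); },
    add_status_row: function(table, type, text){ console.log(table, type, text); },
    set_type:       function(table, type){}
};

// Feed the parser a small robots.txt body and inspect the stored rules.
crawler_file_tester.parse_robots_file('User-agent: *\nDisallow: /private\n');
console.log(crawler_file_tester.robot_rules);
// → [ { rule: '^\/private', agent: '*', original: 'Disallow: /private' } ]

// The same pattern string drives the blocked-page check.
var regex = new RegExp(crawler_file_tester.robot_rules[0]['rule']);
console.log(regex.test('/private/page.html')); // true: reported as blocked
console.log(regex.test('/public/page.html'));  // false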