Code Duplication    Length = 28-30 lines in 2 locations

packages/denoiser/src/denoiser/__init__.py 2 locations

@@ 36-65 (lines=30) @@
33
34
        self.logger.info("Denoiser initialized")
35
36
    def cleanse(self, filename, is_csv=False):
        """Cleanse a file given its name

        The file is wrapped in a `Text` object, parsed either as CSV or as
        plain text, and then handed to the inline, indicator and learning
        models in that order. Each model is loaded with the text data before
        being asked to correct it.

        Parameters:
            filename (str): Path of the file to cleanse
            is_csv (bool): Specifies if the file is a CSV

        Returns:
            `Text`: Text data
        """
        # Let the logger do the formatting: no string is built when DEBUG is
        # disabled, and a non-str path no longer raises on concatenation
        self.logger.debug("Cleaning %s...", filename)
        text_data = Text(filename)

        # Parse the proper format
        if is_csv:
            text_data.read_csv()
        else:
            text_data.read_txt()

        # Clean the text: every model gets the same load-then-correct pass,
        # and the order of the models is significant
        models = (
            self.inline_model,
            self.indicator_model,
            self.learning_model,
        )
        for model in models:
            model.load(text_data)
            model.correct(text_data)

        return text_data
66
67
    def train(self, dataset):
68
        """ Train the denoiser with a set of files
@@ 67-94 (lines=28) @@
64
65
        return text_data
66
67
    def train(self, dataset):
        """Train the denoiser with a set of files

        Every file is wrapped in a `Text` object and read as CSV, then run
        through the inline and indicator models (load, then correct). The
        resulting list of text data is finally passed to the learning
        model's `train` method.

        Parameters:
            dataset (list): List of files
        """
        self.logger.debug("Training denoiser...")

        # Generate datastructures from dataset (all of them are created
        # before the first file is read)
        text_dataset = [Text(f) for f in dataset]

        # Preprocess each file with the inline and indicator models
        for text_data in text_dataset:
            # Logger-side formatting: nothing is built when DEBUG is off and
            # a non-str filename no longer raises on concatenation
            self.logger.debug("Preprocessing %s", text_data.filename)
            text_data.read_csv()

            for model in (self.inline_model, self.indicator_model):
                model.load(text_data)
                model.correct(text_data)

        # Train the learning model on the preprocessed dataset
        self.logger.debug("Training learning model...")
        self.learning_model.train(text_dataset)

        self.logger.info("Machine learning model trained")
95
96
    def generate_models(self, dataset):
97
        """ Generates the datastructures given a set of files