Denoiser.train()   B
last analyzed

Complexity

Conditions 3

Size

Total Lines 28

Duplication

Lines 28
Ratio 100 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 3
c 1
b 0
f 1
dl 28
loc 28
rs 8.8571
"""Package containing all the functions and classes needed to clean a file.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
16
from __future__ import division
17
import logging
18
from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel
19
from denoiser.text import Text
20
21
22
class Denoiser(object):
    """Denoiser object, able to clean a file and train related models

    Holds three models built from the same application configuration (an
    inline model, an indicator model and a machine learning model) and
    drives them over `Text` objects.
    """

    def __init__(self, app_config):
        """Store the configuration and build the three models from it

        Parameters:
            app_config: Application configuration, kept as `self.config` and
                passed unchanged to each model constructor
        """
        self.config = app_config
        self.logger = logging.getLogger('local')

        self.inline_model = InlineModel(self.config)
        self.indicator_model = IndicatorModel(self.config)
        self.learning_model = MachineLearningModel(self.config)

        self.logger.info("Denoiser initialized")

    def _apply_base_models(self, text_data):
        """Run the inline model, then the indicator model, on a parsed text

        Shared by `cleanse` and `train` so both apply the exact same
        load/correct sequence.

        Parameters:
            text_data (`Text`): Text data that has already been read
        """
        self.inline_model.load(text_data)
        self.inline_model.correct(text_data)

        self.indicator_model.load(text_data)
        self.indicator_model.correct(text_data)

    def cleanse(self, filename, is_csv=False):
        """Cleanse a file given its name

        Parameters:
            filename (str): Path of the file to cleanse
            is_csv (bool): Specifies if the file is a CSV

        Returns:
            `Text`: Text data
        """
        # Lazy logger arguments: no formatting cost when DEBUG is disabled and
        # no TypeError when the path is not a plain string.
        self.logger.debug("Cleaning %s...", filename)
        text_data = Text(filename)

        # Parse the proper format
        if is_csv:
            text_data.read_csv()
        else:
            text_data.read_txt()

        # Clean the text
        self._apply_base_models(text_data)

        self.learning_model.load(text_data)
        self.learning_model.correct(text_data)

        return text_data

    def train(self, dataset):
        """Train the denoiser with a set of files

        Every file is read as CSV and passed through the inline and indicator
        models before the whole dataset is handed to the learning model.

        Parameters:
            dataset (list): List of files
        """
        self.logger.debug("Training denoiser...")

        # Generate datastructures from dataset
        text_dataset = [Text(f) for f in dataset]

        # Create datastructures for the whole dataset
        for text_data in text_dataset:
            self.logger.debug("Preprocessing %s", text_data.filename)
            text_data.read_csv()

            self._apply_base_models(text_data)

        # Train the learning model on the preprocessed dataset
        self.logger.debug("Training learning model...")
        self.learning_model.train(text_dataset)

        self.logger.info("Machine learning model trained")

    def generate_models(self, dataset):
        """Generates the datastructures given a set of files

        Every file is read as CSV and loaded into the inline model only.

        Parameters:
            dataset (list): List of files

        Returns:
            int: Always 0
        """
        self.logger.debug("Generating datastructures...")
        text_dataset = [Text(f) for f in dataset]

        for text_data in text_dataset:
            self.logger.debug("Processing %s...", text_data.filename)

            text_data.read_csv()
            self.inline_model.load(text_data)

        self.logger.info("Datastructure generated")
        return 0
113
114