1
|
|
|
"""Package containing all the functions and classes needed to clean a file. |
2
|
|
|
|
3
|
|
|
.. Authors: |
4
|
|
|
Philippe Dessauw |
5
|
|
|
[email protected] |
6
|
|
|
|
7
|
|
|
.. Sponsor: |
8
|
|
|
Alden Dima |
9
|
|
|
[email protected] |
10
|
|
|
Information Systems Group |
11
|
|
|
Software and Systems Division |
12
|
|
|
Information Technology Laboratory |
13
|
|
|
National Institute of Standards and Technology |
14
|
|
|
http://www.nist.gov/itl/ssd/is |
15
|
|
|
""" |
16
|
|
|
from __future__ import division |
17
|
|
|
import logging |
18
|
|
|
from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel |
19
|
|
|
from denoiser.text import Text |
20
|
|
|
|
21
|
|
|
|
22
|
|
|
class Denoiser(object):
    """Denoiser object, able to clean a file and train related models.

    The constructor stores the application configuration, a logger named
    ``'local'`` and one instance each of `InlineModel`, `IndicatorModel` and
    `MachineLearningModel` (exposed as ``inline_model``, ``indicator_model``
    and ``learning_model``).
    """

    def __init__(self, app_config):
        """Create the denoiser and its three models.

        Parameters:
            app_config: Application configuration, passed unchanged to the
                constructor of each model.
        """
        self.config = app_config
        self.logger = logging.getLogger('local')

        self.inline_model = InlineModel(self.config)
        self.indicator_model = IndicatorModel(self.config)
        self.learning_model = MachineLearningModel(self.config)

        self.logger.info("Denoiser initialized")

    def _run_inline_and_indicator(self, text_data):
        """Run the inline model, then the indicator model, on one text.

        Both `cleanse` and `train` need this exact sequence, so it lives in a
        single place. Each model is loaded with the text before being asked
        to correct it, and the inline model always runs first.

        Parameters:
            text_data (`Text`): Text that has already been read from disk.
        """
        self.inline_model.load(text_data)
        self.inline_model.correct(text_data)

        self.indicator_model.load(text_data)
        self.indicator_model.correct(text_data)

    def cleanse(self, filename, is_csv=False):
        """Cleanse a file given its name.

        Parameters:
            filename (str): Path of the file to cleanse
            is_csv (bool): Specifies if the file is a CSV

        Returns:
            `Text`: Text data
        """
        # Lazy %-style arguments: nothing is formatted unless the debug level
        # is enabled, and a non-str filename no longer breaks the log call.
        self.logger.debug("Cleaning %s...", filename)
        text_data = Text(filename)

        # Parse the proper format
        if is_csv:
            text_data.read_csv()
        else:
            text_data.read_txt()

        # Clean the text: inline and indicator models first, then the
        # machine learning model.
        self._run_inline_and_indicator(text_data)

        self.learning_model.load(text_data)
        self.learning_model.correct(text_data)

        return text_data

    def train(self, dataset):
        """Train the denoiser with a set of files.

        Every file is read as CSV and passed through the inline and indicator
        models; the resulting list of texts is then handed to the machine
        learning model's ``train`` method.

        Parameters:
            dataset (list): List of files
        """
        self.logger.debug("Training denoiser...")

        # Generate datastructures from dataset
        text_dataset = [Text(f) for f in dataset]

        # Create datastructures for the whole dataset
        for text_data in text_dataset:
            self.logger.debug("Preprocessing %s", text_data.filename)
            text_data.read_csv()

            self._run_inline_and_indicator(text_data)

        # Train the machine learning model on the preprocessed texts
        self.logger.debug("Training learning model...")
        self.learning_model.train(text_dataset)

        self.logger.info("Machine learning model trained")

    def generate_models(self, dataset):
        """Generate the datastructures given a set of files.

        Every file is read as CSV and loaded into the inline model; no
        correction is applied here.

        Parameters:
            dataset (list): List of files

        Returns:
            int: Always 0
        """
        self.logger.debug("Generating datastructures...")
        text_dataset = [Text(f) for f in dataset]

        for text_data in text_dataset:
            self.logger.debug("Processing %s...", text_data.filename)

            text_data.read_csv()
            self.inline_model.load(text_data)

        self.logger.info("Datastructure generated")
        return 0
113
|
|
|
|
114
|
|
|
|