"""Package containing all the functions and classes needed to clean a file.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
|
16
|
|
|
from __future__ import division |
|
17
|
|
|
import logging |
|
18
|
|
|
from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel |
|
19
|
|
|
from denoiser.text import Text |
|
20
|
|
|
|
|
21
|
|
|
|
|
22
|
|
|
class Denoiser(object):
    """Denoiser object, able to clean a file and train related models.

    Three models are built from the same application configuration and are
    chained on every text: an inline model, an indicator model and a machine
    learning model.
    """

    def __init__(self, app_config):
        """Create the logger and the three models.

        Parameters:
            app_config: Application configuration, stored on the instance and
                handed unchanged to each model constructor
        """
        self.config = app_config
        self.logger = logging.getLogger('local')

        self.inline_model = InlineModel(self.config)
        self.indicator_model = IndicatorModel(self.config)
        self.learning_model = MachineLearningModel(self.config)

        self.logger.info("Denoiser initialized")

    def _apply_inline_and_indicator(self, text_data):
        """Run the inline model, then the indicator model, on one text.

        Shared by `cleanse` and `train`, which both need this exact sequence
        before involving the machine learning model.

        Parameters:
            text_data (`Text`): Text whose content has already been read
        """
        # Order matters: each model is loaded with the text before it is
        # asked to correct it, and the inline model runs first.
        self.inline_model.load(text_data)
        self.inline_model.correct(text_data)

        self.indicator_model.load(text_data)
        self.indicator_model.correct(text_data)

    def cleanse(self, filename, is_csv=False):
        """Cleanse a file given its name.

        Parameters:
            filename (str): Path of the file to cleanse
            is_csv (bool): Specifies if the file is a CSV

        Returns:
            `Text`: Text data
        """
        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        self.logger.debug("Cleaning %s...", filename)
        text_data = Text(filename)

        # Parse the proper format
        if is_csv:
            text_data.read_csv()
        else:
            text_data.read_txt()

        # Clean the text
        self._apply_inline_and_indicator(text_data)

        self.learning_model.load(text_data)
        self.learning_model.correct(text_data)

        return text_data

    def train(self, dataset):
        """Train the denoiser with a set of files.

        Every file is read as CSV and passed through the inline and indicator
        models before the machine learning model is trained on the whole set.

        Parameters:
            dataset (list): List of files
        """
        self.logger.debug("Training denoiser...")

        # Generate datastructures from dataset
        text_dataset = [Text(f) for f in dataset]

        # Create datastructures for the whole dataset
        for text_data in text_dataset:
            self.logger.debug("Preprocessing %s", text_data.filename)
            text_data.read_csv()

            self._apply_inline_and_indicator(text_data)

        # Train the learning model on the preprocessed dataset
        self.logger.debug("Training learning model...")
        self.learning_model.train(text_dataset)

        self.logger.info("Machine learning model trained")

    def generate_models(self, dataset):
        """Generate the datastructures given a set of files.

        Each file is read as CSV and loaded into the inline model only.

        Parameters:
            dataset (list): List of files

        Returns:
            int: Always 0
        """
        self.logger.debug("Generating datastructures...")
        text_dataset = [Text(f) for f in dataset]

        for text_data in text_dataset:
            self.logger.debug("Processing %s...", text_data.filename)

            text_data.read_csv()
            self.inline_model.load(text_data)

        self.logger.info("Datastructure generated")
        return 0
|
113
|
|
|
|
|
114
|
|
|
|