Completed
Branch master (e214b7)
by Philippe
36s
created

MachineLearningAlgorithm   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 76
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 76
rs 10
wmc 9

6 Methods

Rating   Name   Duplication   Size   Complexity  
A __init__() 0 9 1
A set_training_set() 0 14 2
A classify() 0 10 1
A compute_error() 0 21 3
A train() 0 5 1
A set_classifier() 0 8 1
1
"""Package containing all the machine learning functions and objects
2
3
.. Authors:
4
    Philippe Dessauw
5
    [email protected]
6
7
.. Sponsor:
8
    Alden Dima
9
    [email protected]
10
    Information Systems Group
11
    Software and Systems Division
12
    Information Technology Laboratory
13
    National Institute of Standards and Technology
14
    http://www.nist.gov/itl/ssd/is
15
"""
16
import logging
17
from numpy import mean
18
from numpy.lib.polynomial import poly1d
19
20
21
# Module-level logger; shares the application-wide "app" logger
# (presumably configured by the host application — not visible here).
logger = logging.getLogger("app")
22
23
24
class MachineLearningAlgorithm(object):
    """Thin wrapper around an injected scikit-learn-style classifier.

    Holds a training set (parallel ``features``/``results`` lists) and
    delegates fitting and prediction to a classifier object exposing the
    usual ``fit(X, y)`` / ``predict(X)`` interface.
    """

    def __init__(self):
        # Parallel lists: training_set["features"][i] is the feature vector
        # whose expected label is training_set["results"][i].
        self.training_set = {
            "features": [],
            "results": []
        }

        # Must be injected via set_classifier() before train()/classify()/
        # compute_error() can be used.
        self.classifier = None

        logger.info("Model created (new)")

    def set_classifier(self, cls):
        """Set the classifier

        Args:
            cls (object): Classifier object
        """
        self.classifier = cls
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("%s model loaded", cls.__class__.__name__)

    def set_training_set(self, features, results):
        """Setup the training set and verify its integrity

        Args:
            features (list): Training set features
            results (list): Training set results

        Raises:
            AttributeError: If ``features`` and ``results`` have different
                lengths. (Kept as AttributeError for backward compatibility
                with existing callers, though ValueError would be more
                conventional.)
        """
        if len(features) != len(results):
            raise AttributeError("Number of features and result are different")

        self.training_set["features"] = features
        self.training_set["results"] = results

        logger.debug("Training set uploaded")

    def train(self):
        """Train the model with the given training set
        """
        self.classifier.fit(self.training_set["features"], self.training_set["results"])
        logger.debug("Model trained")

    def classify(self, features):
        """Classify features

        Args:
            features (list): Features to classify

        Returns:
            list: Results of the classification
        """
        return self.classifier.predict(features)

    def compute_error(self, features, results):
        """Compute classification error

        Args:
            features (list): Features to classify
            results (list): Expected results (assumed to be the same length
                as ``features``; entries with a negative expected value are
                treated as unlabeled and skipped)

        Returns:
            float: Classification error — squared differences scaled by 1/5,
            summed over labeled samples and divided by twice the number of
            predictions. 0.0 when there is nothing to predict.
        """
        prediction = self.classifier.predict(features)

        # Fixed: the original divided by 2*len(prediction) unconditionally,
        # raising ZeroDivisionError on an empty sample set.
        if len(prediction) == 0:
            return 0.0

        error = 0.0
        for predicted, expected in zip(prediction, results):
            # Negative expected values mark samples to ignore.
            if expected < 0:
                continue

            # 5.0 (not 5) so the scaling is true division on Python 2 as
            # well — the original truncated for integer inputs.
            error += ((predicted - expected) / 5.0) ** 2

        return error / (2 * len(prediction))
102
class MachineLearningFeatures(object):
    """Feature calculator for machine learning
    """

    def __init__(self):
        # Most recently extracted feature vector (list of floats).
        self.features = []

    def extract_features(self, line, unigrams, text_stats):
        """Extract features from a given line

        Args:
            line (Line): Line to get features from
            unigrams (Unigrams): Unigrams for the given line
            text_stats (Statistics): Statistics of the text the line is coming from

        Returns:
            list: List of the features (also stored in ``self.features``)
        """
        # Simple features: raw character-class counts of the original line.
        features = [
            float(line.stats["orig"].get_stat("lw_char")),  # lowercase chars
            float(line.stats["orig"].get_stat("up_char")),  # uppercase chars
            float(line.stats["orig"].get_stat("sp_char")),  # special chars
            float(line.stats["orig"].get_stat("nb_char")),  # numeric chars
            float(len(line.tokens)),                        # token count
        ]

        # Additional features from the cleaned line and line-level scores.
        fappend = features.append
        clean_stats = line.get_clean_stats()
        fappend(clean_stats.get_stat("lw_char"))
        fappend(clean_stats.get_stat("up_char"))
        fappend(clean_stats.get_stat("sp_char"))
        fappend(clean_stats.get_stat("nb_char"))
        fappend(line.get_line_score())
        fappend(len(line.get_orig_line()))
        fappend(len(line.get_clean_line()))

        u = unigrams

        # Average token length (0 for a line with no tokens).
        tk_len = [len(token[0]) for token in line.tokens]
        word_avg_len = mean(tk_len) if len(tk_len) > 0 else 0
        fappend(float(word_avg_len))

        # Mean unigram score of the primary token strings (tk[0]).
        t0 = [u[tk[0]] for tk in line.tokens]
        s0 = mean(t0) if len(t0) != 0 else 0
        fappend(float(s0))

        # Mean unigram score of the secondary token entries (tk[1]) —
        # presumably corrected/cleaned tokens; TODO confirm tuple semantics.
        t1 = [u[tk[1]] for tk in line.tokens if tk[1] is not None]
        s1 = mean(t1) if len(t1) != 0 else 0
        fappend(float(s1))

        # Mean unigram score over the candidate dicts (tk[2] keys), when any.
        t2 = [u[t] for tk in line.tokens if tk[2] is not None for t in tk[2].keys()]
        s2 = mean(t2) if len(t2) != 0 else 0
        fappend(float(s2))

        # Regularization: turn counts into ratios. float() so the division
        # is true division on Python 2 even for integer stats.
        orig_chars = float(sum(features[:4]))
        clean_chars = float(sum(features[5:9]))

        if orig_chars != 0:
            f = [features[i] / orig_chars for i in range(4)]
        else:
            # Fixed: the original divided unconditionally here (unlike the
            # guarded clean_chars branch below) and raised ZeroDivisionError
            # on a line without any character.
            f = [0.0, 0.0, 0.0, 0.0]

        if clean_chars != 0:
            f += [features[5] / clean_chars,
                  features[6] / clean_chars,
                  features[7] / clean_chars,
                  features[8] / clean_chars]
        else:
            f += [0, 0, 0, 0]

        f += [features[9],                                            # line score
              features[4] / text_stats.get_stat("word_avg_nb"),       # token-count ratio
              features[12] / text_stats.get_stat("word_avg_length"),  # avg token length ratio
              features[10] / text_stats.get_stat("line_avg_length"),  # original line length ratio
              features[11] / text_stats.get_stat("line_avg_length")]  # clean line length ratio

        # Unigram score ratios, relative to the primary-token score s0.
        if features[13] != 0:
            f.append(features[14] / features[13])
            f.append(features[15] / features[13])
        else:
            f.append(0)
            f.append(0)

        features = f

        # Ordering the data set. (The original comments were copy-pasted
        # "Original/Clean line average len" for every entry; these describe
        # the actual values.)
        features = [
            features[11],  # original line length ratio
            features[12],  # clean line length ratio
            features[9],   # token-count ratio
            features[10],  # average token length ratio
            features[13],  # s1/s0 unigram score ratio
            features[14],  # s2/s0 unigram score ratio
            features[0],   # lowercase ratio (original)
            features[1],   # uppercase ratio (original)
            features[2],   # special char ratio (original)
            features[3],   # numeric ratio (original)
            features[4],   # lowercase ratio (clean)
            features[5],   # uppercase ratio (clean)
            features[6],   # special char ratio (clean)
            features[7],   # numeric ratio (clean)
        ]

        # Polynomial features. With degree == 1 this is the feature vector
        # itself; higher degrees append convolution coefficients.
        degree = 1
        poly_feat = []
        p_feat = poly1d(features)

        for d in range(degree):
            if d == 0:
                # Fixed: poly1d() strips leading zero coefficients, so
                # `(p_feat ** 1).coeffs` silently shortened and misaligned
                # the vector whenever the first ordered feature was 0
                # (making `del poly_feat[5]` drop the wrong feature or
                # raise IndexError). Copy the linear term directly.
                poly_feat += list(features)
            else:
                poly_feat += (p_feat ** (d + 1)).coeffs.tolist()

        # Drop the 6th ordered feature (the s2/s0 ratio) — presumably found
        # uninformative during training; TODO confirm.
        del poly_feat[5]

        self.features = poly_feat

        return self.features