| Metric | Value |
| --- | --- |
| Conditions | 15 |
| Total Lines | 129 |
| Lines | 0 |
| Ratio | 0 % |
Small methods make your code easier to understand, particularly when combined with a good name. And if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.
Commonly applied refactorings include:

- Extract Method (a minimal sketch follows this list)

If many parameters/temporary variables are present:

- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
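To make the Extract Method idea concrete, here is a minimal sketch; the class, method, and field names are invented for illustration and do not come from this project. The comment that used to label the block becomes the name of the new method.

```python
class ReportBefore:
    """Hypothetical example: the comment hints at a method waiting to be extracted."""

    def render(self, amounts):
        total = sum(amounts)
        # print banner
        print("*" * 30)
        print("Customer report")
        print("*" * 30)
        print("Total:", total)


class ReportAfter:
    """Same behaviour after Extract Method: the comment became the method name."""

    def render(self, amounts):
        total = sum(amounts)
        self._print_banner()
        print("Total:", total)

    def _print_banner(self):
        print("*" * 30)
        print("Customer report")
        print("*" * 30)
```

After the extraction, `render()` reads as a sequence of named steps instead of one long block.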
Complex classes like src.denoiser.models.MachineLearningFeatures.extract_features() often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
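Here is a minimal Extract Class sketch, again with invented names rather than anything from this codebase: the shared `clean_` prefix marks the component worth pulling out.

```python
class LineBefore:
    """Hypothetical: the "clean_" prefix groups fields that belong together."""

    def __init__(self, text):
        self.text = text
        self.clean_text = text.strip()
        self.clean_score = 0.0
        self.clean_char_count = len(self.clean_text)


class CleanVersion:
    """Extracted component holding everything that was prefixed with "clean_"."""

    def __init__(self, text):
        self.text = text.strip()
        self.score = 0.0
        self.char_count = len(self.text)


class LineAfter:
    """After Extract Class: the line delegates to the extracted component."""

    def __init__(self, text):
        self.text = text
        self.clean = CleanVersion(text)
```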
```python
"""Package containing all the machine learning functions and objects
...
"""

# ... (imports and the rest of the module are elided in this excerpt;
# `poly1d` comes from numpy, and `mean` is presumably numpy's as well)


class MachineLearningFeatures:
    # ...

    def extract_features(self, line, unigrams, text_stats):
        """Extract features from a given line

        Args:
            line (Line): Line to get features from
            unigrams (Unigrams): Unigrams for the given line
            text_stats (Statistics): Statistics of the text the line is coming from

        Returns:
            list: List of the features
        """
        # Simple features
        features = [
            float(line.stats["orig"].get_stat("lw_char")),
            float(line.stats["orig"].get_stat("up_char")),
            float(line.stats["orig"].get_stat("sp_char")),
            float(line.stats["orig"].get_stat("nb_char")),
            float(len(line.tokens)),
        ]

        # Additional features
        fappend = features.append
        fappend(line.get_clean_stats().get_stat("lw_char"))
        fappend(line.get_clean_stats().get_stat("up_char"))
        fappend(line.get_clean_stats().get_stat("sp_char"))
        fappend(line.get_clean_stats().get_stat("nb_char"))
        fappend(line.get_line_score())
        fappend(len(line.get_orig_line()))
        fappend(len(line.get_clean_line()))

        u = unigrams

        tk_len = [len(token[0]) for token in line.tokens]
        word_avg_len = 0

        if len(tk_len) > 0:
            word_avg_len = mean(tk_len)

        fappend(float(word_avg_len))

        t0 = [u[tk[0]] for tk in line.tokens]
        s0 = 0

        if len(t0) != 0:
            s0 = mean(t0)

        fappend(float(s0))

        t1 = [u[tk[1]] for tk in line.tokens if not tk[1] is None]
        s1 = 0

        if len(t1) != 0:
            s1 = mean(t1)

        fappend(float(s1))

        t2 = [u[t] for tk in line.tokens if not tk[2] is None for t in tk[2].keys()]
        s2 = 0

        if len(t2) != 0:
            s2 = mean(t2)

        fappend(float(s2))

        # Regularization
        orig_chars = sum(features[:4])
        clean_chars = sum(features[5:9])

        f = [
            features[0] / orig_chars,
            features[1] / orig_chars,
            features[2] / orig_chars,
            features[3] / orig_chars
        ]

        if clean_chars != 0:
            f += [features[5] / clean_chars,
                  features[6] / clean_chars,
                  features[7] / clean_chars,
                  features[8] / clean_chars]
        else:
            f += [0, 0, 0, 0]

        f += [features[9],
              features[4] / text_stats.get_stat("word_avg_nb"),
              features[12] / text_stats.get_stat("word_avg_length"),
              features[10] / text_stats.get_stat("line_avg_length"),
              features[11] / text_stats.get_stat("line_avg_length")]

        if features[13] != 0:
            f.append(features[14] / features[13])
            f.append(features[15] / features[13])
        else:
            f.append(0)
            f.append(0)

        features = f

        # Ordering the data set (the line score, features[8], is not carried over)
        features = [
            features[11],  # original line length / text_stats "line_avg_length"
            features[12],  # clean line length / text_stats "line_avg_length"
            features[9],   # token count / text_stats "word_avg_nb"
            features[10],  # mean token length / text_stats "word_avg_length"
            features[13],  # unigram score ratio s1 / s0
            features[14],  # unigram score ratio s2 / s0
            features[0],   # "lw_char" ratio, original line
            features[1],   # "up_char" ratio, original line
            features[2],   # "sp_char" ratio, original line
            features[3],   # "nb_char" ratio, original line
            features[4],   # "lw_char" ratio, clean line
            features[5],   # "up_char" ratio, clean line
            features[6],   # "sp_char" ratio, clean line
            features[7],   # "nb_char" ratio, clean line
        ]

        # Polynomial features
        degree = 1
        poly_feat = []
        p_feat = poly1d(features)

        for d in xrange(degree):
            poly_feat += (p_feat ** (d+1)).coeffs.tolist()

        # drop the sixth feature (the s2 / s0 ratio)
        del poly_feat[5]

        self.features = poly_feat

        return self.features
```
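Applied to the listing above, an Extract Method pass could split the body along its own comment blocks. The helper names and signatures below are assumptions for illustration only, not code from the project; the stub bodies merely stand in for the corresponding blocks.

```python
class MachineLearningFeaturesSketch:
    """Hypothetical decomposition of extract_features(); all names are invented."""

    def extract_features(self, line, unigrams, text_stats):
        raw = self._count_features(line)                        # "Simple features"
        raw += self._clean_line_features(line)                  # "Additional features"
        raw += self._unigram_features(line, unigrams)
        normalized = self._normalize(raw, text_stats)           # "Regularization"
        ordered = self._reorder(normalized)                     # "Ordering the data set"
        self.features = self._polynomial(ordered, degree=1)     # "Polynomial features"
        return self.features

    # Each helper would absorb one commented block from the original body;
    # they are left as stubs here.
    def _count_features(self, line):
        return []

    def _clean_line_features(self, line):
        return []

    def _unigram_features(self, line, unigrams):
        return []

    def _normalize(self, features, text_stats):
        return features

    def _reorder(self, features):
        return features

    def _polynomial(self, features, degree):
        return features
```

Each helper then takes its name from the comment it absorbed, which is exactly the naming advice given at the top of this report.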