| Metric | Value |
| --- | --- |
| Conditions | 15 |
| Total Lines | 129 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 1 |
Small methods make your code easier to understand, particularly when combined with a good name. And if a method is small, finding a good name for it is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for that method's name, as the sketch below illustrates.
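A minimal sketch of this comment-driven extraction (using a hypothetical report function, not code from this project):

```python
# Before: the inline comment marks a block that wants to be a method.
def print_owing(amounts):
    # print banner
    print("*" * 30)
    print("** Customer owes **")
    print("*" * 30)
    print(sum(amounts))

# After: the block is extracted, and the comment becomes the name.
def print_banner():
    print("*" * 30)
    print("** Customer owes **")
    print("*" * 30)

def print_owing(amounts):
    print_banner()
    print(sum(amounts))

print_owing([10.0, 32.5])  # prints the banner, then 42.5
```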
Commonly applied refactorings include:

- Extract Method: move a cohesive fragment of the method body into its own, well-named method.

If many parameters/temporary variables are present:

- Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object can reduce the amount of state being passed around.
- Replace Method with Method Object turns the method into a class of its own, so that parameters and temporaries become fields and the body can be split freely, as sketched below.
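A minimal sketch of Replace Method with Method Object; all names here are hypothetical, chosen to mirror the feature-extraction example further down:

```python
# The long method's parameters and temporaries become fields of a new
# class, so the body can be split into small methods without threading
# state through every call. Hypothetical names throughout.
class FeatureExtraction:
    def __init__(self, line, unigrams, text_stats):
        self.line = line                # former parameters become fields...
        self.unigrams = unigrams
        self.text_stats = text_stats
        self.features = []              # ...and so do former temporaries

    def compute(self):
        # Each step can now be its own small, named method.
        self._add_simple_features()
        self._add_unigram_scores()
        self._normalize()
        return self.features

    def _add_simple_features(self): ...
    def _add_unigram_scores(self): ...
    def _normalize(self): ...
```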
Complex classes, such as the MachineLearningFeatures class whose extract_features() method is listed below, often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
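A minimal sketch of Extract Class, using the classic phone-number example (hypothetical names, not code from this project):

```python
# Before, Person held office_area_code and office_number directly; the
# shared "office_" prefix marks a cohesive component. After Extract
# Class, those fields live in their own class.
class TelephoneNumber:
    def __init__(self, area_code, number):
        self.area_code = area_code
        self.number = number

    def __str__(self):
        return "({}) {}".format(self.area_code, self.number)

class Person:
    def __init__(self, name, office_area_code, office_number):
        self.name = name
        self.office_phone = TelephoneNumber(office_area_code, office_number)

print(Person("Ada", "020", "7946 0958").office_phone)  # (020) 7946 0958
```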
| 1 | """Package containing all the machine learning functions and objects |
||
| 109 | def extract_features(self, line, unigrams, text_stats): |
||
| 110 | """Extract features from a given line |
||
| 111 | |||
| 112 | Args: |
||
| 113 | line (Line): Line to get features from |
||
| 114 | unigrams (Unigrams): Unigrams for the given line |
||
| 115 | text_stats (Statistics): Statistics of the text the line is coming from |
||
| 116 | |||
| 117 | Returns: |
||
| 118 | list: List of the features |
||
| 119 | """ |
||
| 120 | # Simple features |
||
| 121 | features = [ |
||
| 122 | float(line.stats["orig"].get_stat("lw_char")), |
||
| 123 | float(line.stats["orig"].get_stat("up_char")), |
||
| 124 | float(line.stats["orig"].get_stat("sp_char")), |
||
| 125 | float(line.stats["orig"].get_stat("nb_char")), |
||
| 126 | float(len(line.tokens)), |
||
| 127 | ] |
||
| 128 | |||
| 129 | # Additional features |
||
| 130 | fappend = features.append |
||
| 131 | fappend(line.get_clean_stats().get_stat("lw_char")) |
||
| 132 | fappend(line.get_clean_stats().get_stat("up_char")) |
||
| 133 | fappend(line.get_clean_stats().get_stat("sp_char")) |
||
| 134 | fappend(line.get_clean_stats().get_stat("nb_char")) |
||
| 135 | fappend(line.get_line_score()) |
||
| 136 | fappend(len(line.get_orig_line())) |
||
| 137 | fappend(len(line.get_clean_line())) |
||
| 138 | |||
| 139 | u = unigrams |
||
| 140 | |||
| 141 | tk_len = [len(token[0]) for token in line.tokens] |
||
| 142 | word_avg_len = 0 |
||
| 143 | |||
| 144 | if len(tk_len) > 0: |
||
| 145 | word_avg_len = mean(tk_len) |
||
| 146 | |||
| 147 | fappend(float(word_avg_len)) |
||
| 148 | |||
| 149 | t0 = [u[tk[0]] for tk in line.tokens] |
||
| 150 | s0 = 0 |
||
| 151 | |||
| 152 | if len(t0) != 0: |
||
| 153 | s0 = mean(t0) |
||
| 154 | |||
| 155 | fappend(float(s0)) |
||
| 156 | |||
| 157 | t1 = [u[tk[1]] for tk in line.tokens if not tk[1] is None] |
||
| 158 | s1 = 0 |
||
| 159 | |||
| 160 | if len(t1) != 0: |
||
| 161 | s1 = mean(t1) |
||
| 162 | |||
| 163 | fappend(float(s1)) |
||
| 164 | |||
| 165 | t2 = [u[t] for tk in line.tokens if not tk[2] is None for t in tk[2].keys()] |
||
| 166 | s2 = 0 |
||
| 167 | |||
| 168 | if len(t2) != 0: |
||
| 169 | s2 = mean(t2) |
||
| 170 | |||
| 171 | fappend(float(s2)) |
||
| 172 | |||
| 173 | # Regularization |
||
| 174 | orig_chars = sum(features[:4]) |
||
| 175 | clean_chars = sum(features[5:9]) |
||
| 176 | |||
| 177 | f = [ |
||
| 178 | features[0] / orig_chars, |
||
| 179 | features[1] / orig_chars, |
||
| 180 | features[2] / orig_chars, |
||
| 181 | features[3] / orig_chars |
||
| 182 | ] |
||
| 183 | |||
| 184 | if clean_chars != 0: |
||
| 185 | f += [features[5] / clean_chars, |
||
| 186 | features[6] / clean_chars, |
||
| 187 | features[7] / clean_chars, |
||
| 188 | features[8] / clean_chars] |
||
| 189 | else: |
||
| 190 | f += [0, 0, 0, 0] |
||
| 191 | |||
| 192 | f += [features[9], |
||
| 193 | features[4] / text_stats.get_stat("word_avg_nb"), |
||
| 194 | features[12] / text_stats.get_stat("word_avg_length"), |
||
| 195 | features[10] / text_stats.get_stat("line_avg_length"), |
||
| 196 | features[11] / text_stats.get_stat("line_avg_length")] |
||
| 197 | |||
| 198 | if features[13] != 0: |
||
| 199 | f.append(features[14] / features[13]) |
||
| 200 | f.append(features[15] / features[13]) |
||
| 201 | else: |
||
| 202 | f.append(0) |
||
| 203 | f.append(0) |
||
| 204 | |||
| 205 | features = f |
||
| 206 | |||
| 207 | # Ordering the data set |
||
| 208 | features = [ |
||
| 209 | features[11], # Original line average len |
||
| 210 | features[12], # Clean line average len |
||
| 211 | features[9], # Original line average len |
||
| 212 | features[10], # Clean line average len |
||
| 213 | features[13], # Original line average len |
||
| 214 | features[14], # Clean line average len |
||
| 215 | features[0], # Original line average len |
||
| 216 | features[1], # Clean line average len |
||
| 217 | features[2], # Original line average len |
||
| 218 | features[3], # Clean line average len |
||
| 219 | features[4], # Original line average len |
||
| 220 | features[5], # Clean line average len |
||
| 221 | features[6], # Original line average len |
||
| 222 | features[7], # Clean line average len |
||
| 223 | ] |
||
| 224 | |||
| 225 | # Polynomial features |
||
| 226 | degree = 1 |
||
| 227 | poly_feat = [] |
||
| 228 | p_feat = poly1d(features) |
||
| 229 | |||
| 230 | for d in xrange(degree): |
||
| 231 | poly_feat += (p_feat ** (d+1)).coeffs.tolist() |
||
| 232 | |||
| 233 | del poly_feat[5] |
||
| 234 | |||
| 235 | self.features = poly_feat |
||
| 236 | |||
| 237 | return self.features |
||
| 238 |
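The method's own comments already name its cohesive steps, so Extract Method maps them almost one-to-one onto helpers. A sketch of the resulting top-level method (all helper names are hypothetical suggestions, not existing project code):

```python
def extract_features(self, line, unigrams, text_stats):
    """Extract features from a given line (same signature as above)."""
    features = self._simple_features(line)                # "Simple features"
    features += self._clean_line_features(line)           # "Additional features"
    features += self._unigram_scores(line, unigrams)
    features = self._normalize(features, text_stats)      # "Regularization"
    features = self._reorder(features)                    # "Ordering the data set"
    self.features = self._polynomial_features(features)   # "Polynomial features"
    return self.features
```

Each helper is now small enough to name precisely, and the copy-paste comments in the ordering step become unnecessary: the helper names carry that information instead.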