Total Complexity | 48 |
Total Lines | 231 |
Duplicated Lines | 0 % |
Complex classes like src.denoiser.text.Text often do many different things. To break such a class down, we need to identify a cohesive component within it. A common way to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | """This module contains necessary classes to parse a file in order to get the :class:`.Text` object. |
||
class Text(object):
    """Stores the text from a filename given in parameters.

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        stats (:class:`.Statistics`): Statistics object.
        contains_training_data (:func:`bool`): True when the text was loaded
            from a CSV file carrying a grade per line.
    """

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        # All counters start at zero; the *_avg_* entries are derived from
        # the totals once a file has been read (see _finalize_stats).
        stat_names = ["line_nb", "line_avg_length", "line_total_length",
                      "word_avg_length", "word_total_length", "word_avg_nb",
                      "word_total_nb"]
        self.stats = Statistics(stat_names)
        for name in stat_names:
            self.stats.set_stat(name, 0)

    def _flush_paragraph(self, paragraph):
        """Append *paragraph* to the text when non-empty; return a fresh one.

        Args:
            paragraph (list): Current accumulator of :class:`.Line` objects.

        Returns:
            list: A new, empty paragraph accumulator.
        """
        if len(paragraph) != 0:
            self.text.append(paragraph)
        return []

    def _bump_stat(self, name, delta):
        """Increment the statistic *name* by *delta*."""
        self.stats.set_stat(name, self.stats.get_stat(name) + delta)

    def _record_line(self, line_object):
        """Update the raw counters with one parsed :class:`.Line`."""
        self._bump_stat("line_nb", 1)
        self._bump_stat("line_total_length", len(line_object))
        self._bump_stat("word_total_nb", len(line_object.tokens))
        self._bump_stat("word_total_length",
                        sum(len(tkn) for tkn in line_object.tokens))

    def _finalize_stats(self):
        """Derive the average statistics from the accumulated totals.

        Guards against empty input: when no line (or no word) was read, the
        averages are left at 0 instead of raising ZeroDivisionError.
        """
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        if line_nb != 0:
            self.stats.set_stat(
                "line_avg_length",
                self.stats.get_stat("line_total_length") / line_nb)
            self.stats.set_stat("word_avg_nb", word_nb / line_nb)
        if word_nb != 0:
            self.stats.set_stat(
                "word_avg_length",
                self.stats.get_stat("word_total_length") / word_nb)

    def read_csv(self):
        """Read a CSV file and build the associated text object.

        Each valid row holds ``(line, grade)``; rows of any other width and
        rows whose line is blank act as paragraph separators.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            csv_reader = csv.reader(f)
            paragraph = []

            for row in csv_reader:
                # A row without exactly (line, grade) marks a paragraph break.
                if len(row) != 2:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                # NOTE: Python 2 idiom — csv yields byte strings that must
                # be decoded to unicode before processing.
                line = unicode(row[0].decode("utf-8"))
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                line_object = Line(line, row[1])
                paragraph.append(line_object)
                self._record_line(line_object)

            # Flush the trailing paragraph (file may not end with a break).
            self._flush_paragraph(paragraph)

        self._finalize_stats()
        logging.debug(self.filename + " read")

    def read_txt(self):
        """Read a text file and build the associated text object.

        Blank lines act as paragraph separators.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []

            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                line_object = Line(line)
                paragraph.append(line_object)
                self._record_line(line_object)

            # Flush the trailing paragraph (file may not end with a blank line).
            self._flush_paragraph(paragraph)

        self._finalize_stats()
        logging.debug(self.filename + " read")

    def _collect_lines(self, keep, extract):
        """Collect every line matching *keep*, rendered by *extract*.

        Args:
            keep (callable): Predicate on a :class:`.Line`.
            extract (callable): Maps a kept :class:`.Line` to a string.

        Returns:
            list: Extracted strings, followed by one trailing empty string
            when anything was collected (paragraph-separator convention).
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if keep(line):
                    lines.append(extract(line))

        if len(lines) > 0 and lines[-1] != "":
            lines.append("")

        return lines

    def get_clean_lines(self):
        """Returns clean lines from the text object.

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5,
                                   lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines from the text object.

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0,
                                   lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object.

        Returns:
            list: List of unclassified lines
        """
        # Grade is not 0 nor 5.
        return self._collect_lines(lambda line: line.grade % 5 != 0,
                                   lambda line: line.get_orig_line())

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0 when the denominator is 0."""
        if denominator != 0:
            return numerator / denominator
        return 0

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification.

        A true positive is a garbage string detected as such.

        Returns:
            dict: Dictionary with ``class`` (counting stats), ``score``
            (precision/recall/f1) and ``raw`` (confusion-matrix counts).
        """
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                # Only fully graded lines (0 or 5) take part in the score.
                if line.grade != 0 and line.grade != 5:
                    class_stats["unclassified"] += 1
                    continue

                # Lines the classifier never rated are counted separately.
                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                detected_garbage = (line.grade == 0)
                is_clean = (line.result == 1)
                if detected_garbage:
                    score_stats["FP" if is_clean else "TP"] += 1
                else:
                    score_stats["TN" if is_clean else "FN"] += 1

        precision = self._safe_ratio(
            score_stats["TP"], score_stats["TP"] + score_stats["FP"])
        recall = self._safe_ratio(
            score_stats["TP"], score_stats["TP"] + score_stats["FN"])
        f1 = self._safe_ratio(2 * precision * recall, precision + recall)

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }
486 |