Text.retrieve_text_score()   F
last analyzed

Complexity

Conditions 13

Size

Total Lines 61

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 2
Metric Value
cc 13
c 2
b 0
f 2
dl 0
loc 61
rs 2.9617

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

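Commonly applied refactorings include Extract Method: moving a commented part of the body into a new, well-named method, as described above.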

Complexity

Complex methods like Text.retrieve_text_score() often do a lot of different things, and so do the classes that contain them. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object.
2
3
.. Authors:
4
    Philippe Dessauw
5
    [email protected]
6
7
.. Sponsor:
8
    Alden Dima
9
    [email protected]
10
    Information Systems Group
11
    Software and Systems Division
12
    Information Technology Laboratory
13
    National Institute of Standards and Technology
14
    http://www.nist.gov/itl/ssd/is
15
"""
16
from __future__ import division
17
import re
18
from nltk.tokenize import word_tokenize
19
from unidecode import unidecode
20
import codecs
21
from collections import Counter
22
import csv
23
import logging
24
from numpy import mean
25
from denoiser.text.stats import Statistics
26
27
28
def tokenize(line):
    """Split a line into clean tokens, isolating every separator character

    The line is first tokenized with :func:`word_tokenize` and every resulting word is transliterated with
    :func:`unidecode`. Each word is then cut around the separator characters; separators are kept as
    tokens of their own and empty tokens are discarded.

    Parameters:
        line (:func:`str`): A line of text

    Returns:
        list - List of different tokens
    """
    separators = "=+/,.:;!?%<>#()&[]{}"
    tokens = []

    # word_tokenize already isolates most of the separators
    for word in word_tokenize(line):
        pieces = [unidecode(word)]

        # Refine the pieces one separator at a time
        for separator in separators:
            refined = []

            for piece in pieces:
                parts = piece.split(separator)

                if len(parts) == 1:  # Separator is absent, the piece is kept untouched
                    refined.extend(parts)
                    continue

                # Re-insert the separator between two consecutive parts (never after the last one)
                last_position = len(parts) - 1
                for position, part in enumerate(parts):
                    refined.append(part)

                    if position != last_position:
                        refined.append(unicode(separator))

            pieces = refined

        # Splitting around leading / trailing separators leaves empty strings behind
        tokens.extend(piece for piece in pieces if piece != '')

    return tokens
70
71
72
def clean_head_tail(word):
    """Remove the non-word characters surrounding a word

    Letters, apostrophes and hyphens are considered as word characters. Everything located before the
    first one and after the last one is dropped.

    Parameters:
        word (:func:`str`): The word to clean
    Returns:
        :func:`str` - Cleaned word, or :data:`None` if the word contains no word character or no letter
    """
    # The pattern is anchored on both ends, it matches the whole word or nothing
    matched = re.match(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$", word)

    # Non matching strings are dirty (i.e. cannot be cleaned)
    if matched is None:
        return None

    core = matched.group(1)

    # A core made only of apostrophes and hyphens is not a word
    if re.search(r"[a-zA-Z]", core) is None:
        return None

    return core
95
96
97
class Text(object):
    """Stores the text from a filename given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): Set to True by :func:`read_csv` and to False by :func:`read_txt`.
        stats (:class:`.Statistics`): Statistics object.
    """

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        stat_names = ["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
                      "word_total_length", "word_avg_nb", "word_total_nb"]

        self.stats = Statistics(stat_names)
        for stat_name in stat_names:
            self.stats.set_stat(stat_name, 0)

    @staticmethod
    def _ratio(numerator, denominator):
        """Divide two numbers, returning 0 when the denominator is 0"""
        if denominator != 0:
            return numerator / denominator

        return 0

    def _increase_stat(self, name, amount):
        """Add an amount to one of the counters of the statistics object"""
        self.stats.set_stat(name, self.stats.get_stat(name) + amount)

    def _close_paragraph(self, paragraph):
        """Store a paragraph if it is not empty and return the list to use for the next paragraph"""
        if len(paragraph) != 0:
            self.text.append(paragraph)

        return []

    def _register_line(self, paragraph, line_object):
        """Append a line to the current paragraph and update the running totals"""
        paragraph.append(line_object)

        self._increase_stat("line_nb", 1)
        self._increase_stat("line_total_length", len(line_object))
        self._increase_stat("word_total_nb", len(line_object.tokens))

        # Every token is a list `[original_token, clean_token, corrected_token]`: the length of a word is
        # the length of its original token, not the length of that 3-element list
        words_len = sum(len(tkn[0]) for tkn in line_object.tokens)
        self._increase_stat("word_total_length", words_len)

    def _update_averages(self):
        """Compute the averages from the totals (an average is 0 when there is nothing to average)"""
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        self.stats.set_stat("line_avg_length", self._ratio(self.stats.get_stat("line_total_length"), line_nb))
        self.stats.set_stat("word_avg_length", self._ratio(self.stats.get_stat("word_total_length"), word_nb))
        self.stats.set_stat("word_avg_nb", self._ratio(word_nb, line_nb))

    def read_csv(self):
        """Read a CSV file and fill the text object with its content

        Every row is expected to have two columns: the line of text (UTF-8 encoded) and its expected result.
        A row with another number of columns, or with a blank line of text, ends the current paragraph.
        The statistics are updated along the way. Nothing is returned.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            paragraph = []

            for row in csv.reader(f):
                if len(row) != 2:
                    paragraph = self._close_paragraph(paragraph)
                    continue

                line = unicode(row[0].decode("utf-8"))
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._close_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line, row[1]))

            self._close_paragraph(paragraph)

        self._update_averages()
        logging.debug("%s read", self.filename)

    def read_txt(self):
        """Read a UTF-8 text file and fill the text object with its content

        A blank line ends the current paragraph. The statistics are updated along the way. Nothing is
        returned.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []

            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._close_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line))

            self._close_paragraph(paragraph)

        self._update_averages()
        logging.debug("%s read", self.filename)

    def _collect_lines(self, is_selected, render):
        """Render the selected lines of every paragraph, paragraphs being separated by an empty string

        Parameters:
            is_selected (callable): Tells if a line has to be part of the result
            render (callable): Returns the string representing a selected line

        Returns:
            list: List of rendered lines
        """
        lines = []

        for paragraph in self.text:
            lines.extend(render(line) for line in paragraph if is_selected(line))

            # Close the paragraph, without ever stacking two separators
            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def get_clean_lines(self):
        """Returns the clean lines (grade 5) of the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5, lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns the garbage lines (grade 0) of the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0, lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns the unclassified lines (grade different from 0 and 5) of the text object

        Returns:
            list: List of unclassified lines
        """
        return self._collect_lines(lambda line: line.grade not in (0, 5), lambda line: line.get_orig_line())

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines whose grade is neither 0 nor 5 are counted as unclassified, lines without an expected result
        (None or negative) as unrated. The remaining lines feed the confusion matrix.

        Returns:
            dict: Dictionary containing the results: `class` (number of classified, unclassified and unrated
            lines), `score` (precision, recall and f1) and `raw` (confusion matrix)
        """
        # True positive is a garbage string detected as such
        # Key: (line detected as garbage, line expected to be clean)
        outcomes = {
            (True, True): "FP",
            (True, False): "TP",
            (False, True): "TN",
            (False, False): "FN",
        }

        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade not in (0, 5):
                    class_stats["unclassified"] += 1
                elif line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                else:
                    class_stats["classified"] += 1
                    score_stats[outcomes[(line.grade == 0, line.result == 1)]] += 1

        precision = self._ratio(score_stats["TP"], score_stats["TP"] + score_stats["FP"])
        recall = self._ratio(score_stats["TP"], score_stats["TP"] + score_stats["FN"])
        f1 = self._ratio(2 * precision * recall, precision + recall)

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }
335
336
337
class Line(object):
    """Represents a line of text and provides datastructures to handle it.

    Args:
        string (unicode): Line to parse.
        result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)

    Attributes:
        tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
            3 element organized in this order `(original_token, clean_token, corrected_token)`
        pos_string (:func:`str`): Reference string containing the position of all the tokens
        result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
                                              algorithm
        grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
        stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
            **lower**, **upper** and **special** characters along with **numbers**.
    """

    # Name of every character statistic along with the pattern matching the characters it counts
    # (spaces do not belong to any category)
    _CHAR_CLASSES = (
        ("lw_char", r'[a-z]'),
        ("up_char", r'[A-Z]'),
        ("nb_char", r'[0-9]'),
        ("sp_char", r'[^a-zA-Z0-9 ]'),
    )

    def __init__(self, string, result=None):
        self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

        # String containing the position of each token (e.g. "%0 %1%2 ... %n"): the first occurrence of
        # every original token is replaced by its placeholder
        self.pos_string = string
        for index, token in enumerate(self.tokens):
            self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)

        self.result = None if result is None else int(result)

        # A line without any cleanable token is garbage, every other line starts with a neutral grade
        clean_length = sum(len(t[1]) for t in self.tokens if t[1] is not None)
        self.grade = 0 if clean_length == 0 else 3

        self.stats = {
            "orig": self._char_stats(self.get_orig_line()),
            "clean": None  # Computed on demand by get_clean_stats
        }

    def _char_stats(self, string):
        """Count the lower, upper, number and special characters of a string

        Parameters:
            string (:func:`str`): String to analyze

        Returns:
            Statistics: Statistics of the string
        """
        char_stats = Statistics([name for name, _ in self._CHAR_CLASSES])

        for name, pattern in self._CHAR_CLASSES:
            char_stats.set_stat(name, len(re.findall(pattern, string)))

        return char_stats

    def raise_grade(self):
        """Add 1 to the grade of the line (up to 5)
        """
        if self.grade < 5:
            self.grade += 1

    def decrease_grade(self):
        """Remove 1 to the grade of the line (down to 0)
        """
        if self.grade > 0:
            self.grade -= 1

    def set_garbage(self):
        """Set the grade to 0
        """
        self.grade = 0

    def set_clean(self):
        """Set the grade to 5
        """
        self.grade = 5

    def get_orig_line(self):
        """Returns the original line

        Every placeholder of the position string is replaced by its original token.

        Returns:
            str: Original line
        """
        string = self.pos_string

        # Highest indexes first: "%1" must not be replaced inside "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            string = string.replace("%"+str(index), token[0])

        return string

    def get_clean_line(self):
        """Returns the clean line

        Every placeholder of the position string is replaced by, in order of preference, the first key of
        the token corrections, the clean token or the original token. Consecutive spaces are merged and the
        result is stripped.

        Returns:
            str: Clean line
        """
        string = self.pos_string

        # Highest indexes first: "%1" must not be replaced inside "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            if token[2] is not None and len(token[2]) > 0:
                # First key of the corrections (same element as `keys()[0]`, without requiring a list)
                replacement = next(iter(token[2]))
            elif token[1] is not None:  # Inline correction is not available
                replacement = token[1]
            else:  # Clean token does not exist, use the original token
                replacement = token[0]

            string = string.replace("%"+str(index), replacement)

        return re.sub(" +", " ", string).strip()

    def get_orig_stats(self):
        """Get original stats of the line

        Returns:
            Statistics: Statistics of the original line
        """
        return self.stats["orig"]

    def get_clean_stats(self):
        """Get clean stats of the line

        The statistics are computed on the first call and reused afterwards.

        Returns:
            Statistics: Statistics of the clean line
        """
        if self.stats["clean"] is None:  # Compute clean stats if it is not already done
            # Stored only once fully computed, so a failure never leaves partial statistics behind
            self.stats["clean"] = self._char_stats(self.get_clean_line())

        return self.stats["clean"]

    def get_line_score(self):
        """Return a global score of the line

        The score is the sum of the mean correction value of every corrected token, divided by the total
        number of tokens. Tokens without correction (None or empty) do not add anything to the sum.

        Returns:
            float: Score of the line (0 for a line without token)
        """
        score = 0

        if len(self.tokens) == 0:
            return score

        for corrections in (t[2] for t in self.tokens):
            # NOTE(review): corrections presumably map a candidate word to its score - confirm with caller
            if corrections:  # The mean of an empty collection is not a number, skip it
                score += mean(list(corrections.values()))

        return score / len(self.tokens)

    def __len__(self):
        return len(self.get_orig_line())

    def __str__(self):
        return str(self.tokens) + " | " + str(self.grade)
492