1
|
|
|
"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object. |
2
|
|
|
|
3
|
|
|
.. Authors: |
4
|
|
|
Philippe Dessauw |
5
|
|
|
[email protected] |
6
|
|
|
|
7
|
|
|
.. Sponsor: |
8
|
|
|
Alden Dima |
9
|
|
|
[email protected] |
10
|
|
|
Information Systems Group |
11
|
|
|
Software and Systems Division |
12
|
|
|
Information Technology Laboratory |
13
|
|
|
National Institute of Standards and Technology |
14
|
|
|
http://www.nist.gov/itl/ssd/is |
15
|
|
|
""" |
16
|
|
|
from __future__ import division |
17
|
|
|
import re |
18
|
|
|
from nltk.tokenize import word_tokenize |
19
|
|
|
from unidecode import unidecode |
20
|
|
|
import codecs |
21
|
|
|
from collections import Counter |
22
|
|
|
import csv |
23
|
|
|
import logging |
24
|
|
|
from numpy import mean |
25
|
|
|
from denoiser.text.stats import Statistics |
26
|
|
|
|
27
|
|
|
|
28
|
|
|
def tokenize(line):
    """Break a line of text into clean tokens

    The line is first handed to :func:`word_tokenize`; every resulting word is
    transliterated with :func:`unidecode` and then cut around each separator
    character. Separators are kept as tokens of their own, empty pieces are
    dropped.

    Parameters:
        line (:func:`str`): A line of text

    Returns:
        list - List of different tokens
    """
    separators = "=+/,.:;!?%<>#()&[]{}"
    tokens = []

    # word_tokenize already isolates most of the separators
    for word in word_tokenize(line):
        pieces = [unidecode(word)]

        # Refine the pieces one separator at a time
        for separator in separators:
            expanded = []

            for piece in pieces:
                chunks = piece.split(separator)

                if len(chunks) == 1:  # Separator is absent from this piece
                    expanded.extend(chunks)
                    continue

                # Interleave the separator between the chunks (never after the last one)
                last_position = len(chunks) - 1
                for position, chunk in enumerate(chunks):
                    expanded.append(chunk)

                    if position != last_position:
                        expanded.append(unicode(separator))

            pieces = expanded

        tokens.extend([piece for piece in pieces if piece != ''])

    return tokens
70
|
|
|
|
71
|
|
|
|
72
|
|
|
def clean_head_tail(word):
    """Strip the leading and trailing non-word characters of a word

    Letters, apostrophes and hyphens are considered word characters; anything
    else is removed from both ends while the middle of the word is kept as is.

    Parameters:
        word (:func:`str`): The word to clean
    Returns:
        :func:`str` - Cleaned word, or :data:`None` if the word cannot be cleaned or holds no letter
    """
    # Group 1 spans from the first to the last word character
    match = re.search(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$", word)

    # No word character at all: the string is dirty and cannot be cleaned
    if match is None:
        return None

    core = match.group(1)

    # Only apostrophes / hyphens at the ends and no letter anywhere: not a word
    if re.search(r"[a-zA-Z]", core) is None:
        return None

    return core
95
|
|
|
|
96
|
|
|
|
97
|
|
|
class Text(object):
    """Stores the text from a filename given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): :data:`True` once the text has been loaded with
            :meth:`read_csv` (lines carry an expected result), :data:`False` otherwise.
        stats (:class:`.Statistics`): Statistics object.
    """

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        stat_names = ["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
                      "word_total_length", "word_avg_nb", "word_total_nb"]
        self.stats = Statistics(stat_names)

        # Every counter starts at zero
        for stat_name in stat_names:
            self.stats.set_stat(stat_name, 0)

    def _increase_stat(self, name, amount):
        """Add `amount` to the statistic called `name`"""
        self.stats.set_stat(name, self.stats.get_stat(name) + amount)

    def _register_line(self, line_object):
        """Update the running totals with a freshly parsed line"""
        self._increase_stat("line_nb", 1)
        self._increase_stat("line_total_length", len(line_object))
        self._increase_stat("word_total_nb", len(line_object.tokens))

        # A token is a 3-element list whose first item is the original token text:
        # the length of that text is measured, not the length of the list.
        self._increase_stat("word_total_length", sum(len(tkn[0]) for tkn in line_object.tokens))

    def _update_averages(self):
        """Compute the averages from the totals (0 when there is nothing to average)"""
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        line_avg_length = self.stats.get_stat("line_total_length") / line_nb if line_nb else 0
        word_avg_length = self.stats.get_stat("word_total_length") / word_nb if word_nb else 0
        word_avg_nb = word_nb / line_nb if line_nb else 0

        self.stats.set_stat("line_avg_length", line_avg_length)
        self.stats.set_stat("word_avg_length", word_avg_length)
        self.stats.set_stat("word_avg_nb", word_avg_nb)

    def _collect_lines(self, keep, render):
        """List `render(line)` for every line accepted by `keep`

        An empty string is appended after each paragraph, unless nothing has been collected yet
        or the list already ends with an empty string.
        """
        lines = []

        for paragraph in self.text:
            lines.extend([render(line) for line in paragraph if keep(line)])

            if lines and lines[-1] != "":
                lines.append("")

        return lines

    def read_csv(self):
        """Read a CSV file and build the associated text object

        Every usable row holds two columns: the line of text and its expected result. Any other
        row, or a row whose text is empty, closes the current paragraph. The object is filled in
        place (`text`, `stats`, `contains_training_data`); nothing is returned.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            paragraph = []

            for row in csv.reader(f):
                line = None
                if len(row) == 2:
                    line = unicode(row[0].decode("utf-8"))
                    line = line.strip(" \t\r\n")

                if not line:  # Malformed row or empty text: paragraph break
                    if paragraph:
                        self.text.append(paragraph)
                        paragraph = []

                    continue

                line_object = Line(line, row[1])
                paragraph.append(line_object)
                self._register_line(line_object)

            if paragraph:
                self.text.append(paragraph)

        self._update_averages()

        logging.debug("%s read", self.filename)

    def read_txt(self):
        """Read a text file and build the associated text object

        The file is decoded as UTF-8; empty lines separate the paragraphs. The object is filled
        in place (`text`, `stats`, `contains_training_data`); nothing is returned.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []

            for line in f:
                line = line.strip(" \t\r\n")

                if not line:  # Empty line: paragraph break
                    if paragraph:
                        self.text.append(paragraph)
                        paragraph = []

                    continue

                line_object = Line(line)
                paragraph.append(line_object)
                self._register_line(line_object)

            if paragraph:
                self.text.append(paragraph)

        self._update_averages()

        logging.debug("%s read", self.filename)

    def get_clean_lines(self):
        """Returns clean lines (grade 5) from the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5,
                                   lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines (grade 0) from the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0,
                                   lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        # Grade is not 0 nor 5
        return self._collect_lines(lambda line: line.grade % 5 != 0,
                                   lambda line: line.get_orig_line())

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines whose grade is neither 0 nor 5 are counted as unclassified, lines without a usable
        expected result as unrated. The remaining lines feed the confusion matrix.

        Returns:
            dict: Dictionary containing the results (`class`, `score` and `raw` entries)
        """
        # True positive is a garbage string detected as such
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade != 0 and line.grade != 5:
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                detected_as_garbage = line.grade == 0
                expected_clean = line.result == 1

                if detected_as_garbage:
                    outcome = "FP" if expected_clean else "TP"
                else:
                    outcome = "TN" if expected_clean else "FN"

                score_stats[outcome] += 1

        # Precision
        divider_pr = score_stats["TP"] + score_stats["FP"]
        precision = score_stats["TP"] / divider_pr if divider_pr != 0 else 0

        # Recall
        divider_rc = score_stats["TP"] + score_stats["FN"]
        recall = score_stats["TP"] / divider_rc if divider_rc != 0 else 0

        # F1 score
        if precision + recall != 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }
335
|
|
|
|
336
|
|
|
|
337
|
|
|
class Line(object):
    """Represents a line of text and provides datastructures to handle it.

    Args:
        string (unicode): Line to parse.
        result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)

    Attributes:
        tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
            3 element organized in this order `(original_token, clean_token, corrected_token)`
        pos_string (:func:`str`): Reference string containing the position of all the tokens
        result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
            algorithm
        grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
        stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
            **lower**, **upper** and **special** characters along with **numbers**.
    """

    def __init__(self, string, result=None):
        self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

        # String containing the position of each token (e.g. "%0 %1%2 ... %n")
        # NOTE(review): each replacement targets the first occurrence of the token text in the
        # string as it stands, placeholders included. get_orig_line / get_clean_line undo the
        # substitutions in reverse order; keep both sides in sync if this is ever changed.
        self.pos_string = string
        for index, token in enumerate(self.tokens):
            self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)

        self.result = None if result is None else int(result)

        # A line without a single clean token starts as garbage, any other line starts at 3
        self.grade = 3 if any(token[1] for token in self.tokens) else 0

        self.stats = {
            "orig": self._char_class_stats(self.get_orig_line()),
            "clean": None  # Computed on demand by get_clean_stats
        }

    @staticmethod
    def _char_class_stats(text):
        """Count the lower, upper, numeric and special characters of `text` (spaces are ignored)"""
        masked = re.sub(r'[a-z]', 'a', text)  # Lower chars replacement
        masked = re.sub(r'[A-Z]', 'A', masked)  # Upper chars replacement
        masked = re.sub(r'[0-9]', '0', masked)  # Numbers replacement
        masked = re.sub(r'[^a-zA-Z0-9 ]', '#', masked)  # Special chars replacement
        char_counts = Counter(masked)

        stats = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
        stats.set_stat("lw_char", char_counts["a"])
        stats.set_stat("up_char", char_counts["A"])
        stats.set_stat("nb_char", char_counts["0"])
        stats.set_stat("sp_char", char_counts["#"])

        return stats

    def _fill_positions(self, pick):
        """Replace every placeholder of `pos_string` by `pick(token)`

        Placeholders are processed from the highest index down so that "%1" is never matched
        inside "%10".
        """
        string = self.pos_string

        for index in reversed(range(len(self.tokens))):
            string = string.replace("%"+str(index), pick(self.tokens[index]))

        return string

    @staticmethod
    def _best_form(token):
        """Pick the corrected token if any, else the clean token, else the original token"""
        if token[2]:  # Inline correction is available (neither None nor empty)
            # presumably a dict of candidate corrections; its first key is used -- TODO confirm
            return next(iter(token[2]))

        if token[1] is not None:
            return token[1]

        return token[0]  # Clean token does not exist, use the original token

    def raise_grade(self):
        """Add 1 to the grade of the line (up to 5)
        """
        if self.grade < 5:
            self.grade += 1

    def decrease_grade(self):
        """Remove 1 to the grade of the line (down to 0)
        """
        if self.grade > 0:
            self.grade -= 1

    def set_garbage(self):
        """Set the grade to 0
        """
        self.grade = 0

    def set_clean(self):
        """Set the grade to 5
        """
        self.grade = 5

    def get_orig_line(self):
        """Returns the original line

        Returns:
            str: Original line
        """
        return self._fill_positions(lambda token: token[0])

    def get_clean_line(self):
        """Returns the clean line

        Every token is rendered with its correction when there is one, else with its clean form,
        else with its original form. Runs of spaces are collapsed and the result is stripped.

        Returns:
            str: Clean line
        """
        return re.sub(" +", " ", self._fill_positions(self._best_form)).strip()

    def get_orig_stats(self):
        """Get original stats of the line

        Returns:
            Statistics: Statistics of the original line
        """
        return self.stats["orig"]

    def get_clean_stats(self):
        """Get clean stats of the line

        Returns:
            Statistics: Statistics of the clean line
        """
        if self.stats["clean"] is None:  # Compute clean stats if it is not already done
            self.stats["clean"] = self._char_class_stats(self.get_clean_line())

        return self.stats["clean"]

    def get_line_score(self):
        """Return a global score of the line

        The score is the sum, over the corrected tokens, of the mean of their correction values,
        divided by the total number of tokens. Tokens without correction (:data:`None` or empty)
        contribute nothing, so that an empty correction set cannot turn the score into NaN.

        Returns:
            float: Score of the line
        """
        score = 0

        if len(self.tokens) == 0:
            return score

        for corrections in [token[2] for token in self.tokens if token[2]]:
            score += mean(list(corrections.values()))

        return score / len(self.tokens)

    def __len__(self):
        return len(self.get_orig_line())

    def __str__(self):
        return str(self.tokens) + " | " + str(self.grade)
492
|
|
|
|