"""This module contains the classes needed to parse a file in order to get the :class:`.Text` object.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
import codecs
import csv
import logging
import re
from collections import Counter
from nltk.tokenize import word_tokenize
from numpy import mean
from unidecode import unidecode
from denoiser.text.stats import Statistics


def tokenize(line):
    """Separate a line to get clean tokens out of it

    Parameters:
        line (:func:`str`): A line of text

    Returns:
        list - List of the different tokens
    """
    separators = "=+/,.:;!?%<>#()&[]{}"

    tokens = []
    tokenized_line = word_tokenize(line)  # Will get rid of most of the separators

    for word in tokenized_line:
        tmp_tokens = [unidecode(word)]

        for separator in separators:
            sep_tokens = []

            for tmp_token in tmp_tokens:
                split_token = tmp_token.split(separator)

                if len(split_token) != 1:  # Token has been split
                    # Concatenating the list of sub-tokens with the separator
                    tkn_sep_list = []

                    for ind, tkn in enumerate(split_token):
                        tkn_sep_list.append(tkn)

                        if ind != len(split_token) - 1:  # Avoid adding the separator at the end
                            tkn_sep_list.append(unicode(separator))

                    sep_tokens += tkn_sep_list
                else:
                    sep_tokens += split_token

            tmp_tokens = sep_tokens

        tokens += [tkn for tkn in tmp_tokens if tkn != '']

    return tokens
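

# Example (illustrative only; the exact output depends on the NLTK tokenizer
# version): separators missed by word_tokenize are split out by the loop above.
#   tokenize(u"value=12,34")
#   -> [u'value', u'=', u'12', u',', u'34']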


def clean_head_tail(word):
    """Clean head and tail of a word

    Parameters:
        word (:func:`str`): The word to clean

    Returns:
        :func:`str` - Cleaned word
    """
    cleaning_regexp = re.compile(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$")
    alpha_regexp = re.compile(r"[a-zA-Z]")

    word_groups = cleaning_regexp.findall(word)

    # Non-matching strings are set as dirty (i.e. cannot be cleaned)
    # None is returned
    if len(word_groups) == 0:
        return None

    # Words containing no letters are set to None
    if alpha_regexp.search(word_groups[0][0]) is None:
        return None

    return word_groups[0][0]
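

# Example (illustrative):
#   clean_head_tail(u"(hello,")  -> u"hello"  (leading/trailing symbols stripped)
#   clean_head_tail(u"1234")     -> None      (no letters to keep)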


class Text(object):
    """Stores the text from the file given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        stats (:class:`.Statistics`): Statistics object.
    """

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        self.stats = Statistics(["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
                                 "word_total_length", "word_avg_nb", "word_total_nb"])
        self.stats.set_stat("line_nb", 0)
        self.stats.set_stat("line_avg_length", 0)
        self.stats.set_stat("line_total_length", 0)
        self.stats.set_stat("word_avg_length", 0)
        self.stats.set_stat("word_total_length", 0)
        self.stats.set_stat("word_avg_nb", 0)
        self.stats.set_stat("word_total_nb", 0)

    def read_csv(self):
        """Read a CSV file and build the text object in place

        Rows with two fields hold a line of text and its expected result;
        any other row acts as a paragraph separator.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            csv_reader = csv.reader(f)
            paragraph = []

            for row in csv_reader:
                if len(row) != 2:
                    if len(paragraph) != 0:
                        self.text.append(paragraph)
                        paragraph = []

                    continue

                line = unicode(row[0].decode("utf-8"))
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    if len(paragraph) != 0:
                        self.text.append(paragraph)
                        paragraph = []

                    continue

                line_object = Line(line, row[1])
                paragraph.append(line_object)

                self.stats.set_stat("line_nb", self.stats.get_stat("line_nb") + 1)
                self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length") + len(line_object))
                self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))

                words_len = sum([len(tkn) for tkn in line_object.tokens])
                self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)

        if len(paragraph) != 0:
            self.text.append(paragraph)

        self.stats.set_stat("line_avg_length",
                            self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
        self.stats.set_stat("word_avg_length",
                            self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
        self.stats.set_stat("word_avg_nb",
                            self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))

        logging.debug(self.filename + " read")
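
    # A matching CSV file could look like this (hypothetical sample; the second
    # field is the expected result, 1 for a clean line and 0 for garbage):
    #   This line is clean,1
    #   Th1s l!ne i$ g@rbage,0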

    def read_txt(self):
        """Read a text file and build the text object in place

        Every non-empty line becomes a :class:`.Line`; empty lines act as
        paragraph separators.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []

            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    if len(paragraph) != 0:
                        self.text.append(paragraph)
                        paragraph = []

                    continue

                line_object = Line(line)
                paragraph.append(line_object)

                self.stats.set_stat("line_nb", self.stats.get_stat("line_nb") + 1)
                self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length") + len(line_object))
                self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))

                words_len = sum([len(tkn) for tkn in line_object.tokens])
                self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)

        if len(paragraph) != 0:
            self.text.append(paragraph)

        self.stats.set_stat("line_avg_length",
                            self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
        self.stats.set_stat("word_avg_length",
                            self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
        self.stats.set_stat("word_avg_nb",
                            self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))

        logging.debug(self.filename + " read")
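
    # Typical use (sketch; "paper.txt" is a hypothetical file):
    #   text = Text("paper.txt")
    #   text.read_txt()
    #   print(text.stats.get_stat("word_avg_nb"))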

    def get_clean_lines(self):
        """Returns the clean lines from the text object

        Returns:
            list: List of clean lines
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if line.grade == 5:
                    lines.append(line.get_clean_line())

            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def get_garbage_lines(self):
        """Returns the garbage lines from the text object

        Returns:
            list: List of garbage lines
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if line.grade == 0:
                    lines.append(line.get_orig_line())

            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def get_unclassified_lines(self):
        """Returns the unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if line.grade % 5 != 0:  # Grade is neither 0 nor 5
                    lines.append(line.get_orig_line())

            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def retrieve_text_score(self):
        """Returns stats and scores regarding the classification

        Returns:
            dict: Dictionary containing the results
        """
        # A true positive is a garbage string detected as such
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade != 0 and line.grade != 5:
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                if line.grade == 0:  # Line detected as garbage
                    if line.result == 1:  # Line is clean
                        score_stats["FP"] += 1  # False positive
                    else:  # Line is garbage
                        score_stats["TP"] += 1  # True positive
                else:  # Line detected as clean
                    if line.result == 1:  # Line is clean
                        score_stats["TN"] += 1  # True negative
                    else:  # Line is garbage
                        score_stats["FN"] += 1  # False negative

        # Precision
        divider_pr = score_stats["TP"] + score_stats["FP"]
        if divider_pr != 0:
            precision = score_stats["TP"] / divider_pr
        else:
            precision = 0

        # Recall
        divider_rc = score_stats["TP"] + score_stats["FN"]
        if divider_rc != 0:
            recall = score_stats["TP"] / divider_rc
        else:
            recall = 0

        # F1 score
        if precision + recall != 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }
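
    # Worked example (hypothetical counts): with TP=8, FP=2, FN=4,
    #   precision = 8 / (8 + 2) = 0.8
    #   recall    = 8 / (8 + 4) = 2/3
    #   f1        = 2 * 0.8 * (2/3) / (0.8 + 2/3) = 0.727 (approx.)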


class Line(object):
    """Represents a line of text and provides the data structures to handle it.

    Args:
        string (unicode): Line to parse.
        result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)

    Attributes:
        tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list`
            of 3 elements organized in this order: `(original_token, clean_token, corrected_token)`
        pos_string (:func:`str`): Reference string containing the position of all the tokens
        result (:func:`int` or :data:`None`): Expected result for a line. Helps compute the fitness (F1 score) of
            the algorithm
        grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
        stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them computes the
            number of **lower**, **upper** and **special** characters along with **numbers**.
    """

    def __init__(self, string, result=None):
        self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

        self.pos_string = string  # String containing the position of each token (e.g. "%0 %1%2 ... %n")
        for index, token in enumerate(self.tokens):
            self.pos_string = self.pos_string.replace(token[0], "%" + str(index), 1)

        self.result = None
        if result is not None:
            self.result = int(result)

        if sum([len(t[1]) for t in self.tokens if t[1] is not None]) == 0:
            self.grade = 0
        else:
            self.grade = 3

        self.stats = {
            "orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
            "clean": None
        }

        tmp_line = re.sub(r'[a-z]', 'a', self.get_orig_line())  # Lower chars replacement
        tmp_line = re.sub(r'[A-Z]', 'A', tmp_line)  # Upper chars replacement
        tmp_line = re.sub(r'[0-9]', '0', tmp_line)  # Numbers replacement
        tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line)  # Special chars replacement
        line_stats = Counter(tmp_line)

        self.stats["orig"].set_stat("lw_char", line_stats["a"])
        self.stats["orig"].set_stat("up_char", line_stats["A"])
        self.stats["orig"].set_stat("nb_char", line_stats["0"])
        self.stats["orig"].set_stat("sp_char", line_stats["#"])
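
    # For instance (illustrative): Line(u"Hello, world") tokenizes to
    # [u'Hello', u',', u'world'] and yields pos_string u"%0%1 %2".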

    def raise_grade(self):
        """Add 1 to the grade of the line (up to 5)
        """
        if self.grade < 5:
            self.grade += 1

    def decrease_grade(self):
        """Remove 1 from the grade of the line (down to 0)
        """
        if self.grade > 0:
            self.grade -= 1

    def set_garbage(self):
        """Set the grade to 0
        """
        self.grade = 0

    def set_clean(self):
        """Set the grade to 5
        """
        self.grade = 5

    def get_orig_line(self):
        """Returns the original line

        Returns:
            str: Original line
        """
        string = self.pos_string

        # Replace from the highest index so that "%1" does not also match the prefix of "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            string = string.replace("%" + str(index), token[0])

        return string

    def get_clean_line(self):
        """Returns the clean line

        Returns:
            str: Clean line
        """
        string = self.pos_string

        for index, token in reversed(list(enumerate(self.tokens))):
            if token[2] is not None and len(token[2]) > 0:
                string = string.replace("%" + str(index), token[2].keys()[0])
            else:  # Inline correction is not available
                if token[1] is not None:
                    string = string.replace("%" + str(index), token[1])
                else:  # Clean token does not exist, use the original token
                    string = string.replace("%" + str(index), token[0])

        return re.sub(" +", " ", string).strip()
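
    # Illustrative: if tokens[1] == [u"w0rd,", u"w0rd", {u"word": 0.9}], the
    # clean line substitutes the correction u"word" at position %1 of pos_string.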

    def get_orig_stats(self):
        """Get the stats of the original line

        Returns:
            Statistics: Statistics of the original line
        """
        return self.stats["orig"]

    def get_clean_stats(self):
        """Get the stats of the clean line

        Returns:
            Statistics: Statistics of the clean line
        """
        if self.stats["clean"] is None:  # Compute the clean stats if it is not already done
            self.stats["clean"] = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])

            tmp_line = re.sub(r'[a-z]', 'a', self.get_clean_line())  # Lower chars replacement
            tmp_line = re.sub(r'[A-Z]', 'A', tmp_line)  # Upper chars replacement
            tmp_line = re.sub(r'[0-9]', '0', tmp_line)  # Numbers replacement
            tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line)  # Special chars replacement
            line_stats = Counter(tmp_line)

            self.stats["clean"].set_stat("lw_char", line_stats["a"])
            self.stats["clean"].set_stat("up_char", line_stats["A"])
            self.stats["clean"].set_stat("nb_char", line_stats["0"])
            self.stats["clean"].set_stat("sp_char", line_stats["#"])

        return self.stats["clean"]

    def get_line_score(self):
        """Return a global score for the line

        Returns:
            float: Score of the line
        """
        score = 0

        if len(self.tokens) == 0:
            return score

        # Every corrected token carries a dict of candidate corrections and their
        # scores; average them, then normalize by the total token count
        for token in [t[2] for t in self.tokens if t[2] is not None]:
            score += mean([s for s in token.values()])

        return score / len(self.tokens)
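
    # E.g. (hypothetical): a line with tokens [[u"w0rd", u"w0rd", {u"word": 0.8}],
    # [u"ok", u"ok", None]] scores mean([0.8]) / 2 == 0.4.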

    def __len__(self):
        return len(self.get_orig_line())

    def __str__(self):
        return str(self.tokens) + " | " + str(self.grade)
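

# Minimal end-to-end sketch (assumed workflow; "training.csv" is hypothetical and
# line grading is normally performed by the rest of the denoiser pipeline):
#   text = Text("training.csv")
#   text.read_csv()
#   garbage = text.get_garbage_lines()
#   scores = text.retrieve_text_score()
#   print(scores["score"]["f1"])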