1
|
|
|
"""Utilities for inline denoising |
2
|
|
|
|
3
|
|
|
.. Authors: |
4
|
|
|
Philippe Dessauw |
5
|
|
|
[email protected] |
6
|
|
|
|
7
|
|
|
.. Sponsor: |
8
|
|
|
Alden Dima |
9
|
|
|
[email protected] |
10
|
|
|
Information Systems Group |
11
|
|
|
Software and Systems Division |
12
|
|
|
Information Technology Laboratory |
13
|
|
|
National Institute of Standards and Technology |
14
|
|
|
http://www.nist.gov/itl/ssd/is |
15
|
|
|
""" |
16
|
|
|
from hashlib import md5 |
17
|
|
|
from collections import Counter |
18
|
|
|
from copy import deepcopy |
19
|
|
|
from math import log |
20
|
|
|
from operator import add |
21
|
|
|
from nltk.metrics.distance import edit_distance |
22
|
|
|
import operator |
23
|
|
|
from denoiser.models.inline.hashing import anagram_hash, ocr_key_hash |
24
|
|
|
from denoiser.models.inline.ranking import rate_anagram, rate_ocr_key, rate_bigram |
25
|
|
|
|
26
|
|
|
|
27
|
|
|
def init_correction_map(token, dictionary):
    """Build the initial correction map for a cleaned token.

    Parameters:
        token (:func:`str`): Cleaned token
        dictionary (:func:`dict`): Dictionary structure

    Returns:
        :func:`dict` or :data:`None` - Correction map (`None` for missing
        tokens, `{token: 1}` for trusted tokens, `{}` when corrections are
        needed)
    """
    if token is None:
        return None

    # Very short tokens and dictionary words are considered correct as-is
    is_short = len(token) <= 2
    is_known = token.lower() in dictionary

    return {token: 1} if is_short or is_known else {}
44
|
|
|
|
45
|
|
|
|
46
|
|
|
def generate_alphabet_from_word(word):
    """Generate the anagram hashes of every unigram and bigram in a word.

    Parameters:
        word (:func:`str`): Word to generate hashes for

    Returns:
        set - Set of hashes (always contains 0)
    """
    padded = " " + word + " "  # Pad with spaces to capture boundary bigrams

    grams = list(padded)  # Single characters
    grams.extend(a + b for a, b in zip(padded[:-1], padded[1:]))  # Bigrams

    # Hash every distinct item; 0 stands for "no change"
    hashes = {anagram_hash(gram) for gram in set(grams)}
    hashes.add(0)
    return hashes
60
|
|
|
|
61
|
|
|
|
62
|
|
|
def select_anagrams(token, structures):
    """Select possible anagrams for a given token.

    Candidate hashes are produced by swapping one character (or bigram)
    hash in and out of the token's anagram hash, then matched against the
    precomputed anagram table.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    anagrams = {}
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    # Count how many distinct (added, removed) pairs produce each hash
    hash_counter = Counter(
        token_hash + alpha_hash - focus_hash
        for alpha_hash in structures["alphabet"]
        for focus_hash in focus_alphabet
    )

    # Only hashes present in the anagram table can yield candidates
    retrievable = set(hash_counter).intersection(structures["anagrams"])

    for candidate_hash in retrievable:
        occurences = hash_counter[candidate_hash]

        for anag in structures["anagrams"][candidate_hash]:
            # Discard anagrams too far from the token
            if edit_distance(anag, token) > 3:
                continue

            score = rate_anagram(structures["occurence_map"], token, anag, occurences)
            if score > 0:
                anagrams[anag] = score

    return anagrams
94
|
|
|
|
95
|
|
|
|
96
|
|
|
def select_ocrsims(token, structures):
    """Select similar words for a given token.

    Generates variants of the token's OCR key by shifting one group's
    cardinality by up to ``delta``, then looks those variants up in the
    precomputed OCR-key table.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2  # Maximum cardinality shift applied to a single OCR-key group
    ocr_sims = {}

    # OCR key: presumably a list of (character, cardinality) pairs -- see ocr_key_hash
    word_hash = ocr_key_hash(token)

    sim_hash_list = {}  # Using a dictionary avoid multiple entries if a key is retrieved twice
    key_index = -1

    for key, value in word_hash:
        key_index += 1
        sim_hash = deepcopy(word_hash)  # Fresh copy to mutate one entry at a time

        # Try every non-zero shift of this group's cardinality
        for d in range(-delta, delta+1):
            if d != 0:
                card = max(int(value)+d, 1)  # Cardinality cannot drop below 1

                sim_hash[key_index] = (key, card)

                # Rebuild OCR key string
                sim_hash_str = ""
                for k, v in sim_hash:
                    sim_hash_str += k + str(v)

                if sim_hash_str in structures["ocrkeys"]:
                    card_diff = abs(int(value)-card)

                    # Keep only candidate words close to the token (edit distance <= 2)
                    sim_hash_list[sim_hash_str] = [(sim_word, card_diff)
                                                   for sim_word in structures["ocrkeys"][sim_hash_str]
                                                   if edit_distance(sim_word, token) <= 2]

    # Score every retrieved candidate; keep the strictly positive ones
    for sim_hash_str, sim_list in sim_hash_list.items():
        for sim_word, card_diff in sim_list:
            sim_score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff)

            if sim_score > 0:
                ocr_sims[sim_word] = sim_score

    return ocr_sims
145
|
|
|
|
146
|
|
|
|
147
|
|
|
def truncate_ocr_sim_list(token, ocr_sims_list, limit=10):
    """Truncate the OCR key similarity list to a defined set of possibilities.

    Candidates are kept by decreasing score; ties at the cut-off point are
    broken first by Levenshtein distance to ``token``, then alphabetically.

    Parameters:
        token (:func:`str`): Initial token
        ocr_sims_list (dict): OCR similarities
        limit (int): Final number of similarities

    Returns:
        dict - List of similarities to keep

    Raises:
        IndexError: If the assembled list does not end up with exactly
            ``limit`` entries
    """
    if len(ocr_sims_list) <= limit:
        return ocr_sims_list  # Already small enough: nothing to truncate

    ocr_scores = set([sc for sim, sc in ocr_sims_list.items()])

    # Limit of 10 different scores allowed
    sorted_ocr_scores = sorted(ocr_scores, reverse=True)[:limit]
    ocr_list = []
    for score in sorted_ocr_scores:
        # All candidates sharing the current score
        tmp_ocr_list = [ocr_sims for ocr_sims, ocr_score in ocr_sims_list.items() if ocr_score == score]

        if len(ocr_list) + len(tmp_ocr_list) > limit:
            # Too many candidates at this score: keep only the closest ones
            list_len = limit - len(ocr_list)
            tmp_list = []

            # NOTE(review): select_lower_edit_distance is called on the same
            # unchanged tmp_ocr_list each pass, so repeated passes append the
            # same words again -- confirm duplicates are intended here
            while len(tmp_list) < list_len:
                tmp_list += select_lower_edit_distance(token, tmp_ocr_list)

            if len(ocr_list) + len(tmp_list) == limit:  # Final list has exactly 10 elements
                ocr_list += tmp_list
                break
            else:  # List has more than 10 arguments (need to chose only the n elements needed)
                # Repeatedly extract the best alphabetical word until the
                # remaining quota is filled
                alpha_tmp_list = []

                while len(alpha_tmp_list) != list_len:
                    alpha_word = select_best_alphabetical_word(token, tmp_list)

                    alpha_tmp_list.append(alpha_word)
                    tmp_list = [tkn for tkn in tmp_list if tkn != alpha_word]

                ocr_list += alpha_tmp_list
                break
        elif len(ocr_list) + len(tmp_ocr_list) == limit:
            ocr_list += tmp_ocr_list
            break
        else:  # len(ocr_list) + len(tmp_ocr_list) < limit
            ocr_list += tmp_ocr_list

    if len(ocr_list) != limit:
        raise IndexError("OCR list is still too big ("+str(len(ocr_list))+"/"+str(limit)+")")

    # Re-attach the scores to the retained words
    return {tkn: ocr_sims_list[tkn] for tkn in ocr_list}
200
|
|
|
|
201
|
|
|
|
202
|
|
|
def split_ocr_list(token, ocr_list):
    """Split the OCR list between strong and weak OCR words.

    The five best-scored words are "strong", the rest are "weak". If the
    boundary score is shared by words on both sides, the tie is
    re-arbitrated via :func:`truncate_ocr_sim_list`.

    Parameters:
        token (:func:`str`): Token to correct
        ocr_list (:func:`dict`): List of possible OCR correction

    Returns:
        tuple - Strong OCR words and weak OCR words
    """

    # Build the sorted OCR key list and divide it into 2 different stacks
    # NOTE: dict.iteritems() is Python 2 only
    ocr_words = sorted(
        ocr_list.iteritems(),
        key=operator.itemgetter(1),
        reverse=True
    )
    strong_ocr_words = {tkn: sc for tkn, sc in ocr_words[:5]}
    weak_ocr_words = {tkn: sc for tkn, sc in ocr_words[5:]}

    min_strong_score = min([sc for tkn, sc in strong_ocr_words.items()])
    max_weak_score = max([sc for tkn, sc in weak_ocr_words.items()])

    if min_strong_score == max_weak_score:
        # The boundary score straddles both stacks: gather every word with
        # that score and re-decide which of them stay strong
        strong_tmp_ocr_words = [tkn for tkn, score in strong_ocr_words.items() if score == min_strong_score]
        weak_tmp_ocr_words = [tkn for tkn, score in weak_ocr_words.items() if score == min_strong_score]

        tmp_list = {t: min_strong_score for t in strong_tmp_ocr_words}
        tmp_list.update({t: min_strong_score for t in weak_tmp_ocr_words})

        # Keep as many tied words in the strong stack as were there before
        tmp_strg_list = truncate_ocr_sim_list(token, tmp_list, len(strong_tmp_ocr_words))
        tmp_weak_list = [tkn for tkn in tmp_list.keys() if tkn not in tmp_strg_list]

        # Rebuild both stacks with the re-arbitrated tie members
        strong_ocr_words = {tkn: sc for tkn, sc in strong_ocr_words.items() if tkn not in tmp_list.keys()}
        strong_ocr_words.update(tmp_strg_list)

        weak_ocr_words = {tkn: sc for tkn, sc in weak_ocr_words.items() if tkn not in tmp_list.keys()}
        weak_ocr_words.update({tkn: min_strong_score for tkn in tmp_weak_list})

    return strong_ocr_words, weak_ocr_words
241
|
|
|
|
242
|
|
|
|
243
|
|
|
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    Words found by both methods get the product of the two scores; words
    found only via OCR keys get a default anagram rating.

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    # NOTE: final_list aliases anagrams_list, which is thus modified in place
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        (strong_ocr_list, weak_ocr_list) = split_ocr_list(token, ocr_list)

    # Words present in both lists: combine the two scores.
    # Iterate over a snapshot because entries are deleted inside the loop
    # (deleting while iterating a live dict view raises RuntimeError on
    # Python 3; Python 2's items() already returned a copy).
    for ocr_word, ocr_score in list(strong_ocr_list.items()):
        if ocr_word in final_list.keys():
            final_list[ocr_word] *= ocr_score
            del strong_ocr_list[ocr_word]

    strong_ocr_list.update(weak_ocr_list)

    # Words only found via OCR keys: rate them with a neutral anagram count
    for ocr_word, ocr_score in strong_ocr_list.items():
        if ocr_word not in final_list.keys():
            final_list[ocr_word] = rate_anagram(structures["occurence_map"], token, ocr_word, 1) \
                * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0)

    return final_list
277
|
|
|
|
278
|
|
|
|
279
|
|
|
def find_correct_case(word, case_mode, structures):
    """Select the best case between a set of already encountered cases.

    Candidate spellings are ranked by occurence count, then by edit
    distance to ``word``, then by an alphabetical comparison on ASCII codes.

    Parameters:
        word (:func:`str`): Word to correct
        case_mode (int): Choice between lower or upper case (extra choice for undecisive)
        structures (dict): List of structures needed to perform the choice

    Returns:
        :func:`str` - Corrected word

    Raises:
        ValueError: If the final alphabetical tie-break leaves more than one
            candidate
    """
    # All previously-seen case variants of the word, with occurence counts
    # NOTE: iteritems()/xrange()/keys()[0] below are Python 2 only
    variations = {key: structures["occurence_map"][key] for key in structures["altcase"][word.lower()]}
    variations = sorted(variations.iteritems(), key=operator.itemgetter(1), reverse=True)

    tmp_vars = []
    if case_mode == 0:  # Upper case spelling
        for var in variations:
            _word = var[0]
            # Mostly-uppercase variants (more than 2 capitals)
            if _word[0].isupper() and sum(char.isupper() for char in _word) > 2:
                tmp_vars.append(var)

        if len(tmp_vars) == 0:  # No variant matches the requested case: keep all
            tmp_vars = variations
    elif case_mode == 1:  # Lower case with capital initial
        for var in variations:
            _word = var[0]
            if _word[0].isupper() and sum(char.isupper() for char in _word) <= 2:
                tmp_vars.append(var)

        if len(tmp_vars) == 0:
            tmp_vars = variations
    else:  # case_mode == -1 (no capital letters found)
        tmp_vars = variations

    # Keep only the variants sharing the highest occurence count
    max_occ = tmp_vars[0][1]
    dist_vars = {term: edit_distance(word, term) for term, occ in tmp_vars if occ == max_occ}

    if len(dist_vars) == 1:
        return dist_vars.keys()[0]  # Python 2: keys() is a list

    # Several terms with max occurence still exist
    dist_vars = sorted(dist_vars.iteritems(), key=operator.itemgetter(1))

    min_dist = dist_vars[0][1]
    min_dist_vars = [term for term, dist in dist_vars if dist == min_dist]

    if len(min_dist_vars) == 1:
        return min_dist_vars[0]

    # Several terms with same Levenhstein distance exist
    term_ascii_code = {term: [ord(ch) for ch in term] for term in min_dist_vars}

    for ascii_code in term_ascii_code.values():
        for i in xrange(len(ascii_code)):
            code = ascii_code[i]

            # Non a-zA-Z chars will have a 0 value
            if code < 65 or 90 < code < 97 or code > 122:
                ascii_code[i] = 0

    if case_mode >= 0:
        # Smallest ASCII sequence wins (earliest in case-sensitive order)
        ascii_val = min(term_ascii_code.values())

        t = [t for t, v in term_ascii_code.items() if v == ascii_val]

        if len(t) > 1:
            raise ValueError("Too many value in final array")

        return t[0]
    else:
        # case_mode == -1: largest ASCII sequence wins
        ascii_val = max(term_ascii_code.values())

        t = [t for t, v in term_ascii_code.items() if v == ascii_val]

        if len(t) > 1:
            raise ValueError("Too many value in final array")

        return t[0]
356
|
|
|
|
357
|
|
|
|
358
|
|
|
def correct_case(token, corrections_map, structures):
    """Select the best spelling for a word (case-wise).

    Parameters:
        token (:func:`str`): Cleaned token
        corrections_map (:func:`dict`): Result of `build_candidates_list`
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Corrected tokens (keys) along with their score (values)
    """
    # Decide which casing family the original token belongs to
    if not token[0].isupper():
        case_mode = -1  # Most common variation
    elif sum(char.isupper() for char in token) > 2:
        case_mode = 0  # Upper case variation
    else:
        case_mode = 1  # Lower case variation with capital first letter

    recased_map = {}
    for candidate, score in corrections_map.items():
        if candidate.find(" ") != -1:
            # Split corrections: fix the case of each half independently
            halves = candidate.split(" ")
            left = find_correct_case(halves[0], case_mode, structures)
            right = find_correct_case(halves[1], case_mode, structures)
            recased = left + " " + right
        else:
            recased = find_correct_case(candidate, case_mode, structures)

        # If the key already exists we keep the highest score
        if recased in recased_map.keys():
            recased_map[recased] = max(recased_map[recased], score)
        else:
            recased_map[recased] = score

    return recased_map
395
|
|
|
|
396
|
|
|
|
397
|
|
|
def apply_bigram_boost(paragraph, bigrams, occurence_map):
    """Compute the bigram boost for every token of a paragraph and apply it to the possible corrections.

    Parameters:
        paragraph (:func:`list`): List of lines (each exposing a ``tokens`` attribute)
        bigrams (:func:`list`): Bigrams for the given paragraph
        occurence_map (:func:`dict`): Occurence of unigrams and bigrams
    """
    # Tokens appear to be indexable records: token[0] raw text, token[1]
    # cleaned text (or None), token[2] correction map (or None) -- TODO
    # confirm against the tokenizer that produces them
    token_index = -1

    for line in paragraph:
        for token in line.tokens:
            if token[2] is None:  # Token has no correction candidates
                # NOTE(review): token_index is not advanced for skipped
                # tokens -- confirm bigram alignment is still correct here
                continue

            # Finding adjacent tokens
            adjacent_tokens = []

            # Left neighbour (None at the start of the paragraph)
            if token_index > 0:
                adjacent_tokens.append(bigrams[token_index][0])
            else:
                adjacent_tokens.append(None)

            token_index += 1

            # Right neighbour (None at the end of the paragraph)
            if token_index < len(bigrams):
                adjacent_tokens.append(bigrams[token_index][1])
            else:
                adjacent_tokens.append(None)

            # Normalizing adjacent tokens array: each neighbour becomes a
            # list of lowercase word forms to test bigrams against
            # NOTE: xrange()/iteritems() below are Python 2 only
            for tkn_index in xrange(len(adjacent_tokens)):
                adj_tkn = adjacent_tokens[tkn_index]

                if adj_tkn is None:
                    adjacent_tokens[tkn_index] = []
                    continue

                if not adj_tkn[2] is None:
                    # Neighbour has corrections: keep its 5 best candidates
                    adjacent_tokens[tkn_index] = []
                    adj_tkn = sorted(adj_tkn[2].iteritems(), key=operator.itemgetter(1), reverse=True)

                    for idx in xrange(min(5, len(adj_tkn))):
                        adjacent_tokens[tkn_index].append(adj_tkn[idx][0].lower())
                else:
                    # No corrections: use the cleaned text if any, else the raw text
                    if not adj_tkn[1] is None:
                        adjacent_tokens[tkn_index] = [adj_tkn[1].lower()]
                    else:
                        adjacent_tokens[tkn_index] = [adj_tkn[0].lower()]

            # Computing bigram boost and applying it to every correction score
            for correction in token[2].keys():
                bigram_boost = rate_bigram(correction.lower(), adjacent_tokens[0], adjacent_tokens[1], occurence_map)
                token[2][correction] *= bigram_boost
451
|
|
|
|
452
|
|
|
|
453
|
|
|
def select_lower_edit_distance(ref_word, word_list):
    """Return the proposals closest to the reference word (Levenshtein-wise).

    Parameters:
        ref_word (:func:`str`): Word to correct
        word_list (list): List of proposals

    Returns:
        list - Proposals sharing the minimal edit distance
    """
    distances = {proposal: edit_distance(ref_word, proposal) for proposal in word_list}
    best_distance = min(distances.values())

    # Every proposal tied at the minimal distance is kept
    return [proposal for proposal, distance in distances.items() if distance == best_distance]
467
|
|
|
|
468
|
|
|
|
469
|
|
|
def select_by_hash(word_list):
    """Select the word with the lower md5 hash.

    Used as a deterministic, arbitrary tie-breaker between otherwise
    equivalent proposals.

    Parameters:
        word_list (list): List of proposal

    Returns:
        :func:`str` - Selected word

    Raises:
        Exception: If two proposals cannot be told apart by their digest
    """
    # Encode explicitly: hashlib requires bytes (md5(str) raises TypeError
    # on Python 3; on Python 2 ASCII str hashed to the same digests)
    digests = {word: md5(word.encode("utf-8")).hexdigest() for word in word_list}

    if len(set(digests.values())) != len(word_list):
        raise Exception("differenciation impossible")

    # Smallest hex digest wins
    return min(word_list, key=lambda word: digests[word])
484
|
|
|
|
485
|
|
|
|
486
|
|
|
def select_best_alphabetical_word(ref_word, word_list):
    """Select closest alphabetical word (non alphanumerical chars are set to the same value).

    Words are compared by their sequences of ASCII codes, with every
    character outside a-zA-Z zeroed out so that punctuation and digits all
    compare equal. A capitalized reference word picks the greatest
    sequence, any other picks the smallest.

    Parameters:
        ref_word (:func:`str`): Word to correct
        word_list (list): List of propositions

    Returns:
        :func:`str` - Selected word
    """
    case_mode = -1 if ref_word[0].isupper() else 0

    term_ascii_code = {term: [ord(ch) for ch in term] for term in word_list}

    # range() instead of Python-2-only xrange(); works on both versions
    for ascii_code in term_ascii_code.values():
        for i in range(len(ascii_code)):
            code = ascii_code[i]

            # Non a-zA-Z chars will have a 0 value
            if code < 65 or 90 < code < 97 or code > 122:
                ascii_code[i] = 0

    # Single selection rule replaces the former duplicated min/max branches
    select = min if case_mode >= 0 else max
    ascii_val = select(term_ascii_code.values())

    tkn_list = [t for t, v in term_ascii_code.items() if v == ascii_val]

    if len(tkn_list) > 1:
        # Identical zeroed sequences (words differing only in non-letters):
        # fall back to a deterministic hash-based tie-break
        return select_by_hash(tkn_list)

    return tkn_list[0]
525
|
|
|
|
526
|
|
|
|
527
|
|
|
def select_correction(word, corrections_map):
    """Select the best correction for a word given its score.

    Keeps the single best-scored correction when its score is high enough
    (> 0.7) or clearly dominates the runner-up; otherwise keeps the two
    best candidates. Ties are broken by edit distance to ``word``, then
    alphabetically.

    Parameters:
        word (str): Word to select a correction for.
        corrections_map (:func:`dict`): Dictionary containing all corrections for a token along with their score

    Returns:
        :func:`dict` - Chosen correction(s) along with their score
    """
    if corrections_map is None or len(corrections_map) == 1:
        return corrections_map  # Nothing to choose between

    # Candidates holding the maximum score
    max_val = max(corrections_map.values())
    final_list = {term: val for term, val in corrections_map.items() if val == max_val}

    if len(final_list) == 1:  # One value has the maximum
        # NOTE: dict.values()[0]/keys()[0] below are Python 2 only
        if final_list.values()[0] > 0.7:  # Highly valued terms are chosen by default
            return final_list

        first_word = final_list.keys()[0]

        # If the threshold value has not been reached we are looking for a second term
        # NOTE(review): this mutates the caller's corrections_map in place
        del corrections_map[final_list.keys()[0]]

        max_val = max(corrections_map.values())
        tmp_list = {term: val for term, val in corrections_map.items() if val == max_val}

        if len(tmp_list) == 1:  # One value has the second higher grade
            final_list.update(tmp_list)
            second_word = tmp_list.keys()[0]
        else:  # Several terms with the same score
            # Differenciation on the Levenhstein distance
            tmp_list = select_lower_edit_distance(word, tmp_list.keys())

            if len(tmp_list) == 1:  # One term has the lowest score
                final_list[tmp_list[0]] = max_val
                second_word = tmp_list[0]
            else:  # Several terms with the same
                # Choose the best alphabetical term
                second_word = select_best_alphabetical_word(word, tmp_list)
                final_list[second_word] = max_val

        # Determine if we need one or two terms: drop the runner-up when the
        # best score dominates it by a factor of e or more
        # NOTE(review): on Python 2, integer scores would floor-divide here
        if log(final_list[first_word] / final_list[second_word]) >= 1:
            del final_list[second_word]

        return final_list
    elif len(final_list) != 2:  # More than 2 values share the same maximum
        tmp_list = select_lower_edit_distance(word, final_list.keys())

        if len(tmp_list) == 1:  # One word get the min edit distance
            first_word = tmp_list[0]
            # NOTE(review): tmp_final_list aliases final_list (no copy made)
            tmp_final_list = final_list
            del tmp_final_list[first_word]

            tmp_list = select_lower_edit_distance(word, tmp_final_list.keys())

            if len(tmp_list) == 1:  # One word get the second minimal edit distance
                final_list = {
                    first_word: max_val,
                    tmp_list[0]: max_val
                }

                return final_list
            else:  # The second minimal edit distance is shared by several terms
                best_term = select_best_alphabetical_word(word, tmp_list)

                final_list = {
                    first_word: max_val,
                    best_term: max_val
                }

                return final_list
        elif len(tmp_list) == 2:  # Exactly two word get the same min edit distance
            final_list = {
                tmp_list[0]: max_val,
                tmp_list[1]: max_val
            }

            return final_list
        else:  # More than two words share the min edit distance
            best_term_1 = select_best_alphabetical_word(word, tmp_list)

            tmp_list = [term for term in tmp_list if term != best_term_1]
            best_term_2 = select_best_alphabetical_word(word, tmp_list)

            final_list = {
                best_term_1: max_val,
                best_term_2: max_val
            }

            return final_list
    else:  # Two words with the same score
        return final_list
622
|
|
|
|
623
|
|
|
|
624
|
|
|
def extract_paragraph_bigrams(paragraph):
    """Get bigrams for a given paragraph.

    Parameters:
        paragraph (list): Paragraph to extract bigrams from (list of lines,
            each exposing a ``tokens`` attribute)

    Returns:
        list - Bigram list (consecutive token pairs across line boundaries)
    """
    # Flatten the paragraph into a single token stream
    p_tokens = [token for line in paragraph for token in line.tokens]

    # Pair each token with its successor; replaces the former index-based
    # xrange() loop, which was Python 2 only
    return list(zip(p_tokens, p_tokens[1:]))
640
|
|
|
|