InlineStructure (rating: A)

Complexity:        Total Complexity 6
Size/Duplication:  Total Lines 34, Duplicated Lines 0%
Importance:        Changes 2 (Bugs 0, Features 2)

Metric  Value
c       2
b       0
f       2
dl      0
loc     34
rs      10
wmc     6

4 Methods

Rating  Name           Duplication  Size  Complexity
A       save()         0            7     1
A       load()         0            5     2
A       __init__()     0            5     2
A       append_data()  0            10    1

"""Package containing all the main inline structures

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division

import inspect
import operator
import re
from collections import Counter
from math import floor
from operator import add
from os.path import exists

from nltk.util import ngrams as nltk_ngrams
from numpy import median

from apputils.pickling import load, save
from denoiser.models.inline.hashing import anagram_hash, ocr_key_hash, ocr_key_list_to_str


def truncate_map(occurrence_map):
    """Truncate an occurrence map by removing its most uncommon entries

    Args:
        occurrence_map (dict): Dictionary containing words as keys and occurrence counts as values

    Returns:
        dict: Truncated map
    """
    # Get the occurrence distribution (occurrence count -> number of words sharing it)
    distribution = Counter(occurrence_map.values())
    dist_median = median(distribution.values())

    # Compute the upper bound
    limit = 0.99
    dist_upper_median = sorted([v for v in distribution.values() if v > dist_median])

    if len(dist_upper_median) == 0:  # Degenerate distribution, nothing lies above the median
        return occurrence_map

    dist_upper_bound = int(floor(len(dist_upper_median) * limit))

    # Compute the new distribution
    min_dist_value = dist_upper_median[dist_upper_bound - 1]
    distribution = {k: v for k, v in distribution.items() if v <= min_dist_value}

    # Return the new occurrence map
    return {k: v for k, v in occurrence_map.items() if v in distribution.keys()}
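
# Illustrative sketch (added annotation, not part of the original module):
# tracing truncate_map on a small Zipf-like occurrence map in which 50 words
# occur once, 10 words occur twice, 4 words occur three times, 2 words occur
# four times and 1 word occurs five times.
#
#   distribution      = {1: 50, 2: 10, 3: 4, 4: 2, 5: 1}
#   dist_median       = median([50, 10, 4, 2, 1]) = 4.0
#   dist_upper_median = [10, 50]
#   dist_upper_bound  = int(floor(2 * 0.99)) = 1
#   min_dist_value    = dist_upper_median[0] = 10
#
# Occurrence value 1 is shared by 50 > 10 words, so every hapax is dropped;
# the 17 words occurring two or more times survive.
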

class InlineStructure(object):
    """Abstract inline structure
    """

    def __init__(self, filename):
        self.filename = filename

        if exists(self.filename):
            self.load()

    def append_data(self, **kwargs):
        """Append data to the structure

        Args:
            **kwargs: Arbitrary keyword arguments

        Raises:
            NotImplementedError: Not yet implemented
        """
        raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented")

    def load(self):
        """Load the structure from the file if it exists
        """
        if not exists(self.filename):
            return

    def save(self):
        """Save the structure to the file

        Raises:
            NotImplementedError: Not yet implemented
        """
        raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented")
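
# Usage sketch (hypothetical subclass, added for illustration): the base
# class auto-loads from `filename` inside __init__ when the pickle already
# exists, so concrete subclasses must initialise their own fields *before*
# delegating to super(), exactly as the classes below do.
#
#   class WordSet(InlineStructure):
#       def __init__(self, filename):
#           self.words = set()                       # fields first...
#           super(WordSet, self).__init__(filename)  # ...then maybe load()
#
#       def append_data(self, words):
#           self.words.update(words)
#           self.save()
#
#       def load(self):
#           super(WordSet, self).load()
#           self.words = load(self.filename)
#
#       def save(self):
#           save(self.words, self.filename)
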

class NGramsStructure(InlineStructure):
    """Abstract n-gram structure
    """

    def __init__(self, filename):
        self.ngrams = Counter()
        self.ngrams_pruned = Counter()

        super(NGramsStructure, self).__init__(filename)

    def append_data(self, **kwargs):
        raise NotImplementedError("Function "+inspect.stack()[0][3]+" has not been implemented")

    def prune(self, rate):
        """Prune the n-gram list given the rate of data to keep

        Args:
            rate (float): Limit rate of data to keep
        """
        if rate >= 1:
            self.ngrams_pruned = self.ngrams
            return

        pruned_target = {}

        truncated_target = truncate_map(self.ngrams)
        sorted_target = sorted(truncated_target.iteritems(), key=operator.itemgetter(1), reverse=True)

        total = len(sorted_target)
        registered = 0
        current_occ = 0
        for (data, occurrence) in sorted_target:
            # Stop once the keep-rate is reached, but never split a tie on the occurrence value
            if registered / total >= rate and occurrence != current_occ:
                break

            current_occ = occurrence
            pruned_target[data] = occurrence
            registered += 1

        self.ngrams_pruned = Counter(pruned_target)

    def load(self):
        super(NGramsStructure, self).load()

    def save(self):
        super(NGramsStructure, self).save()
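
# Illustrative sketch (assumed values, added for illustration): prune() with
# rate=0.5 on five n-grams left by truncate_map, sorted by occurrence:
#
#   [("a b", 9), ("c d", 7), ("e f", 7), ("g h", 7), ("i j", 2)]
#
# "a b", "c d" and "e f" are kept while registered/total < 0.5; "g h" is
# also kept because it ties with the current occurrence value 7; "i j"
# finally triggers the break. Four of the five entries survive a 50% rate.
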

class Dictionary(InlineStructure):
    """Dictionary
    """

    def __init__(self, filename):
        self.dictionary = list()

        super(Dictionary, self).__init__(filename)

    def append_data(self, unigrams):
        word_list = []

        aspell_dict = "models/aspell.en.dict"
        with open(aspell_dict, "r") as f:
            for line in f:
                word_list.append(line.strip("\r\n"))

        plc_set = set(unigrams)
        word_set = set(word_list)

        # Keep only the unigrams that also appear in the Aspell word list
        self.dictionary = list(plc_set.intersection(word_set))
        self.save()

    def load(self):
        super(Dictionary, self).load()

        self.dictionary = load(self.filename)

    def save(self):
        save(self.dictionary, self.filename)
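
# Usage sketch (hypothetical path and counts, added for illustration),
# assuming "models/aspell.en.dict" lists one word per line and contains
# "the" but not the OCR error "teh":
#
#   dictionary = Dictionary("cache/dictionary.pickle")
#   dictionary.append_data(Counter({"the": 40, "teh": 3}))
#   dictionary.dictionary   # -> ["the"]
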

class Unigrams(NGramsStructure):
    """Unigrams list
    """

    def __init__(self, filename):
        self.raw_unigrams = Counter()  # Unigrams not submitted to case modification

        super(Unigrams, self).__init__(filename)

    def append_data(self, text_data):
        # Keep tokens longer than one character from lines with a non-zero grade
        unigrams = [token[1] for paragraph in text_data.text for line in paragraph for token in line.tokens
                    if line.grade != 0 and token[1] is not None and len(token[1]) > 1]

        unigrams_counter = Counter(unigrams)
        self.raw_unigrams += unigrams_counter

        self.save()
        return unigrams

    def generate_low_case(self, altcase_map):
        """Generate lowercase unigrams

        Args:
            altcase_map (dict): List of alternative-case words for a given lowercase word
        """
        low_unigrams = {key: 0 for key in altcase_map.keys()}

        for unigram, alt_case_list in altcase_map.items():
            # A lowercase unigram counts every one of its alternative-case spellings
            low_unigrams[unigram] = sum([self.raw_unigrams[alt_case] for alt_case in alt_case_list])

        self.ngrams = Counter(low_unigrams)
        self.save()

    def load(self):
        super(Unigrams, self).load()

        data = load(self.filename)

        self.raw_unigrams = data["raw_unigrams"]
        self.ngrams = data["unigrams"]
        self.ngrams_pruned = data["unigrams_pruned"]

    def save(self):
        data = {
            "raw_unigrams": self.raw_unigrams,
            "unigrams": self.ngrams,
            "unigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
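
# Illustrative sketch (assumed counts, added for illustration): folding case
# with generate_low_case. With
#
#   raw_unigrams = Counter({"The": 7, "the": 30, "THE": 1})
#   altcase_map  = {"the": ["The", "the", "THE"]}
#
# the lowercase counter self.ngrams becomes Counter({"the": 38}).
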

class Bigrams(NGramsStructure):
    """Bigrams list
    """

    def __init__(self, filename):
        super(Bigrams, self).__init__(filename)

    def append_data(self, unigrams):
        # Build lowercase "w1 w2" pairs, skipping single-character tokens
        bigrams = [bigram[0].lower()+" "+bigram[1].lower() for bigram in nltk_ngrams(unigrams, 2)
                   if len(bigram[0]) > 1 and len(bigram[1]) > 1]

        self.ngrams += Counter(bigrams)
        self.prune(0.35)

        self.save()

    def load(self):
        super(Bigrams, self).load()

        data = load(self.filename)

        self.ngrams = data["bigrams"]
        self.ngrams_pruned = data["bigrams_pruned"]

    def save(self):
        data = {
            "bigrams": self.ngrams,
            "bigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
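
# Illustrative sketch (assumed tokens, added for illustration): append_data
# on the unigrams ["The", "Iron", "I", "Curtain"] first yields the pairs
# ("The", "Iron"), ("Iron", "I") and ("I", "Curtain"); the pairs containing
# the single-letter token "I" are filtered out, leaving Counter({"the iron": 1}).
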

class AltCaseMap(InlineStructure):
    """Alternative case map
    """

    def __init__(self, filename):
        self.altcase_map = {}
        self.altcase_pruned_map = {}

        super(AltCaseMap, self).__init__(filename)

    def append_data(self, unigrams):
        # Group every unigram under its lowercase form
        _altcase_map = {unigram.lower(): set() for unigram in unigrams.keys()}

        for unigram in unigrams.keys():
            _altcase_map[unigram.lower()].add(unigram)

        self.altcase_map = {key: set(value) for key, value in _altcase_map.items()}
        self.save()

    def prune(self, unigrams_pruned):
        """Prune the map given the selected unigrams

        Args:
            unigrams_pruned (dict): List of unigrams to keep in the final list
        """
        self.altcase_pruned_map = {unigram: self.altcase_map[unigram] for unigram in unigrams_pruned.keys()}
        self.save()

    def load(self):
        super(AltCaseMap, self).load()

        data = load(self.filename)

        self.altcase_map = data["altcase"]
        self.altcase_pruned_map = data["altcase_pruned"]

    def save(self):
        data = {
            "altcase": self.altcase_map,
            "altcase_pruned": self.altcase_pruned_map
        }

        save(data, self.filename)
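
# Illustrative sketch (assumed counts, added for illustration): append_data
# on Counter({"The": 7, "the": 30, "NIST": 4}) groups spellings by their
# lowercase form:
#
#   altcase_map = {"the": set(["The", "the"]), "nist": set(["NIST"])}
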

class OcrKeyMap(InlineStructure):
    """OCR Key map
    """

    def __init__(self, filename):
        self.ocrkey_map = {}

        super(OcrKeyMap, self).__init__(filename)

    def append_data(self, unigrams):
        word_list = []

        aspell_dict = "models/aspell.en.dict"
        with open(aspell_dict, "r") as f:
            for line in f:
                word_list.append(line.strip("\r\n"))

        word_set = set(word_list)
        unigram_set = set(unigrams.keys())

        ocr_key_map = {ocr_key_list_to_str(ocr_key_hash(word)): set() for word in unigram_set.intersection(word_set)}

        # Every word contained in both the unigram map and the dictionary
        for word in unigram_set.intersection(word_set):
            h_list = ocr_key_hash(word)
            h_str = ocr_key_list_to_str(h_list)

            ocr_key_map[h_str].add(word)  # Add the word to its OCR-key bucket

        # Merge the new map into the existing one, key by key
        combine_struct = {key: set() for key in self.ocrkey_map.keys() + ocr_key_map.keys()}

        for key, value in self.ocrkey_map.items() + ocr_key_map.items():
            combine_struct[key] = combine_struct[key].union(value)

        self.ocrkey_map = combine_struct
        self.save()

    def load(self):
        super(OcrKeyMap, self).load()

        self.ocrkey_map = load(self.filename)

    def save(self):
        save(self.ocrkey_map, self.filename)
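
# Note on the merge above (placeholder keys, added for illustration): because
# combine_struct unions the stored map with the freshly computed one,
# repeated append_data calls accumulate words per OCR-key signature instead
# of overwriting them:
#
#   {"K1": set(["word"])}  merged with  {"K1": set(["ward"]), "K2": set(["cord"])}
#   -> {"K1": set(["word", "ward"]), "K2": set(["cord"])}
#
# "K1"/"K2" stand in for real signatures produced by ocr_key_hash.
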

class AnagramMap(InlineStructure):
    """Anagram map
    """

    def __init__(self, filename):
        self.anagram_hashmap = {}
        self.anagram_alphabet = set()

        super(AnagramMap, self).__init__(filename)

    def append_data(self, bigrams, unigrams):
        # Bucket every unigram and bigram under its anagram hash
        anaghash_map = {anagram_hash(word): set() for word in bigrams.keys() + unigrams.keys()}

        for word in bigrams.keys() + unigrams.keys():
            anaghash_map[anagram_hash(word)].add(word)

        self.anagram_hashmap = anaghash_map

        clean_word = re.compile(r"^[a-zA-Z '-]+$")
        alphabet = set()

        for word in unigrams:
            word = " "+word+" "
            chars = [char for char in word]  # Getting letters from the word
            chars += map(add, chars[:-1], chars[1:])  # Adding character bigrams to the list

            alphabet = alphabet.union([anagram_hash(char) for char in set(chars)
                                       if clean_word.match(char) is not None])

        alphabet.add(0)

        self.anagram_alphabet = alphabet
        self.save()

    def load(self):
        super(AnagramMap, self).load()

        data = load(self.filename)

        self.anagram_hashmap = data["hashmap"]
        self.anagram_alphabet = data["alphabet"]

    def save(self):
        data = {
            "hashmap": self.anagram_hashmap,
            "alphabet": self.anagram_alphabet
        }

        save(data, self.filename)
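
# Illustrative sketch (added for illustration): the character inventory built
# by append_data for the unigram "cat". The word is padded with spaces, split
# into single characters plus adjacent character bigrams, and every piece
# matching clean_word is hashed into the alphabet:
#
#   word  -> " cat "
#   chars -> [' ', 'c', 'a', 't', ' '] + [' c', 'ca', 'at', 't ']
#
# The hash values themselves come from anagram_hash in
# denoiser.models.inline.hashing.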