"""Package containing all the main inline structures

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
from math import floor
from numpy import median
from collections import Counter
import inspect
from os.path import exists
import operator
from nltk.util import ngrams as nltk_ngrams
from denoiser.models.inline.hashing import ocr_key_list_to_str, ocr_key_hash, anagram_hash
from apputils.pickling import load, save
import re
from operator import add


def truncate_map(occurence_map):
    """Truncate an occurrence map by removing uncommon items

    Args:
        occurence_map (dict): Dictionary mapping a word to its number of occurrences

    Returns:
        dict: Truncated map
    """
    # Get the distribution of occurrence counts (occurrence value -> number of words)
    distribution = Counter(occurence_map.values())
    dist_median = median(distribution.values())

    # Compute the upper bound of the distribution
    limit = 0.99
    dist_upper_median = sorted([v for v in distribution.values() if v > dist_median])
    dist_upper_bound = int(floor(len(dist_upper_median) * limit))

    # Compute the new distribution, dropping the frequency classes that gather
    # an unusually large number of words (typically the rarest words)
    min_dist_value = dist_upper_median[dist_upper_bound - 1]
    distribution = {k: v for k, v in distribution.items() if v <= min_dist_value}

    # Return the new occurrence map
    return {k: v for k, v in occurence_map.items() if v in distribution.keys()}
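

# A worked sketch of the truncation (hypothetical counts): suppose the map
# holds 1 word occurring 50 times, 1 word occurring 40 times, 5 words
# occurring twice and 20 words occurring once. The frequency-class sizes are
# {50: 1, 40: 1, 2: 5, 1: 20}; their median is 3, the sizes above the median
# are [5, 20] and the 99% bound selects 5. The hapax class (size 20) exceeds
# the bound, so the 20 one-occurrence words are removed and all others kept.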


class InlineStructure(object):
    """Abstract inline structure
    """

    def __init__(self, filename):
        self.filename = filename

        if exists(self.filename):
            self.load()

    def append_data(self, **kwargs):
        """Append data to the structure

        Args:
            **kwargs: Arbitrary keyword arguments

        Raises:
            NotImplementedError: Not yet implemented
        """
        raise NotImplementedError("Function " + inspect.stack()[0][3] + " has not been implemented")

    def load(self):
        """Load the structure from the file if it exists
        """
        if not exists(self.filename):
            return

    def save(self):
        """Save the structure to the file

        Raises:
            NotImplementedError: Not yet implemented
        """
        raise NotImplementedError("Function " + inspect.stack()[0][3] + " has not been implemented")
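

# Subclasses are expected to override append_data() and save(), and to extend
# load(); a minimal sketch of the contract (names are illustrative):
#
#     class WordList(InlineStructure):
#         def append_data(self, words):
#             self.words = list(words)
#             self.save()
#
#         def load(self):
#             super(WordList, self).load()
#             self.words = load(self.filename)
#
#         def save(self):
#             save(self.words, self.filename)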


class NGramsStructure(InlineStructure):
    """Abstract n-gram structure
    """

    def __init__(self, filename):
        self.ngrams = Counter()
        self.ngrams_pruned = Counter()

        super(NGramsStructure, self).__init__(filename)

    def append_data(self, **kwargs):
        raise NotImplementedError("Function " + inspect.stack()[0][3] + " has not been implemented")

    def prune(self, rate):
        """Prune the n-gram list given the rate of data to keep

        Args:
            rate (float): Rate of data to keep, between 0 and 1
        """
        if rate >= 1:  # Nothing to prune
            self.ngrams_pruned = self.ngrams
            return

        pruned_target = {}

        truncated_target = truncate_map(self.ngrams)
        sorted_target = sorted(truncated_target.iteritems(), key=operator.itemgetter(1), reverse=True)

        total = len(sorted_target)
        registered = 0
        current_occ = 0

        for (data, occurence) in sorted_target:
            # Stop once the keep rate is reached, but never split a run of
            # n-grams sharing the same occurrence count
            if registered / total >= rate and occurence != current_occ:
                break

            current_occ = occurence
            pruned_target[data] = occurence
            registered += 1

        self.ngrams_pruned = Counter(pruned_target)

    def load(self):
        super(NGramsStructure, self).load()

    def save(self):
        super(NGramsStructure, self).save()
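

# Pruning sketch (hypothetical counts): with a truncated list
#     [("a b", 9), ("c d", 7), ("e f", 7), ("g h", 1)]
# prune(0.5) keeps "a b" and "c d" to reach the 50% rate, then also keeps
# "e f" because it is tied with "c d": the loop only breaks once the rate is
# reached *and* the occurrence value changes, so ties are never split.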


class Dictionary(InlineStructure):
    """Dictionary of corpus words found in the Aspell English word list
    """

    def __init__(self, filename):
        self.dictionary = list()

        super(Dictionary, self).__init__(filename)

    def append_data(self, unigrams):
        # Read the reference word list (Aspell English dictionary)
        word_list = []

        aspell_dict = "models/aspell.en.dict"
        with open(aspell_dict, "r") as f:
            for line in f:
                word_list.append(line.strip("\r\n"))

        plc_set = set(unigrams)
        word_set = set(word_list)

        # Keep only the unigrams that belong to the reference dictionary
        self.dictionary = list(plc_set.intersection(word_set))
        self.save()

    def load(self):
        super(Dictionary, self).load()

        self.dictionary = load(self.filename)

    def save(self):
        save(self.dictionary, self.filename)
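

# Typical usage (illustrative filenames; requires models/aspell.en.dict):
#
#     dictionary = Dictionary("models/dictionary.pickle")
#     dictionary.append_data(["the", "quick", "zzyx"])
#     # dictionary.dictionary now holds only the tokens that also appear in
#     # the Aspell word list, e.g. ["the", "quick"]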


class Unigrams(NGramsStructure):
    """Unigrams list
    """

    def __init__(self, filename):
        self.raw_unigrams = Counter()  # Unigrams not submitted to case modification

        super(Unigrams, self).__init__(filename)

    def append_data(self, text_data):
        # Collect tokens from lines with a non-zero grade, skipping empty and
        # one-character tokens
        unigrams = [token[1] for paragraph in text_data.text for line in paragraph for token in line.tokens
                    if line.grade != 0 and token[1] is not None and len(token[1]) > 1]

        unigrams_counter = Counter(unigrams)
        self.raw_unigrams += unigrams_counter

        self.save()
        return unigrams

    def generate_low_case(self, altcase_map):
        """Generate lowercase unigrams

        Args:
            altcase_map (dict): Set of alternative-case words for a given lowercase word
        """
        low_unigrams = {key: 0 for key in altcase_map.keys()}

        # Fold the counts of all case variants into the lowercase form
        for unigram, alt_case_list in altcase_map.items():
            low_unigrams[unigram] = sum([self.raw_unigrams[alt_case] for alt_case in alt_case_list])

        self.ngrams = Counter(low_unigrams)
        self.save()

    def load(self):
        super(Unigrams, self).load()

        data = load(self.filename)

        self.raw_unigrams = data["raw_unigrams"]
        self.ngrams = data["unigrams"]
        self.ngrams_pruned = data["unigrams_pruned"]

    def save(self):
        data = {
            "raw_unigrams": self.raw_unigrams,
            "unigrams": self.ngrams,
            "unigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
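

# Case-folding sketch: with raw_unigrams = Counter({"The": 4, "the": 10,
# "THE": 1}) and altcase_map = {"the": {"The", "the", "THE"}}, calling
# generate_low_case(altcase_map) sets ngrams = Counter({"the": 15}), i.e. the
# counts of all case variants are folded into the lowercase form.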


class Bigrams(NGramsStructure):
    """Bigrams list
    """

    def __init__(self, filename):
        super(Bigrams, self).__init__(filename)

    def append_data(self, unigrams):
        # Build lowercase bigrams, skipping one-character tokens
        bigrams = [bigram[0].lower() + " " + bigram[1].lower() for bigram in nltk_ngrams(unigrams, 2)
                   if len(bigram[0]) > 1 and len(bigram[1]) > 1]

        self.ngrams += Counter(bigrams)
        self.prune(0.35)  # Keep roughly the top 35% of the bigram list

        self.save()

    def load(self):
        super(Bigrams, self).load()

        data = load(self.filename)

        self.ngrams = data["bigrams"]
        self.ngrams_pruned = data["bigrams_pruned"]

    def save(self):
        data = {
            "bigrams": self.ngrams,
            "bigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
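

# Bigram extraction sketch: append_data(["The", "quick", "fox"]) adds the
# lowercased bigrams "the quick" and "quick fox" to the counter before
# pruning; one-character tokens never contribute to a bigram.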


class AltCaseMap(InlineStructure):
    """Alternative case map
    """

    def __init__(self, filename):
        self.altcase_map = {}
        self.altcase_pruned_map = {}

        super(AltCaseMap, self).__init__(filename)

    def append_data(self, unigrams):
        # Group every unigram under its lowercase form
        _altcase_map = {unigram.lower(): set() for unigram in unigrams.keys()}

        for unigram in unigrams.keys():
            _altcase_map[unigram.lower()].add(unigram)

        self.altcase_map = {key: set(value) for key, value in _altcase_map.items()}
        self.save()

    def prune(self, unigrams_pruned):
        """Prune the map given the selected unigrams

        Args:
            unigrams_pruned (dict): Unigrams to keep in the final list
        """
        self.altcase_pruned_map = {unigram: self.altcase_map[unigram] for unigram in unigrams_pruned.keys()}
        self.save()

    def load(self):
        super(AltCaseMap, self).load()

        data = load(self.filename)

        self.altcase_map = data["altcase"]
        self.altcase_pruned_map = data["altcase_pruned"]

    def save(self):
        data = {
            "altcase": self.altcase_map,
            "altcase_pruned": self.altcase_pruned_map
        }

        save(data, self.filename)
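

# Alternative-case sketch: append_data(Counter({"The": 3, "the": 9,
# "NIST": 2})) produces altcase_map = {"the": {"The", "the"}, "nist": {"NIST"}},
# mapping each lowercase form to every casing observed in the corpus.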


class OcrKeyMap(InlineStructure):
    """OCR key map
    """

    def __init__(self, filename):
        self.ocrkey_map = {}

        super(OcrKeyMap, self).__init__(filename)

    def append_data(self, unigrams):
        # Read the reference word list (Aspell English dictionary)
        word_list = []

        aspell_dict = "models/aspell.en.dict"
        with open(aspell_dict, "r") as f:
            for line in f:
                word_list.append(line.strip("\r\n"))

        word_set = set(word_list)
        unigram_set = set(unigrams.keys())

        ocr_key_map = {ocr_key_list_to_str(ocr_key_hash(word)): set() for word in unigram_set.intersection(word_set)}

        # Group every word contained in both the unigram list and the dictionary
        # under its OCR key
        for word in unigram_set.intersection(word_set):
            h_list = ocr_key_hash(word)
            h_str = ocr_key_list_to_str(h_list)

            ocr_key_map[h_str].add(word)  # Add the word to the map

        # Merge the new map with the existing one
        combine_struct = {key: set() for key in self.ocrkey_map.keys() + ocr_key_map.keys()}

        for key, value in self.ocrkey_map.items() + ocr_key_map.items():
            combine_struct[key] = combine_struct[key].union(value)

        self.ocrkey_map = combine_struct
        self.save()

    def load(self):
        super(OcrKeyMap, self).load()

        self.ocrkey_map = load(self.filename)

    def save(self):
        save(self.ocrkey_map, self.filename)
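

# OCR-key sketch (hash values depend on ocr_key_hash and are not shown here):
# every word present in both the unigram list and the Aspell word list is
# grouped under the string form of its OCR key, so that words with similar
# shapes, which OCR engines tend to confuse, land in the same bucket:
#
#     ocrkey_map = {"<ocr key>": {"dean", "clean", ...}, ...}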


class AnagramMap(InlineStructure):
    """Anagram map
    """

    def __init__(self, filename):
        self.anagram_hashmap = {}
        self.anagram_alphabet = set()

        super(AnagramMap, self).__init__(filename)

    def append_data(self, bigrams, unigrams):
        # Group every bigram and unigram under its anagram hash
        anaghash_map = {anagram_hash(word): set() for word in bigrams.keys() + unigrams.keys()}

        for word in bigrams.keys() + unigrams.keys():
            anaghash_map[anagram_hash(word)].add(word)

        self.anagram_hashmap = anaghash_map

        # Build the alphabet of hashes for every clean character and character
        # bigram found in the unigrams
        clean_word = re.compile(r"^[a-zA-Z '-]+$")
        alphabet = set()

        for word in unigrams:
            word = " " + word + " "  # Pad the word with boundary spaces
            chars = [char for char in word]  # Getting letters from the word
            chars += map(add, chars[:-1], chars[1:])  # Adding character bigrams to the list

            alphabet = alphabet.union([anagram_hash(char) for char in set(chars)
                                       if clean_word.match(char) is not None])

        alphabet.add(0)

        self.anagram_alphabet = alphabet
        self.save()

    def load(self):
        super(AnagramMap, self).load()

        data = load(self.filename)

        self.anagram_hashmap = data["hashmap"]
        self.anagram_alphabet = data["alphabet"]

    def save(self):
        data = {
            "hashmap": self.anagram_hashmap,
            "alphabet": self.anagram_alphabet
        }

        save(data, self.filename)
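

# End-to-end sketch (illustrative; the actual orchestration lives outside
# this module, and the 0.35 rate below is hypothetical):
#
#     unigrams = Unigrams("models/unigrams.pickle")
#     tokens = unigrams.append_data(text_data)          # raw token counts
#     altcase = AltCaseMap("models/altcase.pickle")
#     altcase.append_data(unigrams.raw_unigrams)        # group case variants
#     unigrams.generate_low_case(altcase.altcase_map)   # fold counts
#     unigrams.prune(0.35)
#     altcase.prune(unigrams.ngrams_pruned)
#     bigrams = Bigrams("models/bigrams.pickle")
#     bigrams.append_data(tokens)
#     anagrams = AnagramMap("models/anagrams.pickle")
#     anagrams.append_data(bigrams.ngrams_pruned, unigrams.ngrams_pruned)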