Text.retrieve_text_score() - Code Metrics - usnistgov/ocr-pipeline - Measure and Improve Code Quality continuously with Scrutinizer

Text.retrieve_text_score() F
last analyzed 2017-09-28 14:20 UTC

↳ Parent: Text

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	2
Bugs	0	Features	2

Metric	Value
cc	13
c	2
b	0
f	2
dl	0
loc	61
rs	2.9617

How to fix Long Method Complexity

"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
import re
from nltk.tokenize import word_tokenize
from unidecode import unidecode
import codecs
from collections import Counter
import csv
import logging
from numpy import mean
from denoiser.text.stats import Statistics


def tokenize(line):
    """Split a line into tokens, isolating every separator character

    The line is first tokenized with ``word_tokenize``; every resulting word is transliterated with ``unidecode``
    and then cut around each separator character. Separators are kept as standalone tokens, empty fragments are
    dropped.

    Parameters:
        line (:func:`str`): A line of text

    Returns:
        list - List of different tokens
    """
    separators = "=+/,.:;!?%<>#()&[]{}"

    tokens = []

    for word in word_tokenize(line):  # Will get rid of most of the separators
        pieces = [unidecode(word)]

        # Cut the current pieces around one separator at a time
        for separator in separators:
            expanded = []

            for piece in pieces:
                fragments = piece.split(separator)

                # The first fragment is always kept as is; each following fragment is preceded by the separator
                # that was removed by the split (no separator is added after the last fragment)
                expanded.append(fragments[0])

                for fragment in fragments[1:]:
                    expanded.append(unicode(separator))
                    expanded.append(fragment)

            pieces = expanded

        # Splitting on a leading, trailing or doubled separator yields empty fragments: discard them
        tokens.extend([piece for piece in pieces if piece != ''])

    return tokens


def clean_head_tail(word):
    """Strip the leading and trailing characters of a word that are not letters, apostrophes or hyphens

    Parameters:
        word (:func:`str`): The word to clean
    Returns:
        :func:`str` - Cleaned word, or :data:`None` when the word cannot be cleaned or contains no letter
    """
    trimmer = re.compile(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$")

    # The pattern is anchored on both ends, so there is at most one match for the whole word
    found = trimmer.search(word)

    # Non matching strings are dirty (i.e. cannot be cleaned)
    if found is None:
        return None

    core = found.group(1)

    # A core made only of apostrophes and hyphens is not a word
    if re.search(r"[a-zA-Z]", core) is None:
        return None

    return core


class Text(object):
    """Stores the text read from the file given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): Set to True by :meth:`read_csv` and to False by :meth:`read_txt`.
        stats (:class:`.Statistics`): Statistics object.
    """

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        stat_names = ["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
                      "word_total_length", "word_avg_nb", "word_total_nb"]
        self.stats = Statistics(stat_names)

        for stat_name in stat_names:
            self.stats.set_stat(stat_name, 0)

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Divide two numbers, returning 0 instead of failing when the denominator is 0
        """
        if denominator == 0:
            return 0

        return numerator / denominator

    def _record_line_stats(self, line_object):
        """Add one line to the running totals (number of lines, line length, number of words, word length)
        """
        stats = self.stats

        stats.set_stat("line_nb", stats.get_stat("line_nb") + 1)
        stats.set_stat("line_total_length", stats.get_stat("line_total_length") + len(line_object))
        stats.set_stat("word_total_nb", stats.get_stat("word_total_nb") + len(line_object.tokens))

        # Every token is a 3-element list: the word length is the length of its first element (original token),
        # not the length of the list itself
        words_len = sum([len(tkn[0]) for tkn in line_object.tokens])
        stats.set_stat("word_total_length", stats.get_stat("word_total_length") + words_len)

    def _compute_averages(self):
        """Derive the average stats from the totals. Averages are 0 when nothing has been read
        """
        stats = self.stats
        line_nb = stats.get_stat("line_nb")
        word_nb = stats.get_stat("word_total_nb")

        stats.set_stat("line_avg_length", self._safe_ratio(stats.get_stat("line_total_length"), line_nb))
        stats.set_stat("word_avg_length", self._safe_ratio(stats.get_stat("word_total_length"), word_nb))
        stats.set_stat("word_avg_nb", self._safe_ratio(word_nb, line_nb))

    def _load_entries(self, entries):
        """Group `(line, result)` pairs into paragraphs and update the stats

        An entry whose line is empty (or :data:`None`) closes the current paragraph.
        """
        paragraph = []

        for line, result in entries:
            if not line:  # Paragraph separator
                if len(paragraph) != 0:
                    self.text.append(paragraph)
                    paragraph = []

                continue

            line_object = Line(line, result)
            paragraph.append(line_object)
            self._record_line_stats(line_object)

        if len(paragraph) != 0:
            self.text.append(paragraph)

        self._compute_averages()

    @staticmethod
    def _csv_entries(csv_reader):
        """Turn CSV rows into `(line, result)` pairs. Rows without exactly 2 columns become paragraph separators
        """
        for row in csv_reader:
            if len(row) != 2:
                yield None, None
                continue

            line = unicode(row[0].decode("utf-8"))
            yield line.strip(" \t\r\n"), row[1]

    def read_csv(self):
        """Read a CSV file and fill the text object with its content

        Every row is expected to have 2 columns: the line and its expected result. The paragraphs are appended to
        `text` and the stats are updated. Nothing is returned.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            # Rows have to be consumed while the file is still open
            self._load_entries(self._csv_entries(csv.reader(f)))

        logging.debug(self.filename+" read")

    def read_txt(self):
        """Read a text file and fill the text object with its content

        Blank lines separate the paragraphs. The paragraphs are appended to `text` and the stats are updated.
        Nothing is returned.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            # Lines read from a text file have no expected result
            self._load_entries([(line.strip(" \t\r\n"), None) for line in f])

        logging.debug(self.filename+" read")

    def _collect_lines(self, is_selected, render):
        """List the rendering of every selected line, adding an empty string after each paragraph

        No empty string is added while the list is empty or when it already ends with one.
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if is_selected(line):
                    lines.append(render(line))

            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def get_clean_lines(self):
        """Returns clean lines (grade 5) from the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5, lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines (grade 0) from the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0, lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        # Grade is not 0 nor 5
        return self._collect_lines(lambda line: line.grade % 5 != 0, lambda line: line.get_orig_line())

    @staticmethod
    def _outcome(line):
        """Name the confusion matrix cell of a classified line. True positive is a garbage string detected as such
        """
        is_clean = line.result == 1

        if line.grade == 0:  # Line detected as garbage
            return "FP" if is_clean else "TP"

        # Line detected as clean
        return "TN" if is_clean else "FN"

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines graded neither 0 nor 5 are counted as unclassified. Lines without expected result (or with a negative
        one) are counted as unrated. Every other line feeds the confusion matrix.

        Returns:
            dict: Dictionary containing the results (keys `class`, `score` and `raw`)
        """
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade != 0 and line.grade != 5:
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1
                score_stats[self._outcome(line)] += 1

        # Every score is 0 when its denominator is 0
        precision = self._safe_ratio(score_stats["TP"], score_stats["TP"] + score_stats["FP"])
        recall = self._safe_ratio(score_stats["TP"], score_stats["TP"] + score_stats["FN"])
        f1 = self._safe_ratio(2 * precision * recall, precision + recall)

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }


class Line(object):
    """Represents a line of text and provides datastructures to handle it.

    Args:
        string (unicode): Line to parse.
        result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)

    Attributes:
        tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
            3 element organized in this order `(original_token, clean_token, corrected_token)`. The corrected token
            is :data:`None` or a dictionary: its first key is used as replacement text and its values are averaged
            to compute the line score.
        pos_string (:func:`str`): Reference string containing the position of all the tokens
        result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
                                              algorithm
        grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
        stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
            **lower**, **upper** and **special** characters along with **numbers**.
    """

    def __init__(self, string, result=None):
        self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

        self.pos_string = string  # String containing the position of each token (e.g. "%0 %1%2 ... %n")
        for index, token in enumerate(self.tokens):
            self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)

        self.result = None if result is None else int(result)

        # A line without any cleanable token starts as garbage, any other line starts in the middle of the scale
        if any(token[1] for token in self.tokens):
            self.grade = 3
        else:
            self.grade = 0

        self.stats = {
            "orig": self._count_char_classes(self.get_orig_line()),
            "clean": None  # Computed on demand by get_clean_stats
        }

    @staticmethod
    def _count_char_classes(text):
        """Count the lower, upper, numeric and special characters of a string (spaces are ignored)

        Returns:
            Statistics: Statistics holding `lw_char`, `up_char`, `nb_char` and `sp_char`
        """
        # Every character is replaced by a marker of its class, then the markers are counted
        masked = re.sub(r'[a-z]', 'a', text)  # Lower chars replacement
        masked = re.sub(r'[A-Z]', 'A', masked)  # Upper chars replacement
        masked = re.sub(r'[0-9]', '0', masked)  # Numbers replacement
        masked = re.sub(r'[^a-zA-Z0-9 ]', '#', masked)  # Special chars replacement
        class_counts = Counter(masked)

        char_stats = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
        char_stats.set_stat("lw_char", class_counts["a"])
        char_stats.set_stat("up_char", class_counts["A"])
        char_stats.set_stat("nb_char", class_counts["0"])
        char_stats.set_stat("sp_char", class_counts["#"])

        return char_stats

    def raise_grade(self):
        """Add 1 to the grade of the line (up to 5)
        """
        if self.grade < 5:
            self.grade += 1

    def decrease_grade(self):
        """Remove 1 to the grade of the line (down to 0)
        """
        if self.grade > 0:
            self.grade -= 1

    def set_garbage(self):
        """Set the grade to 0
        """
        self.grade = 0

    def set_clean(self):
        """Set the grade to 5
        """
        self.grade = 5

    def get_orig_line(self):
        """Returns the original line

        Returns:
            str: Original line
        """
        string = self.pos_string

        # Highest index first, otherwise "%1" would also replace the beginning of "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            string = string.replace("%"+str(index), token[0])

        return string

    def get_clean_line(self):
        """Returns the clean line

        Each token is replaced by its correction if there is one, otherwise by its clean form, otherwise by its
        original form. Repeated spaces are merged and the result is stripped.

        Returns:
            str: Clean line
        """
        string = self.pos_string

        # Highest index first, otherwise "%1" would also replace the beginning of "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            if token[2] is not None and len(token[2]) > 0:
                # First key of the corrections (works whether keys() is a list or a view)
                replacement = next(iter(token[2]))
            elif token[1] is not None:  # Inline correction is not available
                replacement = token[1]
            else:  # Clean token does not exist, use the original token
                replacement = token[0]

            string = string.replace("%"+str(index), replacement)

        return re.sub(" +", " ", string).strip()

    def get_orig_stats(self):
        """Get original stats of the line

        Returns:
            Statistics: Statistics of the original line
        """
        return self.stats["orig"]

    def get_clean_stats(self):
        """Get clean stats of the line

        Returns:
            Statistics: Statistics of the clean line
        """
        if self.stats["clean"] is None:  # Compute clean stats if it is not already done
            self.stats["clean"] = self._count_char_classes(self.get_clean_line())

        return self.stats["clean"]

    def get_line_score(self):
        """Return a global score of the line

        The score is the sum of the mean correction score of every token, divided by the total number of tokens.
        Tokens without correction (:data:`None` or empty dictionary) add nothing to the sum.

        Returns:
            float: Score of the line (0 if the line has no token)
        """
        score = 0

        if len(self.tokens) == 0:
            return score

        for token in self.tokens:
            corrections = token[2]

            # The mean of an empty list is NaN and would spoil the whole score
            if corrections is not None and len(corrections) > 0:
                score += mean(list(corrections.values()))

        return score / len(self.tokens)

    def __len__(self):
        return len(self.get_orig_line())

    def __str__(self):
        return str(self.tokens) + " | " + str(self.grade)


1			"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object.
2
3			.. Authors:
4			Philippe Dessauw
5			[email protected]
6
7			.. Sponsor:
8			Alden Dima
9			[email protected]
10			Information Systems Group
11			Software and Systems Division
12			Information Technology Laboratory
13			National Institute of Standards and Technology
14			http://www.nist.gov/itl/ssd/is
15			"""
16			from __future__ import division
17			import re
18			from nltk.tokenize import word_tokenize
19			from unidecode import unidecode
20			import codecs
21			from collections import Counter
22			import csv
23			import logging
24			from numpy import mean
25			from denoiser.text.stats import Statistics
26
27
28			def tokenize(line):
29			"""Separate line to get clean tokens out of it
30
31			Parameters:
32			line (:func:`str`): A line of text
33
34			Returns:
35			list - List of different tokens
36			"""
37			separators = "=+/,.:;!?%<>#()&[]{}"
38
39			tokens = []
40			tokenized_line = word_tokenize(line) # Will get rid of most of the separators
41
42			for word in tokenized_line:
43			tmp_tokens = [unidecode(word)]
44
45			for separator in separators:
46			sep_tokens = []
47
48			for tmp_token in tmp_tokens:
49			split_token = tmp_token.split(separator)
50
51			if len(split_token) != 1: # Token has been split
52			# Concatening the list of token with the separator
53			tkn_sep_list = []
54
55			for ind, tkn in enumerate(split_token):
56			tkn_sep_list.append(tkn)
57
58			if ind != len(split_token) - 1: # Avoid to add the separator at the end
59			tkn_sep_list.append(unicode(separator))
60
61			sep_tokens += tkn_sep_list
62			else:
63			sep_tokens += split_token
64
65			tmp_tokens = sep_tokens
66
67			tokens += [tkn for tkn in tmp_tokens if tkn != '']
68
69			return tokens
70
71
72			def clean_head_tail(word):
73			"""Clean head and tail of a word
74
75			Parameters:
76			word (:func:`str`): The word to clean
77			Returns:
78			:func:`str` - Cleaned word
79			"""
80			cleaning_regexp = re.compile(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$")
81			alpha_regexp = re.compile(r"[a-zA-Z]")
82
83			word_groups = cleaning_regexp.findall(word)
84
85			# Non matching strings are set as dirty (i.e. cannot be cleaned)
86			# None is returned
87			if len(word_groups) == 0:
88			return None
89
90			# Words containing no letters are set to None
91			if alpha_regexp.search(word_groups[0][0]) is None:
92			return None
93
94			return word_groups[0][0]
95
96
97			class Text(object):
98			"""Stores the the text from a filename given in parameters
99
100			Args:
101			fname (str): Path to the file.
102
103			Attributes:
104			filename (:func:`str`): Name of the file.
105			text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
106			stats (:class:`.Statistics`): Statistics object.
107			"""
108
109			def __init__(self, fname):
110			self.filename = fname
111			self.text = []
112			self.contains_training_data = False
113
114			self.stats = Statistics(["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
115			"word_total_length", "word_avg_nb", "word_total_nb"])
116			self.stats.set_stat("line_nb", 0)
117			self.stats.set_stat("line_avg_length", 0)
118			self.stats.set_stat("line_total_length", 0)
119			self.stats.set_stat("word_avg_length", 0)
120			self.stats.set_stat("word_total_length", 0)
121			self.stats.set_stat("word_avg_nb", 0)
122			self.stats.set_stat("word_total_nb", 0)
123
124			def read_csv(self):
125			"""Read a CSV file and build the associated text object
126
127			Returns:
128			`Text`
129			"""
130			self.contains_training_data = True
131
132			with open(self.filename, "r") as f:
133			csv_reader = csv.reader(f)
134			paragraph = []
135
136			for row in csv_reader:
137			if len(row) != 2:
138			if len(paragraph) != 0:
139			self.text.append(paragraph)
140			paragraph = []
141
142			continue
143
144			line = unicode(row[0].decode("utf-8"))
145			line = line.strip(" \t\r\n")
146
147			if len(line) == 0:
148			if len(paragraph) != 0:
149			self.text.append(paragraph)
150			paragraph = []
151
152			continue
153
154			line_object = Line(line, row[1])
155			paragraph.append(line_object)
156
157			self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1)
158			self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object))
159			self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))
160
161			words_len = sum([len(tkn) for tkn in line_object.tokens])
162			self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)
163
164			if len(paragraph) != 0:
165			self.text.append(paragraph)
166
167			self.stats.set_stat("line_avg_length",
168			self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
169			self.stats.set_stat("word_avg_length",
170			self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
171			self.stats.set_stat("word_avg_nb",
172			self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))
173
174			logging.debug(self.filename+" read")
175
176			def read_txt(self):
177			"""Read a text file and build the associated text object
178
179			Returns:
180			`Text`
181			"""
182			self.contains_training_data = False
183
184			with codecs.open(self.filename, "rb", encoding="utf-8") as f:
185			paragraph = []
186
187			for line in f:
188			line = line.strip(" \t\r\n")
189
190			if len(line) == 0:
191			if len(paragraph) != 0:
192			self.text.append(paragraph)
193			paragraph = []
194
195			continue
196
197			line_object = Line(line)
198			paragraph.append(line_object)
199
200			self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1)
201			self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object))
202			self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))
203
204			words_len = sum([len(tkn) for tkn in line_object.tokens])
205			self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)
206
207			if len(paragraph) != 0:
208			self.text.append(paragraph)
209
210			self.stats.set_stat("line_avg_length",
211			self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
212			self.stats.set_stat("word_avg_length",
213			self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
214			self.stats.set_stat("word_avg_nb",
215			self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))
216
217			logging.debug(self.filename+" read")
218
219			def get_clean_lines(self):
220			"""Returns cleans line from the text object
221
222			Returns:
223			list: List of clean lines
224			"""
225			lines = []
226
227			for paragraph in self.text:
228			for line in paragraph:
229			if line.grade == 5:
230			lines.append(line.get_clean_line())
231
232			if len(lines) > 0 and lines[-1] != "":
233			lines.append("")
234
235			return lines
236
237			def get_garbage_lines(self):
238			"""Returns garbage lines from the text object
239
240			Returns:
241			list: List of garbage lines
242			"""
243			lines = []
244
245			for paragraph in self.text:
246			for line in paragraph:
247			if line.grade == 0:
248			lines.append(line.get_orig_line())
249
250			if len(lines) > 0 and lines[-1] != "":
251			lines.append("")
252
253			return lines
254
255			def get_unclassified_lines(self):
256			"""Returns unclassified lines from the text object
257
258			Returns:
259			list: List of unclassified lines
260			"""
261			lines = []
262
263			for paragraph in self.text:
264			for line in paragraph:
265			if line.grade % 5 != 0: # Grade is not 0 nor 5
266			lines.append(line.get_orig_line())
267
268			if len(lines) > 0 and lines[-1] != "":
269			lines.append("")
270
271			return lines
272
273			def retrieve_text_score(self):
274			"""Returns some stats and score regarding classification
275
276			Returns:
277			dict: Dictionary containing the results
278			"""
279			# True positive is a garbage string detected as such
280			score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
281			class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}
282
283			for paragraph in self.text:
284			for line in paragraph:
285			if line.grade != 0 and line.grade != 5:
286			class_stats["unclassified"] += 1
287			continue
288
289			if line.result is None or line.result < 0:
290			class_stats["unrated"] += 1
291			continue
292
293			class_stats["classified"] += 1
294
295			if line.grade == 0: # Line detected as garbage
296			if line.result == 1: # Line is clean
297			score_stats["FP"] += 1 # False positive
298			else: # Line is garbage
299			score_stats["TP"] += 1 # True postive
300			else: # Line detected as clean
301			if line.result == 1: # Line is clean
302			score_stats["TN"] += 1 # True negative
303			else: # Line is garbage
304			score_stats["FN"] += 1 # False negative
305
306			# Precision
307			divider_pr = score_stats["TP"] + score_stats["FP"]
308			if divider_pr != 0:
309			precision = score_stats["TP"] / divider_pr
310			else:
311			precision = 0
312
313			# Recall
314			divider_rc = score_stats["TP"] + score_stats["FN"]
315			if divider_rc != 0:
316			recall = score_stats["TP"] / divider_rc
317			else:
318			recall = 0
319
320			# F1 score
321			if precision + recall != 0:
322			f1 = 2 * precision * recall / (precision + recall)
323			else:
324			f1 = 0
325
326			return {
327			"class": class_stats,
328			"score": {
329			"precision": precision,
330			"recall": recall,
331			"f1": f1
332			},
333			"raw": score_stats
334			}
335
336
337			class Line(object):
338			"""Represents a line of text and provides datastructures to handle it.
339
340			Args:
341			string (unicode): Line to parse.
342			result (int): (Optional) Expected result for a line (either a garbage string or a clean line)
343
344			Attributes:
345			tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
346			3 element organized in this order `(original_token, clean_token, corrected_token)`
347			pos_string (:func:`str`): Reference string containing the position of all the tokens
348			result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
349			algorithm
350			grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
351			stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
352			lower, upper and special characters along with numbers.
353			"""
354
355			def __init__(self, string, result=None):
356			self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]
357
358			self.pos_string = string # String containing the position of each token (e.g. "%0 %1%2 ... %n")
359			for index, token in enumerate(self.tokens):
360			self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)
361
362			self.result = None
363			if result is not None:
364			self.result = int(result)
365
366			if sum([len(t[1]) for t in self.tokens if not t[1] is None]) == 0:
367			self.grade = 0
368			else:
369			self.grade = 3
370
371			self.stats = {
372			"orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
373			"clean": None
374			}
375
376			tmp_line = re.sub(r'[a-z]', 'a', self.get_orig_line()) # Lower chars replacement
377			tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement
378			tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement
379			tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement
380			line_stats = Counter(tmp_line)
381
382			self.stats["orig"].set_stat("lw_char", line_stats["a"])
383			self.stats["orig"].set_stat("up_char", line_stats["A"])
384			self.stats["orig"].set_stat("nb_char", line_stats["0"])
385			self.stats["orig"].set_stat("sp_char", line_stats["#"])
386
387			def raise_grade(self):
388			"""Add 1 to the grade of the line (up to 5)
389			"""
390			if self.grade < 5:
391			self.grade += 1
392
393			def decrease_grade(self):
394			"""Remove 1 to the grade of the line (down to 0)
395			"""
396			if self.grade > 0:
397			self.grade -= 1
398
399			def set_garbage(self):
400			"""Set the grade to 0
401			"""
402			self.grade = 0
403
404			def set_clean(self):
405			"""Set the grade to 5
406			"""
407			self.grade = 5
408
409			def get_orig_line(self):
410			"""Returns the original line
411
412			Returns:
413			str: Original line
414			"""
415			string = self.pos_string
416
417			for index, token in reversed(list(enumerate(self.tokens))):
418			string = string.replace("%"+str(index), token[0])
419
420			return string
421
422			def get_clean_line(self):
423			"""Returns the clean line
424
425			Returns:
426			str: Clean line
427			"""
428			string = self.pos_string
429
430			for index, token in reversed(list(enumerate(self.tokens))):
431			if not token[2] is None and len(token[2]) > 0:
432			string = string.replace("%"+str(index), token[2].keys()[0])
433			else: # Inline correction is not available
434			if not token[1] is None:
435			string = string.replace("%"+str(index), token[1])
436			else: # Clean token does not exist, use the original token
437			string = string.replace("%"+str(index), token[0])
438
439			return re.sub(" +", " ", string).strip()
440
441			def get_orig_stats(self):
442			"""Get original stats of the line
443
444			Returns:
445			Statistics: Statistics of the original line
446			"""
447			return self.stats["orig"]
448
449			def get_clean_stats(self):
450			"""Get clean stats of the line
451
452			Returns:
453			Statistics: Statistics of the clean line
454			"""
455			if self.stats["clean"] is None: # Compute clean stats if it is not already done
456			self.stats["clean"] = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
457
458			tmp_line = re.sub(r'[a-z]', 'a', self.get_clean_line()) # Lower chars replacement
459			tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement
460			tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement
461			tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement
462			line_stats = Counter(tmp_line)
463
464			self.stats["clean"].set_stat("lw_char", line_stats["a"])
465			self.stats["clean"].set_stat("up_char", line_stats["A"])
466			self.stats["clean"].set_stat("nb_char", line_stats["0"])
467			self.stats["clean"].set_stat("sp_char", line_stats["#"])
468
469			return self.stats["clean"]
470
471			def get_line_score(self):
472			"""Return a global score of the line
473
474			Returns:
475			float: Score of the line
476			"""
477			score = 0
478
479			if len(self.tokens) == 0:
480			return score
481
482			for token in [t[2] for t in self.tokens if not t[2] is None]:
483			score += mean([s for s in token.values()])
484
485			return score / len(self.tokens)
486
487			def __len__(self):
488			return len(self.get_orig_line())
489
490			def __str__(self):
491			return str(self.tokens) + " | " + str(self.grade)
492

usnistgov / ocr-pipeline

Text.retrieve_text_score() F last analyzed 2017-09-28 14:20 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

Text.retrieve_text_score() F
last analyzed 2017-09-28 14:20 UTC