src.denoiser.text.Line - Code Metrics - Inspection of "Update LICENSE.txt" - usnistgov/ocr-pipeline - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (e214b7)

by Philippe

created 2017-07-17 12:35 UTC

src.denoiser.text.Line A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	155
Duplicated Lines	0 %

Metric	Value
dl	0
loc	155
rs	9.8
wmc	31

11 Methods

Rating	Name	Size	Complexity
A	Line.__len__()	2	1
A	Line.__str__()	2	1
A	Line.get_orig_line()	12	2
A	Line.decrease_grade()	5	2
B	Line.get_line_score()	15	6
A	Line.set_clean()	4	1
A	Line.get_clean_stats()	21	2
B	Line.get_clean_line()	18	5
A	Line.set_garbage()	4	1
A	Line.get_orig_stats()	7	1
A	Line.raise_grade()	5	2

"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
import re
from nltk.tokenize import word_tokenize
from unidecode import unidecode
import codecs
from collections import Counter
import csv
import logging
from numpy import mean
from denoiser.text.stats import Statistics


def tokenize(line):
    """Split a line into tokens, isolating every separator character

    The line is first tokenized with :func:`word_tokenize`, each resulting word is transliterated with
    :func:`unidecode`, then cut around every character of the separator set. Separators are kept as tokens
    of their own and empty pieces are dropped.

    Parameters:
        line (:func:`str`): A line of text

    Returns:
        list - List of different tokens
    """
    separators = "=+/,.:;!?%<>#()&[]{}"

    tokens = []

    for word in word_tokenize(line):  # Will get rid of most of the separators
        pieces = [unidecode(word)]

        # Separators are applied one after the other on the pieces produced so far
        for separator in separators:
            expanded = []

            for piece in pieces:
                chunks = piece.split(separator)

                if len(chunks) == 1:  # Separator is absent from this piece
                    expanded.extend(chunks)
                    continue

                # Re-insert the separator between consecutive chunks (never after the last one)
                last_position = len(chunks) - 1
                for position, chunk in enumerate(chunks):
                    expanded.append(chunk)

                    if position != last_position:
                        expanded.append(unicode(separator))

            pieces = expanded

        # Splitting around a leading, trailing or repeated separator leaves empty strings behind
        tokens.extend([piece for piece in pieces if piece != ''])

    return tokens


def clean_head_tail(word):
    """Strip the leading and trailing characters of a word that are not letters, quotes or hyphens

    Parameters:
        word (:func:`str`): The word to clean
    Returns:
        :func:`str` - Cleaned word, or :data:`None` when the word cannot be cleaned or its cleaned form
        contains no letter
    """
    # The pattern is anchored on both ends, so a match at the start of the word is the only possible one
    cleaned = re.match(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$", word)

    # Non matching strings are dirty (i.e. cannot be cleaned)
    if cleaned is None:
        return None

    core = cleaned.group(1)

    # A core made only of quotes, hyphens, digits or symbols is not a word
    if re.search(r"[a-zA-Z]", core) is None:
        return None

    return core


class Text(object):
    """Stores the text read from the file whose name is given in parameters

    Args:
        fname (str): Path to the file.

    Attributes:
        filename (:func:`str`): Name of the file.
        text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
        contains_training_data (:func:`bool`): Set to True by :meth:`read_csv` and to False by :meth:`read_txt`.
        stats (:class:`.Statistics`): Statistics object.
    """

    # Names of the statistics tracked for a text, all of them are initialized to 0
    _STAT_NAMES = ("line_nb", "line_avg_length", "line_total_length", "word_avg_length",
                   "word_total_length", "word_avg_nb", "word_total_nb")

    def __init__(self, fname):
        self.filename = fname
        self.text = []
        self.contains_training_data = False

        self.stats = Statistics(list(self._STAT_NAMES))
        for stat_name in self._STAT_NAMES:
            self.stats.set_stat(stat_name, 0)

    def _increment_stat(self, name, amount):
        """Add `amount` to the statistic called `name`
        """
        self.stats.set_stat(name, self.stats.get_stat(name) + amount)

    def _flush_paragraph(self, paragraph):
        """Store a non-empty paragraph in the text and return the empty list to use for the next one
        """
        if len(paragraph) != 0:
            self.text.append(paragraph)

        return []

    def _register_line(self, paragraph, line_object):
        """Append a line to the current paragraph and update the running totals
        """
        paragraph.append(line_object)

        self._increment_stat("line_nb", 1)
        self._increment_stat("line_total_length", len(line_object))
        self._increment_stat("word_total_nb", len(line_object.tokens))

        # Every token is a [original, clean, corrected] list: the length of a word is the length of its
        # original form, not the length of the list itself
        words_len = sum([len(tkn[0]) for tkn in line_object.tokens])
        self._increment_stat("word_total_length", words_len)

    def _update_averages(self):
        """Compute the average statistics from the totals (0 when there is nothing to average)
        """
        line_nb = self.stats.get_stat("line_nb")
        word_nb = self.stats.get_stat("word_total_nb")

        # A file without any usable line (or any token) must not raise a ZeroDivisionError
        self.stats.set_stat("line_avg_length",
                            self.stats.get_stat("line_total_length") / line_nb if line_nb != 0 else 0)
        self.stats.set_stat("word_avg_length",
                            self.stats.get_stat("word_total_length") / word_nb if word_nb != 0 else 0)
        self.stats.set_stat("word_avg_nb",
                            word_nb / line_nb if line_nb != 0 else 0)

    def _collect_lines(self, is_selected, render):
        """List the rendering of the selected lines, paragraphs being separated by an empty string
        """
        lines = []

        for paragraph in self.text:
            for line in paragraph:
                if is_selected(line):
                    lines.append(render(line))

            # Close the paragraph, without stacking several empty separators
            if len(lines) > 0 and lines[-1] != "":
                lines.append("")

        return lines

    def read_csv(self):
        """Read a CSV file and build the associated text object

        Every row is expected to have 2 columns: the line of text and its expected result. Rows with another
        number of columns, or with an empty line of text, close the current paragraph.
        """
        self.contains_training_data = True

        with open(self.filename, "r") as f:
            csv_reader = csv.reader(f)
            paragraph = []

            for row in csv_reader:
                if len(row) != 2:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                line = unicode(row[0].decode("utf-8"))
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line, row[1]))

            self._flush_paragraph(paragraph)

        self._update_averages()

        logging.debug("%s read", self.filename)

    def read_txt(self):
        """Read a text file and build the associated text object

        The file is decoded as UTF-8. Empty lines close the current paragraph.
        """
        self.contains_training_data = False

        with codecs.open(self.filename, "rb", encoding="utf-8") as f:
            paragraph = []

            for line in f:
                line = line.strip(" \t\r\n")

                if len(line) == 0:
                    paragraph = self._flush_paragraph(paragraph)
                    continue

                self._register_line(paragraph, Line(line))

            self._flush_paragraph(paragraph)

        self._update_averages()

        logging.debug("%s read", self.filename)

    def get_clean_lines(self):
        """Returns clean lines (grade 5) from the text object

        Returns:
            list: List of clean lines
        """
        return self._collect_lines(lambda line: line.grade == 5,
                                   lambda line: line.get_clean_line())

    def get_garbage_lines(self):
        """Returns garbage lines (grade 0) from the text object

        Returns:
            list: List of garbage lines
        """
        return self._collect_lines(lambda line: line.grade == 0,
                                   lambda line: line.get_orig_line())

    def get_unclassified_lines(self):
        """Returns unclassified lines from the text object

        Returns:
            list: List of unclassified lines
        """
        return self._collect_lines(lambda line: line.grade % 5 != 0,  # Grade is not 0 nor 5
                                   lambda line: line.get_orig_line())

    def retrieve_text_score(self):
        """Returns some stats and score regarding classification

        Lines whose grade is neither 0 nor 5 are counted as unclassified, lines without an expected result (or
        with a negative one) as unrated. The remaining lines feed the confusion matrix.

        Returns:
            dict: Dictionary containing the results (`class`, `score` and `raw` keys)
        """
        # True positive is a garbage string detected as such
        score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}

        for paragraph in self.text:
            for line in paragraph:
                if line.grade != 0 and line.grade != 5:
                    class_stats["unclassified"] += 1
                    continue

                if line.result is None or line.result < 0:
                    class_stats["unrated"] += 1
                    continue

                class_stats["classified"] += 1

                if line.grade == 0:  # Line detected as garbage
                    if line.result == 1:  # Line is clean
                        score_stats["FP"] += 1  # False positive
                    else:  # Line is garbage
                        score_stats["TP"] += 1  # True positive
                else:  # Line detected as clean
                    if line.result == 1:  # Line is clean
                        score_stats["TN"] += 1  # True negative
                    else:  # Line is garbage
                        score_stats["FN"] += 1  # False negative

        # Precision
        divider_pr = score_stats["TP"] + score_stats["FP"]
        precision = score_stats["TP"] / divider_pr if divider_pr != 0 else 0

        # Recall
        divider_rc = score_stats["TP"] + score_stats["FN"]
        recall = score_stats["TP"] / divider_rc if divider_rc != 0 else 0

        # F1 score
        if precision + recall != 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        return {
            "class": class_stats,
            "score": {
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "raw": score_stats
        }


class Line(object):
    """Represents a line of text and provides datastructures to handle it.

    Args:
        string (str): Line to parse.
        result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)

    Attributes:
        tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
            3 element organized in this order `(original_token, clean_token, corrected_token)`
        pos_string (:func:`str`): Reference string containing the position of all the tokens
        result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
                                              algorithm
        grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
        stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
            **lower**, **upper** and **special** characters along with **numbers**.
    """

    # Names of the character statistics computed for the original and the clean line
    _CHAR_STAT_NAMES = ("lw_char", "up_char", "nb_char", "sp_char")

    def __init__(self, string, result=None):
        self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]

        self.pos_string = string  # String containing the position of each token (e.g. "%0 %1%2 ... %n")
        for index, token in enumerate(self.tokens):
            self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)

        self.result = None
        if result is not None:
            self.result = int(result)

        # A line without a single non-empty clean token starts as garbage, any other line starts at grade 3
        if any(token[1] for token in self.tokens):
            self.grade = 3
        else:
            self.grade = 0

        self.stats = {
            "orig": self._build_char_stats(self.get_orig_line()),
            "clean": None  # Computed on demand by get_clean_stats()
        }

    @classmethod
    def _build_char_stats(cls, text):
        """Count the lower, upper, numeric and special characters of a string

        Spaces are not counted in any category.

        Parameters:
            text (:func:`str`): String to analyze

        Returns:
            Statistics: Statistics object holding the 4 counters
        """
        # Every character is mapped to the representative of its class, then the representatives are counted
        tmp_line = re.sub(r'[a-z]', 'a', text)  # Lower chars replacement
        tmp_line = re.sub(r'[A-Z]', 'A', tmp_line)  # Upper chars replacement
        tmp_line = re.sub(r'[0-9]', '0', tmp_line)  # Numbers replacement
        tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line)  # Special chars replacement
        line_stats = Counter(tmp_line)

        char_stats = Statistics(list(cls._CHAR_STAT_NAMES))
        char_stats.set_stat("lw_char", line_stats["a"])
        char_stats.set_stat("up_char", line_stats["A"])
        char_stats.set_stat("nb_char", line_stats["0"])
        char_stats.set_stat("sp_char", line_stats["#"])

        return char_stats

    def raise_grade(self):
        """Add 1 to the grade of the line (up to 5)
        """
        if self.grade < 5:
            self.grade += 1

    def decrease_grade(self):
        """Remove 1 to the grade of the line (down to 0)
        """
        if self.grade > 0:
            self.grade -= 1

    def set_garbage(self):
        """Set the grade to 0
        """
        self.grade = 0

    def set_clean(self):
        """Set the grade to 5
        """
        self.grade = 5

    def get_orig_line(self):
        """Returns the original line

        Returns:
            str: Original line
        """
        string = self.pos_string

        # Highest indexes first, so that "%1" is never substituted inside "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            string = string.replace("%"+str(index), token[0])

        return string

    def get_clean_line(self):
        """Returns the clean line

        Every token is rendered with its correction when one is available, else with its clean form, else with
        its original form. Runs of spaces are collapsed and the result is stripped.

        Returns:
            str: Clean line
        """
        string = self.pos_string

        # Highest indexes first, so that "%1" is never substituted inside "%10"
        for index, token in reversed(list(enumerate(self.tokens))):
            if token[2] is not None and len(token[2]) > 0:
                # First key of the corrections (same element as keys()[0], without requiring keys() to be a list)
                replacement = next(iter(token[2]))
            elif token[1] is not None:  # Inline correction is not available
                replacement = token[1]
            else:  # Clean token does not exist, use the original token
                replacement = token[0]

            string = string.replace("%"+str(index), replacement)

        return re.sub(" +", " ", string).strip()

    def get_orig_stats(self):
        """Get original stats of the line

        Returns:
            Statistics: Statistics of the original line
        """
        return self.stats["orig"]

    def get_clean_stats(self):
        """Get clean stats of the line

        The statistics are computed on the first call and reused afterwards.

        Returns:
            Statistics: Statistics of the clean line
        """
        if self.stats["clean"] is None:  # Compute clean stats if it is not already done
            self.stats["clean"] = self._build_char_stats(self.get_clean_line())

        return self.stats["clean"]

    def get_line_score(self):
        """Return a global score of the line

        The score is the sum, over the corrected tokens, of the mean value of their corrections, divided by
        the total number of tokens of the line.

        Returns:
            float: Score of the line (0 for a line without token)
        """
        score = 0

        if len(self.tokens) == 0:
            return score

        for token in self.tokens:
            if token[2] is not None:
                score += mean(list(token[2].values()))

        return score / len(self.tokens)

    def __len__(self):
        return len(self.get_orig_line())

    def __str__(self):
        return str(self.tokens) + " | " + str(self.grade)


1			"""This module contains necessary classes to parse a file in order to get the :class:`.Text` object.
2
3			.. Authors:
4			Philippe Dessauw
5			[email protected]
6
7			.. Sponsor:
8			Alden Dima
9			[email protected]
10			Information Systems Group
11			Software and Systems Division
12			Information Technology Laboratory
13			National Institute of Standards and Technology
14			http://www.nist.gov/itl/ssd/is
15			"""
16			from __future__ import division
17			import re
18			from nltk.tokenize import word_tokenize
19			from unidecode import unidecode
20			import codecs
21			from collections import Counter
22			import csv
23			import logging
24			from numpy import mean
25			from denoiser.text.stats import Statistics
26
27
28			def tokenize(line):
29			"""Separate line to get clean tokens out of it
30
31			Parameters:
32			line (:func:`str`): A line of text
33
34			Returns:
35			list - List of different tokens
36			"""
37			separators = "=+/,.:;!?%<>#()&[]{}"
38
39			tokens = []
40			tokenized_line = word_tokenize(line) # Will get rid of most of the separators
41
42			for word in tokenized_line:
43			tmp_tokens = [unidecode(word)]
44
45			for separator in separators:
46			sep_tokens = []
47
48			for tmp_token in tmp_tokens:
49			split_token = tmp_token.split(separator)
50
51			if len(split_token) != 1: # Token has been split
52			# Concatening the list of token with the separator
53			tkn_sep_list = []
54
55			for ind, tkn in enumerate(split_token):
56			tkn_sep_list.append(tkn)
57
58			if ind != len(split_token) - 1: # Avoid to add the separator at the end
59			tkn_sep_list.append(unicode(separator))
60
61			sep_tokens += tkn_sep_list
62			else:
63			sep_tokens += split_token
64
65			tmp_tokens = sep_tokens
66
67			tokens += [tkn for tkn in tmp_tokens if tkn != '']
68
69			return tokens
70
71
72			def clean_head_tail(word):
73			"""Clean head and tail of a word
74
75			Parameters:
76			word (:func:`str`): The word to clean
77			Returns:
78			:func:`str` - Cleaned word
79			"""
80			cleaning_regexp = re.compile(r"^[^a-zA-Z'-]*([a-zA-Z'-](.*[a-zA-Z'-])?)[^a-zA-Z'-]*$")
81			alpha_regexp = re.compile(r"[a-zA-Z]")
82
83			word_groups = cleaning_regexp.findall(word)
84
85			# Non matching strings are set as dirty (i.e. cannot be cleaned)
86			# None is returned
87			if len(word_groups) == 0:
88			return None
89
90			# Words containing no letters are set to None
91			if alpha_regexp.search(word_groups[0][0]) is None:
92			return None
93
94			return word_groups[0][0]
95
96
97			class Text(object):
98			"""Stores the the text from a filename given in parameters
99
100			Args:
101			fname (str): Path to the file.
102
103			Attributes:
104			filename (:func:`str`): Name of the file.
105			text (:func:`list`): List of paragraphs. Every paragraph is a list of :class:`.Line`.
106			stats (:class:`.Statistics`): Statistics object.
107			"""
108
109			def __init__(self, fname):
110			self.filename = fname
111			self.text = []
112			self.contains_training_data = False
113
114			self.stats = Statistics(["line_nb", "line_avg_length", "line_total_length", "word_avg_length",
115			"word_total_length", "word_avg_nb", "word_total_nb"])
116			self.stats.set_stat("line_nb", 0)
117			self.stats.set_stat("line_avg_length", 0)
118			self.stats.set_stat("line_total_length", 0)
119			self.stats.set_stat("word_avg_length", 0)
120			self.stats.set_stat("word_total_length", 0)
121			self.stats.set_stat("word_avg_nb", 0)
122			self.stats.set_stat("word_total_nb", 0)
123
124			def read_csv(self):
125			"""Read a CSV file and build the associated text object
126			"""
127			self.contains_training_data = True
128
129			with open(self.filename, "r") as f:
130			csv_reader = csv.reader(f)
131			paragraph = []
132
133			for row in csv_reader:
134			if len(row) != 2:
135			if len(paragraph) != 0:
136			self.text.append(paragraph)
137			paragraph = []
138
139			continue
140
141			line = unicode(row[0].decode("utf-8"))
142			line = line.strip(" \t\r\n")
143
144			if len(line) == 0:
145			if len(paragraph) != 0:
146			self.text.append(paragraph)
147			paragraph = []
148
149			continue
150
151			line_object = Line(line, row[1])
152			paragraph.append(line_object)
153
154			self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1)
155			self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object))
156			self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))
157
158			words_len = sum([len(tkn) for tkn in line_object.tokens])
159			self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)
160
161			if len(paragraph) != 0:
162			self.text.append(paragraph)
163
164			self.stats.set_stat("line_avg_length",
165			self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
166			self.stats.set_stat("word_avg_length",
167			self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
168			self.stats.set_stat("word_avg_nb",
169			self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))
170
171			logging.debug(self.filename+" read")
172
173			def read_txt(self):
174			"""Read a text file and build the associated text object
175			"""
176			self.contains_training_data = False
177
178			with codecs.open(self.filename, "rb", encoding="utf-8") as f:
179			paragraph = []
180
181			for line in f:
182			line = line.strip(" \t\r\n")
183
184			if len(line) == 0:
185			if len(paragraph) != 0:
186			self.text.append(paragraph)
187			paragraph = []
188
189			continue
190
191			line_object = Line(line)
192			paragraph.append(line_object)
193
194			self.stats.set_stat("line_nb", self.stats.get_stat("line_nb")+1)
195			self.stats.set_stat("line_total_length", self.stats.get_stat("line_total_length")+len(line_object))
196			self.stats.set_stat("word_total_nb", self.stats.get_stat("word_total_nb") + len(line_object.tokens))
197
198			words_len = sum([len(tkn) for tkn in line_object.tokens])
199			self.stats.set_stat("word_total_length", self.stats.get_stat("word_total_length") + words_len)
200
201			if len(paragraph) != 0:
202			self.text.append(paragraph)
203
204			self.stats.set_stat("line_avg_length",
205			self.stats.get_stat("line_total_length") / self.stats.get_stat("line_nb"))
206			self.stats.set_stat("word_avg_length",
207			self.stats.get_stat("word_total_length") / self.stats.get_stat("word_total_nb"))
208			self.stats.set_stat("word_avg_nb",
209			self.stats.get_stat("word_total_nb") / self.stats.get_stat("line_nb"))
210
211			logging.debug(self.filename+" read")
212
213			def get_clean_lines(self):
214			"""Returns cleans line from the text object
215
216			Returns:
217			list: List of clean lines
218			"""
219			lines = []
220
221			for paragraph in self.text:
222			for line in paragraph:
223			if line.grade == 5:
224			lines.append(line.get_clean_line())
225
226			if len(lines) > 0 and lines[-1] != "":
227			lines.append("")
228
229			return lines
230
231			def get_garbage_lines(self):
232			"""Returns garbage lines from the text object
233
234			Returns:
235			list: List of garbage lines
236			"""
237			lines = []
238
239			for paragraph in self.text:
240			for line in paragraph:
241			if line.grade == 0:
242			lines.append(line.get_orig_line())
243
244			if len(lines) > 0 and lines[-1] != "":
245			lines.append("")
246
247			return lines
248
249			def get_unclassified_lines(self):
250			"""Returns unclassified lines from the text object
251
252			Returns:
253			list: List of unclassified lines
254			"""
255			lines = []
256
257			for paragraph in self.text:
258			for line in paragraph:
259			if line.grade % 5 != 0: # Grade is not 0 nor 5
260			lines.append(line.get_orig_line())
261
262			if len(lines) > 0 and lines[-1] != "":
263			lines.append("")
264
265			return lines
266
267			def retrieve_text_score(self):
268			"""Returns some stats and score regarding classification
269
270			Returns:
271			dict: Dictionary containing the results
272			"""
273			# True positive is a garbage string detected as such
274			score_stats = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
275			class_stats = {"classified": 0, "unclassified": 0, "unrated": 0}
276
277			for paragraph in self.text:
278			for line in paragraph:
279			if line.grade != 0 and line.grade != 5:
280			class_stats["unclassified"] += 1
281			continue
282
283			if line.result is None or line.result < 0:
284			class_stats["unrated"] += 1
285			continue
286
287			class_stats["classified"] += 1
288
289			if line.grade == 0: # Line detected as garbage
290			if line.result == 1: # Line is clean
291			score_stats["FP"] += 1 # False positive
292			else: # Line is garbage
293			score_stats["TP"] += 1 # True postive
294			else: # Line detected as clean
295			if line.result == 1: # Line is clean
296			score_stats["TN"] += 1 # True negative
297			else: # Line is garbage
298			score_stats["FN"] += 1 # False negative
299
300			# Precision
301			divider_pr = score_stats["TP"] + score_stats["FP"]
302			if divider_pr != 0:
303			precision = score_stats["TP"] / divider_pr
304			else:
305			precision = 0
306
307			# Recall
308			divider_rc = score_stats["TP"] + score_stats["FN"]
309			if divider_rc != 0:
310			recall = score_stats["TP"] / divider_rc
311			else:
312			recall = 0
313
314			# F1 score
315			if precision + recall != 0:
316			f1 = 2 * precision * recall / (precision + recall)
317			else:
318			f1 = 0
319
320			return {
321			"class": class_stats,
322			"score": {
323			"precision": precision,
324			"recall": recall,
325			"f1": f1
326			},
327			"raw": score_stats
328			}
329
330
331			class Line(object):
332			"""Represents a line of text and provides datastructures to handle it.
333
334			Args:
335			string (str): Line to parse.
336			result (int): (**Optional**) Expected result for a line (either a garbage string or a clean line)
337
338			Attributes:
339			tokens (:func:`list`): List of tokens contained in the initial string. Every list element is a :func:`list` of
340			3 element organized in this order `(original_token, clean_token, corrected_token)`
341			pos_string (:func:`str`): Reference string containing the position of all the tokens
342			result (:func:`int` or :data:`None`): Expected result for a line. Helps compute fitness (F1 score) of the
343			algorithm
344			grade (:func:`int`): Grade of a line, between 0 (garbage string) and 5 (clean line).
345			stats (:func:`dict`): Dictionary containing two :class:`.Statistics` objects. Each of them compute the number of
346			**lower**, **upper** and **special** characters along with **numbers**.
347			"""
348
349			def __init__(self, string, result=None):
350			self.tokens = [[tkn, clean_head_tail(tkn), None] for tkn in tokenize(string)]
351
352			self.pos_string = string # String containing the position of each token (e.g. "%0 %1%2 ... %n")
353			for index, token in enumerate(self.tokens):
354			self.pos_string = self.pos_string.replace(token[0], "%"+str(index), 1)
355
356			self.result = None
357			if result is not None:
358			self.result = int(result)
359
360			if sum([len(t[1]) for t in self.tokens if not t[1] is None]) == 0:
361			self.grade = 0
362			else:
363			self.grade = 3
364
365			self.stats = {
366			"orig": Statistics(["lw_char", "up_char", "nb_char", "sp_char"]),
367			"clean": None
368			}
369
370			tmp_line = re.sub(r'[a-z]', 'a', self.get_orig_line()) # Lower chars replacement
371			tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement
372			tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement
373			tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement
374			line_stats = Counter(tmp_line)
375
376			self.stats["orig"].set_stat("lw_char", line_stats["a"])
377			self.stats["orig"].set_stat("up_char", line_stats["A"])
378			self.stats["orig"].set_stat("nb_char", line_stats["0"])
379			self.stats["orig"].set_stat("sp_char", line_stats["#"])
380
381			def raise_grade(self):
382			"""Add 1 to the grade of the line (up to 5)
383			"""
384			if self.grade < 5:
385			self.grade += 1
386
387			def decrease_grade(self):
388			"""Remove 1 to the grade of the line (down to 0)
389			"""
390			if self.grade > 0:
391			self.grade -= 1
392
393			def set_garbage(self):
394			"""Set the grade to 0
395			"""
396			self.grade = 0
397
398			def set_clean(self):
399			"""Set the grade to 5
400			"""
401			self.grade = 5
402
403			def get_orig_line(self):
404			"""Returns the original line
405
406			Returns:
407			str: Original line
408			"""
409			string = self.pos_string
410
411			for index, token in reversed(list(enumerate(self.tokens))):
412			string = string.replace("%"+str(index), token[0])
413
414			return string
415
416			def get_clean_line(self):
417			"""Returns the clean line
418
419			Returns:
420			str: Clean line
421			"""
422			string = self.pos_string
423
424			for index, token in reversed(list(enumerate(self.tokens))):
425			if not token[2] is None and len(token[2]) > 0:
426			string = string.replace("%"+str(index), token[2].keys()[0])
427			else: # Inline correction is not available
428			if not token[1] is None:
429			string = string.replace("%"+str(index), token[1])
430			else: # Clean token does not exist, use the original token
431			string = string.replace("%"+str(index), token[0])
432
433			return re.sub(" +", " ", string).strip()
434
435			def get_orig_stats(self):
436			"""Get original stats of the line
437
438			Returns:
439			Statistics: Statistics of the original line
440			"""
441			return self.stats["orig"]
442
443			def get_clean_stats(self):
444			"""Get clean stats of the line
445
446			Returns:
447			Statistics: Statistics of the clean line
448			"""
449			if self.stats["clean"] is None: # Compute clean stats if it is not already done
450			self.stats["clean"] = Statistics(["lw_char", "up_char", "nb_char", "sp_char"])
451
452			tmp_line = re.sub(r'[a-z]', 'a', self.get_clean_line()) # Lower chars replacement
453			tmp_line = re.sub(r'[A-Z]', 'A', tmp_line) # Upper chars replacement
454			tmp_line = re.sub(r'[0-9]', '0', tmp_line) # Numbers replacement
455			tmp_line = re.sub(r'[^a-zA-Z0-9 ]', '#', tmp_line) # Special chars replacement
456			line_stats = Counter(tmp_line)
457
458			self.stats["clean"].set_stat("lw_char", line_stats["a"])
459			self.stats["clean"].set_stat("up_char", line_stats["A"])
460			self.stats["clean"].set_stat("nb_char", line_stats["0"])
461			self.stats["clean"].set_stat("sp_char", line_stats["#"])
462
463			return self.stats["clean"]
464
465			def get_line_score(self):
466			"""Return a global score of the line
467
468			Returns:
469			float: Score of the line
470			"""
471			score = 0
472
473			if len(self.tokens) == 0:
474			return score
475
476			for token in [t[2] for t in self.tokens if not t[2] is None]:
477			score += mean([s for s in token.values()])
478
479			return score / len(self.tokens)
480
481			def __len__(self):
482			return len(self.get_orig_line())
483
484			def __str__(self):
485			return str(self.tokens) + " | " + str(self.grade)
486

usnistgov / ocr-pipeline

Branch — master (e214b7)

src.denoiser.text.Line A

Complexity

Size/Duplication

11 Methods

Duplication Side-by-Side

Filter issues like