CleanTextIndicator - Code Metrics - usnistgov/ocr-pipeline - Measure and Improve Code Quality continuously with Scrutinizer

CleanTextIndicator A
last analyzed 2017-09-28 14:20 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	16
Duplicated Lines	0 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
c	1
b	0
f	1
dl	0
loc	16
rs	10
wmc	3

2 Methods

Rating	Name	Duplication	Size	Complexity
A	__init__()	0	4	1
A	match()	0	7	2

"""List of all the different indicators used to clean a text

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
import re


class StatsIndicator(object):
    """Base indicator that keeps a reference to text statistics

    The statistics are only stored by this class; the default `match` does not read them.
    """

    def __init__(self, text_stats=None):
        """Create the indicator

        Args:
            text_stats (`Statistics`): Text statistics, can also be given afterwards with `set_stats`
        """
        self.stats = text_stats

    def set_stats(self, text_stats):
        """Replace the statistics currently held by the indicator

        Args:
            text_stats (`Statistics`): Text statistics
        """
        self.stats = text_stats

    def match(self, line):
        """Tell whether a line is matching the rules of the indicator

        Args:
            line (Line): Input line (not inspected by this default implementation)

        Returns:
            bool: Always True
        """
        return True


class RegexIndicator(object):
    """Indicator based on a regexp (match the line with the given regexp)

    The regexp has to describe the entire clean line: it is anchored at both ends before being used.
    """

    def __init__(self, regexp):
        """Build the anchored pattern

        Args:
            regexp (str): Regular expression describing the whole line, without '^' and '$'
        """
        # The non-capturing group makes '^' and '$' apply to the complete expression. Without it, a
        # top-level alternation such as "a|b" would be read as "^a" OR "b$" and accept partial lines.
        self.regexp = '^(?:' + regexp + ')$'

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line, its clean version (`get_clean_line`) is tested

        Returns:
            bool: True if line match the RegExp, false otherwise
        """
        # re.match yields a match object or None; convert it to the documented boolean.
        return re.match(self.regexp, line.get_clean_line()) is not None


# ==========================================
# STRONG INDICATORS
# ==========================================

class AlphaNumIndicator(StatsIndicator):
    """Indicator detecting lines mostly made of special chars

    A line matches when it is empty, or when its 'sp_char' statistic divided by its length is
    strictly above `spchar_rate`.
    """

    def __init__(self, stats=None):
        """Create the indicator

        Args:
            stats (`Statistics`): Text statistics forwarded to the parent class
        """
        super(AlphaNumIndicator, self).__init__(stats)
        # Ratio of special chars above which the line is matched
        self.spchar_rate = 0.6

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line

        Returns:
            bool: True for an empty line or when the special char ratio exceeds `spchar_rate`
        """
        line_length = len(line)

        # Empty lines are matched right away, which also avoids dividing by zero
        if line_length == 0:
            return True

        special_chars = line.get_clean_stats().get_stat('sp_char')
        return special_chars / line_length > self.spchar_rate


class CardinalNumberIndicator(RegexIndicator):
    """Indicator detecting cardinal numbers

    The whole line has to be made of digits plus a few letters, punctuation marks and spaces.
    NOTE(review): the accepted letters are presumably frequent OCR misreadings of digits - to confirm.
    """

    def __init__(self):
        """Create the indicator with its fixed pattern"""
        cardinal_pattern = "[0-9efEaAoOsSt.,= \\-]+"
        super(CardinalNumberIndicator, self).__init__(cardinal_pattern)


# ==========================================
# CLEAN INDICATORS
# ==========================================

class CleanTextIndicator(StatsIndicator):
    """Indicator detecting a clean line

    A line is clean when it is long enough compared to the "line_avg_length" text statistic and when
    its 'lw_char' or its 'up_char' statistic, divided by the line length, is above `char_rate`.
    NOTE(review): 'lw_char' / 'up_char' are presumably the lower / upper case char counts - to confirm.
    """

    def __init__(self, stats=None):
        """Create the indicator

        Args:
            stats (`Statistics`): Text statistics forwarded to the parent class, read by `match`
        """
        super(CleanTextIndicator, self).__init__(stats)
        # Fraction of the average line length a line has to reach
        self.max_length_rate = 0.5
        # Ratio of chars (lower or upper) a line has to exceed
        self.char_rate = 0.6

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line

        Returns:
            bool: True if the line is long enough and dominated by 'lw_char' or 'up_char', false otherwise
        """
        length = len(line)

        # An empty line is never clean (and would make the ratios below undefined)
        if length == 0:
            return False

        # Length rule is checked first: the line statistics are not requested for short lines
        min_length = self.stats.get_stat("line_avg_length") * self.max_length_rate
        if not float(length) >= min_length:
            return False

        lower_ratio = line.get_clean_stats().get_stat('lw_char') / length
        if lower_ratio > self.char_rate:
            return True

        upper_ratio = line.get_clean_stats().get_stat('up_char') / length
        return upper_ratio > self.char_rate


class TitleIndicator(RegexIndicator):
    """Indicator matching a title

    A title is a line beginning with an upper char (A-Z) followed by at least one lower char or space.
    """

    def __init__(self):
        """Create the indicator with its fixed pattern"""
        title_pattern = "[A-Z][a-z ]+"
        super(TitleIndicator, self).__init__(title_pattern)


1			"""List of all the different indicators used to clean a text
2
3			.. Authors:
4			Philippe Dessauw
5			[email protected]
6
7			.. Sponsor:
8			Alden Dima
9			[email protected]
10			Information Systems Group
11			Software and Systems Division
12			Information Technology Laboratory
13			National Institute of Standards and Technology
14			http://www.nist.gov/itl/ssd/is
15			"""
16			from __future__ import division
17			import re
18
19
20			class StatsIndicator(object):
21			"""Indicator based on statistics (match the line depending on the stats)
22			"""
23
24			def __init__(self, text_stats=None):
25			self.stats = text_stats
26
27			def set_stats(self, text_stats):
28			"""Set statistics of the indicator based on text statistics
29
30			Args:
31			text_stats (`Statistics`): Text statistics
32			"""
33			self.stats = text_stats
34
35			def match(self, line):
36			"""Define if a line is matching the rules
37
38			Args:
39			line (Line): Input line
40
41			Returns:
42			bool: True
43			"""
44			return True
45
46
47			class RegexIndicator(object):
48			"""Indicator based on a regexp (match the line with the given regexp)
49			"""
50
51			def __init__(self, regexp):
52			self.regexp = '^'+regexp+'$'
53
54			def match(self, line):
55			"""Define if a line is matching the rules
56
57			Args:
58			line (Line): Input line
59
60			Returns:
61			bool: True if line match the RegExp, false otherwise
62			"""
63			return re.match(self.regexp, line.get_clean_line())
64
65
66			# ==========================================
67			# STRONG INDICATORS
68			# ==========================================
69
70			class AlphaNumIndicator(StatsIndicator):
71			"""Indicator detecting a high number of special chars
72			"""
73
74			def __init__(self, stats=None):
75			self.spchar_rate = 0.6
76			super(AlphaNumIndicator, self).__init__(stats)
77
78			def match(self, line):
79			return True if len(line) == 0 else line.get_clean_stats().get_stat('sp_char') / len(line) > self.spchar_rate
80
81
82			class CardinalNumberIndicator(RegexIndicator):
83			"""Indicator detecting cardinal numbers
84			"""
85
86			def __init__(self):
87			super(CardinalNumberIndicator, self).__init__("[0-9efEaAoOsSt.,= \\-]+")
88
89
90			# ==========================================
91			# CLEAN INDICATORS
92			# ==========================================
93
94			class CleanTextIndicator(StatsIndicator):
95			"""Indicator detecting a clean line
96			"""
97
98			def __init__(self, stats=None):
99			self.max_length_rate = 0.5
100			self.char_rate = 0.6
101			super(CleanTextIndicator, self).__init__(stats)
102
103			def match(self, line):
104			if len(line) == 0:
105			return False
106
107			return float(len(line)) >= self.stats.get_stat("line_avg_length") * self.max_length_rate \
108			and (line.get_clean_stats().get_stat('lw_char') / len(line) > self.char_rate
109			or line.get_clean_stats().get_stat('up_char') / len(line) > self.char_rate)
110
111
112			class TitleIndicator(RegexIndicator):
113			"""Indicator matching a title. A title is a line beginning with an upper char and followed by lower chars or space
114			"""
115
116			def __init__(self):
117			super(TitleIndicator, self).__init__("[A-Z][a-z ]+")
118

usnistgov / ocr-pipeline

CleanTextIndicator A last analyzed 2017-09-28 14:20 UTC

Complexity

Size/Duplication

Importance

2 Methods

Duplication Side-by-Side

Filter issues like

CleanTextIndicator A
last analyzed 2017-09-28 14:20 UTC