CleanTextIndicator   A
last analyzed

Complexity

Total Complexity 3

Size/Duplication

Total Lines 16
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 16
rs 10
wmc 3

2 Methods

Rating   Name   Duplication   Size   Complexity  
A __init__() 0 4 1
A match() 0 7 2
"""List of all the different indicators used to clean a text

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
17
import re
18
19
20
class StatsIndicator(object):
    """Indicator based on statistics (match the line depending on the stats)

    This base class only stores the statistics object and accepts every
    line; subclasses override ``match`` to implement an actual rule.
    """

    def __init__(self, text_stats=None):
        """Create the indicator

        Args:
            text_stats (`Statistics`): Text statistics, or None when they are
                provided later through ``set_stats``
        """
        self.stats = text_stats

    def set_stats(self, text_stats):
        """Set statistics of the indicator based on text statistics

        Args:
            text_stats (`Statistics`): Text statistics
        """
        self.stats = text_stats

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line (not inspected by the base indicator)

        Returns:
            bool: Always True
        """
        return True
45
46
47
class RegexIndicator(object):
    """Indicator based on a regexp (match the line with the given regexp)
    """

    def __init__(self, regexp):
        """Create the indicator

        Args:
            regexp (str): Pattern that the whole cleaned line has to match
        """
        # Group the pattern before anchoring it, so that a top-level
        # alternation ("a|b") is anchored as a whole rather than being read
        # as "^a" or "b$".
        self.regexp = '^(?:' + regexp + ')$'

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line; its ``get_clean_line()`` text is tested

        Returns:
            bool: True if line match the RegExp, false otherwise
        """
        # re.match returns a match object or None; convert it to the boolean
        # promised by the documented contract.
        return re.match(self.regexp, line.get_clean_line()) is not None
64
65
66
# ==========================================
# STRONG INDICATORS
# ==========================================
69
70
class AlphaNumIndicator(StatsIndicator):
    """Indicator detecting a high number of special chars
    """

    def __init__(self, stats=None):
        """Create the indicator

        Args:
            stats (`Statistics`): Text statistics, forwarded to the parent
        """
        # Ratio of 'sp_char' over the line length above which a line matches
        self.spchar_rate = 0.6
        super(AlphaNumIndicator, self).__init__(stats)

    def match(self, line):
        """Define if a line is matching the rules

        Args:
            line (Line): Input line

        Returns:
            bool: True if the line is empty, or if its 'sp_char' clean
                statistic divided by its length is strictly greater than
                ``spchar_rate``
        """
        line_length = len(line)
        if line_length == 0:
            # An empty line matches, and must not reach the division below
            return True

        sp_char_count = line.get_clean_stats().get_stat('sp_char')
        # True division is expected here (the module imports it from
        # __future__)
        return sp_char_count / line_length > self.spchar_rate
80
81
82
class CardinalNumberIndicator(RegexIndicator):
    """Indicator detecting cardinal numbers

    A line matches when it is made only of digits, the letters
    ``e f E a A o O s S t``, dots, commas, equal signs, spaces and hyphens.
    """

    def __init__(self):
        # NOTE(review): the accepted letters are presumably the ones that get
        # mistaken for digits in the processed text -- confirm with the author.
        super(CardinalNumberIndicator, self).__init__("[0-9efEaAoOsSt.,= \\-]+")
88
89
90
# ==========================================
# CLEAN INDICATORS
# ==========================================
93
94
class CleanTextIndicator(StatsIndicator):
    """Indicator detecting a clean line
    """

    def __init__(self, stats=None):
        """Create the indicator

        Args:
            stats (`Statistics`): Text statistics, forwarded to the parent
        """
        # Share of the "line_avg_length" statistic a line has to reach
        self.max_length_rate = 0.5
        # Ratio of 'lw_char' (or 'up_char') over the line length above which
        # a line matches
        self.char_rate = 0.6
        super(CleanTextIndicator, self).__init__(stats)

    def match(self, line):
        """Define if a line is matching the rules

        Statistics must have been provided (constructor or ``set_stats``)
        before calling this method on a non-empty line: ``self.stats`` is
        dereferenced and an AttributeError is raised if it is still None.

        Args:
            line (Line): Input line

        Returns:
            bool: False for an empty line. Otherwise True when the line is at
                least ``max_length_rate`` times the "line_avg_length"
                statistic long and either its 'lw_char' or its 'up_char'
                clean statistic (presumably lower-case and upper-case
                character counts), divided by the line length, is strictly
                greater than ``char_rate``
        """
        line_length = len(line)
        if line_length == 0:
            return False

        # Length criterion: reject lines that do not reach the minimum length
        min_length = self.stats.get_stat("line_avg_length") * self.max_length_rate
        if not float(line_length) >= min_length:
            return False

        # Character criterion: fetch the per-line statistics only once
        clean_stats = line.get_clean_stats()
        return (clean_stats.get_stat('lw_char') / line_length > self.char_rate
                or clean_stats.get_stat('up_char') / line_length > self.char_rate)
110
111
112
class TitleIndicator(RegexIndicator):
    """Indicator matching a title. A title is a line beginning with an upper char and followed by lower chars or space

    The pattern requires one ``A-Z`` character followed by at least one
    ``a-z`` character or space, and nothing else on the line.
    """

    def __init__(self):
        super(TitleIndicator, self).__init__("[A-Z][a-z ]+")
118