1
|
|
|
"""Package containing indicators lists |
2
|
|
|
|
3
|
|
|
.. Authors: |
4
|
|
|
Philippe Dessauw |
5
|
|
|
[email protected] |
6
|
|
|
|
7
|
|
|
.. Sponsor: |
8
|
|
|
Alden Dima |
9
|
|
|
[email protected] |
10
|
|
|
Information Systems Group |
11
|
|
|
Software and Systems Division |
12
|
|
|
Information Technology Laboratory |
13
|
|
|
National Institute of Standards and Technology |
14
|
|
|
http://www.nist.gov/itl/ssd/is |
15
|
|
|
""" |
16
|
|
|
from __future__ import division |
17
|
|
|
from denoiser.models.indicators import * |
18
|
|
|
|
19
|
|
|
|
20
|
|
|
class IndicatorsList(object):
    """Object handling a list of indicators sharing the same purpose

    Attributes:
        indicators (list): Indicators registered through `add_indicator`, kept in insertion order
    """

    def __init__(self):
        self.indicators = []

    def add_indicator(self, indicator):
        """Add an indicator to the list

        Args:
            indicator (Indicator): Indicator to add to the list
        """
        self.indicators.append(indicator)

    def set_stats(self, stats):
        """Set stats for all the StatsIndicator

        Indicators that are not instances of `StatsIndicator` are left untouched.

        Args:
            stats (Statistics): Text statistics to setup
        """
        for indicator in self.indicators:
            # isinstance (rather than comparing __class__.__base__) also covers
            # indirect subclasses and classes using multiple inheritance.
            if isinstance(indicator, StatsIndicator):
                indicator.set_stats(stats)

    def match(self, line):
        """Define if a line is matching the indicators

        Args:
            line (Line): Input line

        Returns:
            bool: True if line match at least one indicator (always False when the list is empty)
        """
        return self.match_rate(line) > 0

    def match_rate(self, line):
        """Get the ratio of match of a line

        Args:
            line (Line): Input line

        Returns:
            float: Ratio of match / number of indicators, or 0.0 when no indicator is registered
        """
        total_ind = len(self.indicators)

        # An empty list cannot match anything; returning early avoids a ZeroDivisionError.
        if total_ind == 0:
            return 0.0

        matching_ind = sum(1 for indicator in self.indicators if indicator.match(line))

        # float() keeps true division even without `from __future__ import division`.
        return matching_ind / float(total_ind)
73
|
|
|
|
74
|
|
|
|
75
|
|
|
class StrongIndicatorList(IndicatorsList):
    """Indicator list pre-populated with the strong indicators (detecting garbage strings)

    On construction, one `AlphaNumIndicator` and one `CardinalNumberIndicator`
    are registered, in that order.
    """

    def __init__(self):
        super(StrongIndicatorList, self).__init__()

        # Instantiate and register each strong indicator in declaration order.
        for indicator_cls in (AlphaNumIndicator, CardinalNumberIndicator):
            self.add_indicator(indicator_cls())
84
|
|
|
|
85
|
|
|
|
86
|
|
|
class CleanIndicatorList(IndicatorsList):
    """Indicator list pre-populated with the indicators detecting clean lines

    On construction, one `CleanTextIndicator` and one `TitleIndicator` are
    registered, in that order.
    """

    def __init__(self):
        super(CleanIndicatorList, self).__init__()

        # Instantiate and register each clean-line indicator in declaration order.
        for indicator_cls in (CleanTextIndicator, TitleIndicator):
            self.add_indicator(indicator_cls())
95
|
|
|
|