Passed · Push to master (c17a7d...ea11a0) by Osma · 02:27 (queued 13s)

annif.lexical.tokenset.TokenSet.contains()   Rating: A

Complexity
    Conditions: 1

Size
    Total Lines: 5
    Code Lines: 2

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc           1
eloc         2
nop          2
dl           0
loc          5
rs          10
c            0
b            0
f            0
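
For context, the rated function is a single-expression subset test, which is why the report counts one condition (cc 1) and two parameters (nop 2, self and other). A minimal sketch of its semantics, with token IDs invented for the illustration:

# contains() is plain subset semantics: no branching, hence cc 1.
ts_big = TokenSet([1, 2, 3])
ts_small = TokenSet([1, 2])
assert ts_big.contains(ts_small)        # {1, 2} is a subset of {1, 2, 3}
assert not ts_small.contains(ts_big)    # the reverse does not hold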
"""Index for fast matching of token sets."""

import collections


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def _find_subj_tsets(self, tset):
        """return a dict (subject_id : TokenSet) of matches contained in the
        given TokenSet"""

        subj_tsets = {}

        for token in tset:
            for ts in self._index[token]:
                if tset.contains(ts) \
                   and (ts.subject_id not in subj_tsets
                        or not subj_tsets[ts.subject_id].is_pref):
                    subj_tsets[ts.subject_id] = ts

        return subj_tsets

    def _find_subj_ambiguity(self, tsets):
        """calculate the ambiguity values (the number of other TokenSets
        that also match the same tokens) for the given TokenSets and return
        them as a dict-like object (subject_id : ambiguity_value)"""

        subj_ambiguity = collections.Counter()

        subj_ambiguity.update([ts.subject_id
                               for ts in tsets
                               for other in tsets
                               if ts != other
                               and other.contains(ts)])

        return subj_ambiguity

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = self._find_subj_tsets(tset)
        subj_ambiguity = self._find_subj_ambiguity(subj_tsets.values())

        return [(ts, subj_ambiguity[subject_id])
                for subject_id, ts in subj_tsets.items()]