Passed: Pull Request — master (#518) by Osma, created 02:07

annif.lexical.tokenset.TokenSet.sample() (grade A)

Complexity
    Conditions: 2

Size
    Total Lines: 6
    Code Lines: 5

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc       2
eloc     5
nop      1
dl       0
loc      6
rs       10
c        0
b        0
f        0
"""Index for fast matching of token sets."""

import collections


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.key = tokens[0] if len(tokens) else None
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        if tset.key is not None:
            self._index[tset.key].add(tset)

    def _find_subj_tsets(self, tset):
        """return a dict (subject_id : TokenSet) of matches contained in the
        given TokenSet"""

        subj_tsets = {}

        for token in tset:
            for ts in self._index[token]:
                if tset.contains(ts) \
                   and (ts.subject_id not in subj_tsets
                        or not subj_tsets[ts.subject_id].is_pref):
                    subj_tsets[ts.subject_id] = ts

        return subj_tsets

    def _find_subj_ambiguity(self, tsets):
        """calculate the ambiguity values (the number of other TokenSets
        that also match the same tokens) for the given TokenSets and return
        them as a dict-like object (subject_id : ambiguity_value)"""

        subj_ambiguity = collections.Counter()

        subj_ambiguity.update([ts.subject_id
                               for ts in tsets
                               for other in tsets
                               if ts != other
                               and other.contains(ts)])

        return subj_ambiguity

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = self._find_subj_tsets(tset)
        subj_ambiguity = self._find_subj_ambiguity(subj_tsets.values())

        return [(ts, subj_ambiguity[subject_id])
                for subject_id, ts in subj_tsets.items()]
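
For context, here is a minimal usage sketch of the two classes above. The token IDs and subject IDs are invented for illustration; in Annif they would come from the analyzer and the vocabulary, respectively.

# Hypothetical example: token IDs 1-4 and subject IDs 0-1 are made up.
index = TokenSetIndex()
index.add(TokenSet([1, 2], subject_id=0, is_pref=True))
index.add(TokenSet([1, 2, 3], subject_id=1))

# A document is also expressed as a TokenSet; both vocabulary entries
# above are subsets of it, so both are returned by search().
document = TokenSet([1, 2, 3, 4])
for tset, ambiguity in index.search(document):
    print(tset.subject_id, ambiguity)
# Subject 0's match ({1, 2}) is contained in subject 1's match ({1, 2, 3}),
# so subject 0 gets ambiguity 1 while subject 1 gets ambiguity 0.

Note the design choice in add(): each TokenSet is indexed only under its first token (its key), so search() only has to inspect the index buckets for tokens that actually occur in the query rather than scanning every vocabulary entry; full containment is then verified with contains().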