Passed · Pull Request — master (#462) · by Osma · created 02:32

annif.lexical.tokenset.TokenSet.__len__()   Grade: A

Complexity
    Conditions: 1

Size
    Total Lines: 2
    Code Lines: 2

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
cc      1
eloc    2
nop     1
dl      0
loc     2
rs      10
c       0
b       0
f       0
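The cc value of 1 is simply the McCabe cyclomatic complexity of a branchless method. This page does not say which tool produced the metrics, but as an independent check the same number can be computed with the radon library (named here as an illustration, not as the report's tooling):

from radon.complexity import cc_visit

# A standalone snippet mirroring the method under review; the code is
# only parsed for the metric, never executed, so the undefined
# self._tokens does not matter.
source = '''
def __len__(self):
    return len(self._tokens)
'''

for block in cc_visit(source):
    print(block.name, block.complexity)  # prints: __len__ 1

The full annif.lexical.tokenset module under review follows.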
"""Index for fast matching of token sets."""

import collections


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}  # subject_id -> best matching TokenSet
        subj_ambiguity = collections.Counter()

        # First pass: collect the TokenSets contained in the given one,
        # keeping one per subject and preferring preferred-label matches.
        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        # Second pass: for each match, count how many of the other matches
        # contain the same tokens, i.e. how ambiguous this match is.
        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]
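To make the classes above concrete, here is a small usage sketch. The integer token IDs, subject IDs, and the word glosses in the comments are invented for illustration; in Annif they would come from the analyzer and the vocabulary.

# Hypothetical token IDs: 1 = "machine", 2 = "learning", 3 = "deep"
index = TokenSetIndex()
index.add(TokenSet([1, 2], subject_id=100, is_pref=True))  # "machine learning"
index.add(TokenSet([2, 3], subject_id=200, is_pref=True))  # "deep learning"
index.add(TokenSet([2], subject_id=300, is_pref=True))     # "learning"

# A document that mentions all three tokens matches every term above.
document = TokenSet([1, 2, 3])
for ts, ambiguity in index.search(document):
    print(ts.subject_id, ambiguity)

# Subjects 100 and 200 match with ambiguity 0; the single-token match
# for subject 300 is contained in both of the other matches, so it is
# reported with ambiguity 2.

Note the design choice in add(): each TokenSet is indexed under a single sampled token rather than under all of its tokens. That keeps the index compact, and search() stays correct, because any TokenSet contained in the query necessarily has its sampled token among the query's tokens.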