1
|
|
|
"""Index for fast matching of token sets.""" |
2
|
|
|
|
3
|
|
|
import collections |
4
|
|
|
|
5
|
|
|
|
6
|
|
|
class TokenSet: |
7
|
|
|
"""Represents a set of tokens (expressed as integer token IDs) that can |
8
|
|
|
be matched with another set of tokens. A TokenSet can optionally |
9
|
|
|
be associated with a subject from the vocabulary.""" |
10
|
|
|
|
11
|
|
|
def __init__(self, tokens, subject_id=None, is_pref=False): |
12
|
|
|
self._tokens = set(tokens) |
13
|
|
|
self.subject_id = subject_id |
14
|
|
|
self.is_pref = is_pref |
15
|
|
|
|
16
|
|
|
def __len__(self): |
17
|
|
|
return len(self._tokens) |
18
|
|
|
|
19
|
|
|
def __iter__(self): |
20
|
|
|
return iter(self._tokens) |
21
|
|
|
|
22
|
|
|
def contains(self, other): |
23
|
|
|
"""Returns True iff the tokens in the other TokenSet are all |
24
|
|
|
included within this TokenSet.""" |
25
|
|
|
|
26
|
|
|
return other._tokens.issubset(self._tokens) |
27
|
|
|
|
28
|
|
|
def sample(self): |
29
|
|
|
"""Return an arbitrary token from this TokenSet, or None if empty""" |
30
|
|
|
try: |
31
|
|
|
return next(iter(self._tokens)) |
32
|
|
|
except StopIteration: |
33
|
|
|
return None |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
class TokenSetIndex: |
37
|
|
|
"""A searchable index of TokenSets (representing vocabulary terms)""" |
38
|
|
|
|
39
|
|
|
def __init__(self): |
40
|
|
|
self._index = collections.defaultdict(set) |
41
|
|
|
|
42
|
|
|
def __len__(self): |
43
|
|
|
return len(self._index) |
44
|
|
|
|
45
|
|
|
def add(self, tset): |
46
|
|
|
"""Add a TokenSet into this index""" |
47
|
|
|
token = tset.sample() |
48
|
|
|
if token is not None: |
49
|
|
|
self._index[token].add(tset) |
50
|
|
|
|
51
|
|
|
def search(self, tset): |
52
|
|
|
"""Return the TokenSets that are contained in the given TokenSet. |
53
|
|
|
The matches are returned as a list of (TokenSet, ambiguity) pairs |
54
|
|
|
where ambiguity is an integer indicating the number of other TokenSets |
55
|
|
|
that also match the same tokens.""" |
56
|
|
|
|
57
|
|
|
subj_tsets = {} |
58
|
|
|
subj_ambiguity = collections.Counter() |
59
|
|
|
|
60
|
|
|
for token in tset: |
61
|
|
|
for ts in self._index[token]: |
62
|
|
|
if not tset.contains(ts): |
63
|
|
|
continue |
64
|
|
|
if ts.subject_id not in subj_tsets or \ |
65
|
|
|
not subj_tsets[ts.subject_id].is_pref: |
66
|
|
|
subj_tsets[ts.subject_id] = ts |
67
|
|
|
|
68
|
|
|
for ts in subj_tsets.values(): |
69
|
|
|
for other in subj_tsets.values(): |
70
|
|
|
if ts == other: |
71
|
|
|
continue |
72
|
|
|
if other.contains(ts): |
73
|
|
|
subj_ambiguity.update([ts.subject_id]) |
74
|
|
|
|
75
|
|
|
return [(ts, subj_ambiguity[ts.subject_id]) |
76
|
|
|
for uri, ts in subj_tsets.items()] |
77
|
|
|
|