1
|
|
|
"""Index for fast matching of token sets.""" |
2
|
|
|
|
3
|
|
|
import collections |
4
|
|
|
|
5
|
|
|
|
6
|
|
|
class TokenSet:
    """A set of tokens (expressed as integer token IDs) that can be
    compared against another such set, optionally carrying the
    vocabulary subject it stands for."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        # Tokens are stored as a set for fast subset tests; the first
        # token of the input sequence doubles as the key under which a
        # TokenSetIndex files this TokenSet (None when empty).
        self._tokens = set(tokens)
        self.key = tokens[0] if tokens else None
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Return True iff every token of the other TokenSet is also
        present in this TokenSet."""

        return self._tokens.issuperset(other._tokens)
28
|
|
|
|
29
|
|
|
|
30
|
|
|
class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        # Maps a token ID (a TokenSet's key) to the set of TokenSets
        # filed under that key.
        self._index = collections.defaultdict(set)

    def __len__(self):
        # Counts distinct index keys, not the total number of stored
        # TokenSets (several TokenSets may share one key).
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index."""

        # An empty TokenSet has no key and cannot be indexed.
        if tset.key is not None:
            self._index[tset.key].add(tset)

    def _find_subj_tsets(self, tset):
        """Return a dict (subject_id : TokenSet) of matches contained in the
        given TokenSet."""

        subj_tsets = {}

        for token in tset:
            # Use .get() rather than subscripting: indexing a defaultdict
            # inserts an empty set for every unseen token, so a plain
            # self._index[token] lookup here would permanently grow the
            # index as a side effect of every search.
            for ts in self._index.get(token, []):
                # Keep the first match per subject; let a later match
                # replace an earlier one only when the earlier one is not
                # a preferred (is_pref) TokenSet.
                if tset.contains(ts) \
                        and (ts.subject_id not in subj_tsets
                             or not subj_tsets[ts.subject_id].is_pref):
                    subj_tsets[ts.subject_id] = ts

        return subj_tsets

    def _find_subj_ambiguity(self, tsets):
        """Calculate the ambiguity values (the number of other TokenSets
        that also match the same tokens) for the given TokenSets and return
        them as a dict-like object (subject_id : ambiguity_value)."""

        subj_ambiguity = collections.Counter()

        # A TokenSet scores one point of ambiguity for each *different*
        # TokenSet whose tokens fully cover its own.
        subj_ambiguity.update([ts.subject_id
                               for ts in tsets
                               for other in tsets
                               if ts != other
                               and other.contains(ts)])

        return subj_ambiguity

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = self._find_subj_tsets(tset)
        subj_ambiguity = self._find_subj_ambiguity(subj_tsets.values())

        return [(ts, subj_ambiguity[subject_id])
                for subject_id, ts in subj_tsets.items()]
85
|
|
|
|