annif.lexical.tokenset.TokenSetIndex.add() - Code Metrics - Inspection of "New lexical backend MLLM" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#462)

by Osma

created 2021-03-22 15:04 UTC

annif.lexical.tokenset.TokenSetIndex.add() A

↳ Parent: annif.lexical.tokenset

Complexity

Conditions

Size

Total Lines	5
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	4
nop	2
dl	0
loc	5
rs	10
c	0
b	0
f	0

"""Index for fast matching of token sets."""

import collections


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for uri, ts in subj_tsets.items()]


1			"""Index for fast matching of token sets."""
2
3			import collections
4
5
6			class TokenSet:
7			"""Represents a set of tokens (expressed as integer token IDs) that can
8			be matched with another set of tokens. A TokenSet can optionally
9			be associated with a subject from the vocabulary."""
10
11			def __init__(self, tokens, subject_id=None, is_pref=False):
12			self._tokens = set(tokens)
13			self.subject_id = subject_id
14			self.is_pref = is_pref
15
16			def __len__(self):
17			return len(self._tokens)
18
19			def __iter__(self):
20			return iter(self._tokens)
21
22			def contains(self, other):
23			"""Returns True iff the tokens in the other TokenSet are all
24			included within this TokenSet."""
25
26			return other._tokens.issubset(self._tokens)
27
28			def sample(self):
29			"""Return an arbitrary token from this TokenSet, or None if empty"""
30			try:
31			return next(iter(self._tokens))
32			except StopIteration:
33			return None
34
35
36			class TokenSetIndex:
37			"""A searchable index of TokenSets (representing vocabulary terms)"""
38
39			def __init__(self):
40			self._index = collections.defaultdict(set)
41
42			def __len__(self):
43			return len(self._index)
44
45			def add(self, tset):
46			"""Add a TokenSet into this index"""
47			token = tset.sample()
48			if token is not None:
49			self._index[token].add(tset)
50
51			def search(self, tset):
52			"""Return the TokenSets that are contained in the given TokenSet.
53			The matches are returned as a list of (TokenSet, ambiguity) pairs
54			where ambiguity is an integer indicating the number of other TokenSets
55			that also match the same tokens."""
56
57			subj_tsets = {}
58			subj_ambiguity = collections.Counter()
59
60			for token in tset:
61			for ts in self._index[token]:
62			if not tset.contains(ts):
63			continue
64			if ts.subject_id not in subj_tsets or \
65			not subj_tsets[ts.subject_id].is_pref:
66			subj_tsets[ts.subject_id] = ts
67
68			for ts in subj_tsets.values():
69			for other in subj_tsets.values():
70			if ts == other:
71			continue
72			if other.contains(ts):
73			subj_ambiguity.update([ts.subject_id])
74
75			return [(ts, subj_ambiguity[ts.subject_id])
76			for uri, ts in subj_tsets.items()]
77

NatLibFi / Annif

Pull Request — master (#462)

annif.lexical.tokenset.TokenSetIndex.add() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like