1
|
|
|
# Copyright 2018-2020 by Christopher C. Little. |
2
|
|
|
# This file is part of Abydos. |
3
|
|
|
# |
4
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
5
|
|
|
# it under the terms of the GNU General Public License as published by |
6
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
7
|
|
|
# (at your option) any later version. |
8
|
|
|
# |
9
|
|
|
# Abydos is distributed in the hope that it will be useful, |
10
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
11
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12
|
|
|
# GNU General Public License for more details. |
13
|
|
|
# |
14
|
|
|
# You should have received a copy of the GNU General Public License |
15
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
16
|
|
|
|
17
|
|
|
"""abydos.distance._softy_cosine. |
18
|
|
|
|
19
|
1 |
|
Soft Cosine similarity & distance |
20
|
|
|
""" |
21
|
|
|
|
22
|
|
|
from typing import Any, Optional, cast |
23
|
|
|
|
24
|
1 |
|
from ._distance import _Distance |
25
|
|
|
from ._levenshtein import Levenshtein |
26
|
|
|
from ._token_distance import _TokenDistance |
27
|
|
|
from ..tokenizer import _Tokenizer |
28
|
|
|
|
29
|
|
|
__all__ = ['SoftCosine'] |
30
|
|
|
|
31
|
1 |
|
|
32
|
1 |
|
class SoftCosine(_TokenDistance):
    r"""Soft Cosine similarity.

    As described in :cite:`Sidorov:2014`, soft cosine similarity of two
    multi-sets X and Y, drawn from an alphabet S, is

        .. math::

            sim_{soft cosine}(X, Y) =
            \frac{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i Y_j}
            {\sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i X_j}
            \sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} Y_i Y_j}}

    where :math:`s_{ij}` is the similarity of two tokens, by default a function
    of Levenshtein distance: :math:`\frac{1}{1+Levenshtein\_distance(i, j)}`.

    Notes
    -----
    This class implements soft cosine similarity, as defined by
    :cite:`Sidorov:2014`. An alternative formulation of soft cosine similarity
    using soft (multi-)sets is provided by the :class:`Cosine` class using
    intersection_type=``soft``, based on the soft intersection
    defined in :cite:`Russ:2014`.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        metric: Optional[_Distance] = None,
        sim_method: str = 'a',
        **kwargs: Any
    ) -> None:
        r"""Initialize SoftCosine instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package, defaulting to the QGrams tokenizer with q=4
        metric : _Distance
            A distance instance from the abydos.distance package, defaulting
            to Levenshtein distance
        sim_method : str
            Selects the similarity method from the four given in
            :cite:`Sidorov:2014`:

                - ``a`` : :math:`\frac{1}{1+d}`
                - ``b`` : :math:`1-\frac{d}{m}`
                - ``c`` : :math:`\sqrt{1-\frac{d}{m}}`
                - ``d`` : :math:`\Big(1-\frac{d}{m}\Big)^2`

            Where :math:`d` is the distance (Levenshtein by default) and
            :math:`m` is the maximum length of the two tokens. Option `a` is
            default, as suggested by the paper.
        **kwargs
            Arbitrary keyword arguments

        Raises
        ------
        ValueError
            sim_method must be one of 'a', 'b', 'c', or 'd'

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(SoftCosine, self).__init__(tokenizer, **kwargs)
        # Fail fast on a bad method selector before storing any parameters.
        if sim_method not in {'a', 'b', 'c', 'd'}:
            raise ValueError("sim_method must be one of 'a', 'b', 'c', or 'd'")
        self.params['metric'] = metric if metric is not None else Levenshtein()
        self.params['sim_method'] = sim_method

    def _sim_a(self, src: str, tar: str) -> float:
        # Method ``a``: 1 / (1 + d)
        return 1 / (1 + cast(float, self.params['metric'].dist_abs(src, tar)))

    def _sim_b(self, src: str, tar: str) -> float:
        # Method ``b``: 1 - d/m, where m is the longer token's length
        return 1 - (
            cast(float, self.params['metric'].dist_abs(src, tar))
            / max(len(src), len(tar))
        )

    def _sim_c(self, src: str, tar: str) -> float:
        # Method ``c``: sqrt(1 - d/m)
        return (
            1
            - (
                cast(float, self.params['metric'].dist_abs(src, tar))
                / max(len(src), len(tar))
            )
        ) ** 0.5

    def _sim_d(self, src: str, tar: str) -> float:
        # Method ``d``: (1 - d/m)^2
        return (
            1
            - (
                cast(float, self.params['metric'].dist_abs(src, tar))
                / max(len(src), len(tar))
            )
        ) ** 2

    def sim(self, src: str, tar: str) -> float:
        r"""Return the Soft Cosine similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Soft Cosine similarity

        Examples
        --------
        >>> cmp = SoftCosine()
        >>> cmp.sim('cat', 'hat')
        0.8750000000000001
        >>> cmp.sim('Niall', 'Neil')
        0.8844691709074513
        >>> cmp.sim('aluminum', 'Catalan')
        0.831348688760277
        >>> cmp.sim('ATCG', 'TAGC')
        0.8571428571428572


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        if not self._src_card() or not self._tar_card():
            return 0.0

        # Resolve the token-similarity function once, rather than on every
        # pass through the O(|S|^2) loops below.
        token_sim = {
            'a': self._sim_a,
            'b': self._sim_b,
            'c': self._sim_c,
            'd': self._sim_d,
        }[self.params['sim_method']]

        def _weighted_sum(left: Any, right: Any) -> float:
            # sum_{i in left} sum_{j in right} left_i * right_j * s_ij,
            # accumulated in the same nested-loop order as the formula.
            total = 0.0
            for i in left.keys():
                for j in right.keys():
                    total += left[i] * right[j] * token_sim(i, j)
            return total

        nom = _weighted_sum(self._src_tokens, self._tar_tokens)
        denom_left = _weighted_sum(self._src_tokens, self._src_tokens)
        denom_right = _weighted_sum(self._tar_tokens, self._tar_tokens)

        return nom / (denom_left ** 0.5 * denom_right ** 0.5)
219
|
|
|
|
220
|
|
|
|
221
|
|
|
if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script.
    from doctest import testmod

    testmod()
225
|
|
|
|