abydos.distance._tversky.sim_tversky() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.distance._tversky.sim_tversky() A

↳ Parent: abydos.distance._tversky

Complexity

Conditions

Size

Total Lines	38
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	6
dl	0
loc	38
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._tversky.

Tversky index
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from ._token_distance import _TokenDistance

__all__ = ['Tversky', 'dist_tversky', 'sim_tversky']


class Tversky(_TokenDistance):
    r"""Tversky index.

    The Tversky index :cite:`Tversky:1977` is defined as:
    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|}
    {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`.

    :math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\alpha = \beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\alpha > \beta` emphasizes the contributions of X over Y
        - :math:`\alpha < \beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\alpha, \beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\alpha, \beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated
    by specifying a bias parameter, in which case the value computed is:
    :math:`\frac{|X \cap Y| + bias}
    {\beta(\alpha a + (1 - \alpha) b) + |X \cap Y| + bias}`, where
    :math:`a = min(|X - Y|, |Y - X|)` and :math:`b = max(|X - Y|, |Y - X|)`.
    """

    def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None):
        """Return the Tversky index of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison
        qval : int
            The length of each q-gram; 0 for non-q-gram version
        alpha : float
            Tversky index parameter as described above; in the symmetric
            variant it weights the smaller of the two set differences
            against the larger one
        beta : float
            Tversky index parameter as described above; in the symmetric
            variant it scales the combined set-difference term
        bias : float
            The symmetric Tversky index bias parameter; None (the default)
            selects the standard, asymmetric index

        Returns
        -------
        float
            Tversky similarity. 1.0 is returned when src equals tar, and 0.0
            when either input (or its q-gram collection) is empty or when
            the denominator of the index evaluates to zero.

        Raises
        ------
        ValueError
            Unsupported weight assignment; alpha and beta must be greater than
            or equal to 0.

        Examples
        --------
        >>> cmp = Tversky()
        >>> cmp.sim('cat', 'hat')
        0.3333333333333333
        >>> cmp.sim('Niall', 'Neil')
        0.2222222222222222
        >>> cmp.sim('aluminum', 'Catalan')
        0.0625
        >>> cmp.sim('ATCG', 'TAGC')
        0.0
        >>> cmp.sim('ATCG', 'TAGC', alpha=0, beta=0)
        0.0

        """
        if alpha < 0 or beta < 0:
            raise ValueError(
                'Unsupported weight assignment; alpha and beta '
                'must be greater than or equal to 0.'
            )

        # Trivial cases that need no tokenization.
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        q_src, q_tar = self._get_qgrams(src, tar, qval)
        # Nothing to compare if either side produced no q-grams.
        if not q_src or not q_tar:
            return 0.0

        # Multiset cardinalities: the shared q-grams and those unique to
        # each side.
        common = sum((q_src & q_tar).values())
        src_only = sum(q_src.values()) - common
        tar_only = sum(q_tar.values()) - common

        if bias is None:
            # Standard (asymmetric) Tversky index.
            numerator = common
            denominator = common + alpha * src_only + beta * tar_only
        else:
            # Symmetric variant: blend the smaller and larger difference.
            numerator = common + bias
            denominator = (
                beta
                * (
                    alpha * min(src_only, tar_only)
                    + (1 - alpha) * max(src_only, tar_only)
                )
                + numerator
            )

        # The ratio is undefined when the denominator vanishes (for example
        # alpha == beta == 0 with no shared q-grams); report no similarity
        # instead of raising ZeroDivisionError.
        if denominator == 0:
            return 0.0
        return numerator / denominator


def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Compute the Tversky similarity of two strings.

    Convenience function that creates a :py:class:`Tversky` instance and
    delegates to :py:meth:`Tversky.sim`.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version
    alpha : float
        Tversky index parameter, forwarded unchanged to
        :py:meth:`Tversky.sim`
    beta : float
        Tversky index parameter, forwarded unchanged to
        :py:meth:`Tversky.sim`
    bias : float
        The symmetric Tversky index bias parameter

    Returns
    -------
    float
        Tversky similarity

    Examples
    --------
    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0

    """
    measure = Tversky()
    return measure.sim(
        src, tar, qval=qval, alpha=alpha, beta=beta, bias=bias
    )


def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Compute the Tversky distance between two strings.

    Convenience function that creates a :py:class:`Tversky` instance and
    delegates to :py:meth:`Tversky.dist`.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version
    alpha : float
        Tversky index parameter, forwarded unchanged to
        :py:meth:`Tversky.dist`
    beta : float
        Tversky index parameter, forwarded unchanged to
        :py:meth:`Tversky.dist`
    bias : float
        The symmetric Tversky index bias parameter

    Returns
    -------
    float
        Tversky distance

    Examples
    --------
    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0

    """
    # Arguments are passed positionally, exactly as received.
    measure = Tversky()
    return measure.dist(src, tar, qval, alpha, beta, bias)


if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._tversky.
20
21		Tversky index
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from ._token_distance import _TokenDistance
32
33	1	__all__ = ['Tversky', 'dist_tversky', 'sim_tversky']
34
35
36	1	class Tversky(_TokenDistance):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
37		r"""Tversky index.
38
39		The Tversky index :cite:`Tversky:1977` is defined as:
40		For two sets X and Y:
41		:math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|}
42		{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`.
43
44		:math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto
45		similarity coefficients.
46
47		:math:`\alpha = \beta = 0.5` is equivalent to the Sørensen-Dice
48		similarity coefficient :cite:`Dice:1945,Sorensen:1948`.
49
50		Unequal α and β will tend to emphasize one or the other set's
51		contributions:
52
53		- :math:`\alpha > \beta` emphasizes the contributions of X over Y
54		- :math:`\alpha < \beta` emphasizes the contributions of Y over X
55
56		Parameter values' relation to 1 emphasizes different types of
57		contributions:
58
59		- :math:`\alpha and \beta > 1` emphasize unique contributions over the
60		intersection
61		- :math:`\alpha and \beta < 1` emphasize the intersection over unique
62		contributions
63
64		The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated
65		by specifying a bias parameter.
66		"""
67
68	1	def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None):
		0 ignored issues – show best-practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Too many arguments (7/5) Loading history... Comprehensibility introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (16/15). Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'sim' method Loading history...
69		"""Return the Tversky index of two strings.
70
71		Parameters
72		----------
73		src : str
74		Source string (or QGrams/Counter objects) for comparison
75		tar : str
76		Target string (or QGrams/Counter objects) for comparison
77		qval : int
78		The length of each q-gram; 0 for non-q-gram version
79		alpha : float
80		Tversky index parameter as described above
81		beta : float
82		Tversky index parameter as described above
83		bias : float
84		The symmetric Tversky index bias parameter
85
86		Returns
87		-------
88		float
89		Tversky similarity
90
91		Raises
92		------
93		ValueError
94		Unsupported weight assignment; alpha and beta must be greater than
95		or equal to 0.
96
97		Examples
98		--------
99		>>> cmp = Tversky()
100		>>> cmp.sim('cat', 'hat')
101		0.3333333333333333
102		>>> cmp.sim('Niall', 'Neil')
103		0.2222222222222222
104		>>> cmp.sim('aluminum', 'Catalan')
105		0.0625
106		>>> cmp.sim('ATCG', 'TAGC')
107		0.0
108
109		"""
110	1	if alpha < 0 or beta < 0:
111	1	raise ValueError(
112		'Unsupported weight assignment; alpha and beta '
113		+ 'must be greater than or equal to 0.'
114		)
115
116	1	if src == tar:
117	1	return 1.0
118	1	elif not src or not tar:
119	1	return 0.0
120
121	1	q_src, q_tar = self._get_qgrams(src, tar, qval)
122	1	q_src_mag = sum(q_src.values())
123	1	q_tar_mag = sum(q_tar.values())
124	1	q_intersection_mag = sum((q_src & q_tar).values())
125
126	1	if not q_src or not q_tar:
127	1	return 0.0
128
129	1	if bias is None:
130	1	return q_intersection_mag / (
131		q_intersection_mag
132		+ alpha * (q_src_mag - q_intersection_mag)
133		+ beta * (q_tar_mag - q_intersection_mag)
134		)
135
136	1	a_val = min(
137		q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag
138		)
139	1	b_val = max(
140		q_src_mag - q_intersection_mag, q_tar_mag - q_intersection_mag
141		)
142	1	c_val = q_intersection_mag + bias
143	1	return c_val / (beta * (alpha * a_val + (1 - alpha) * b_val) + c_val)
144
145
146	1	def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
		0 ignored issues – show best-practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Too many arguments (6/5) Loading history...
147		"""Return the Tversky index of two strings.
148
149		This is a wrapper for :py:meth:`Tversky.sim`.
150
151		Parameters
152		----------
153		src : str
154		Source string (or QGrams/Counter objects) for comparison
155		tar : str
156		Target string (or QGrams/Counter objects) for comparison
157		qval : int
158		The length of each q-gram; 0 for non-q-gram version
159		alpha : float
160		Tversky index parameter as described above
161		beta : float
162		Tversky index parameter as described above
163		bias : float
164		The symmetric Tversky index bias parameter
165
166		Returns
167		-------
168		float
169		Tversky similarity
170
171		Examples
172		--------
173		>>> sim_tversky('cat', 'hat')
174		0.3333333333333333
175		>>> sim_tversky('Niall', 'Neil')
176		0.2222222222222222
177		>>> sim_tversky('aluminum', 'Catalan')
178		0.0625
179		>>> sim_tversky('ATCG', 'TAGC')
180		0.0
181
182		"""
183	1	return Tversky().sim(src, tar, qval, alpha, beta, bias)
184
185
186	1	def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
		0 ignored issues – show best-practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Too many arguments (6/5) Loading history...
187		"""Return the Tversky distance between two strings.
188
189		This is a wrapper for :py:meth:`Tversky.dist`.
190
191		Parameters
192		----------
193		src : str
194		Source string (or QGrams/Counter objects) for comparison
195		tar : str
196		Target string (or QGrams/Counter objects) for comparison
197		qval : int
198		The length of each q-gram; 0 for non-q-gram version
199		alpha : float
200		Tversky index parameter as described above
201		beta : float
202		Tversky index parameter as described above
203		bias : float
204		The symmetric Tversky index bias parameter
205
206		Returns
207		-------
208		float
209		Tversky distance
210
211		Examples
212		--------
213		>>> dist_tversky('cat', 'hat')
214		0.6666666666666667
215		>>> dist_tversky('Niall', 'Neil')
216		0.7777777777777778
217		>>> dist_tversky('aluminum', 'Catalan')
218		0.9375
219		>>> dist_tversky('ATCG', 'TAGC')
220		1.0
221
222		"""
223	1	return Tversky().dist(src, tar, qval, alpha, beta, bias)
224
225
226		if __name__ == '__main__':
227		import doctest
228
229		doctest.testmod()
230

chrislit / abydos

Push — master ( f43547...71985b )

abydos.distance._tversky.sim_tversky() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like