Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._tversky.sim_tversky()   A

Complexity

Conditions 1

Size

Total Lines 38
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 6
dl 0
loc 38
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._tversky.
20
21
Tversky index
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._token_distance import _TokenDistance
32
33 1
# Public API of this module: the names exported by ``from ... import *``.
__all__ = ['Tversky', 'dist_tversky', 'sim_tversky']
34
35
36 1
class Tversky(_TokenDistance):
    r"""Tversky index.

    The Tversky index :cite:`Tversky:1977` is defined as:

    For two sets X and Y:
    :math:`sim_{Tversky}(X, Y) = \frac{|X \cap Y|}
    {|X \cap Y| + \alpha|X - Y| + \beta|Y - X|}`.

    :math:`\alpha = \beta = 1` is equivalent to the Jaccard & Tanimoto
    similarity coefficients.

    :math:`\alpha = \beta = 0.5` is equivalent to the Sørensen-Dice
    similarity coefficient :cite:`Dice:1945,Sorensen:1948`.

    Unequal α and β will tend to emphasize one or the other set's
    contributions:

        - :math:`\alpha > \beta` emphasizes the contributions of X over Y
        - :math:`\alpha < \beta` emphasizes the contributions of Y over X

    Parameter values' relation to 1 emphasizes different types of
    contributions:

        - :math:`\alpha, \beta > 1` emphasize unique contributions over the
          intersection
        - :math:`\alpha, \beta < 1` emphasize the intersection over unique
          contributions

    The symmetric variant is defined in :cite:`Jiminez:2013`. This is activated
    by specifying a bias parameter.
    """

    def sim(self, src, tar, qval=2, alpha=1, beta=1, bias=None):
        """Return the Tversky index of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison
        qval : int
            The length of each q-gram; 0 for non-q-gram version
        alpha : float
            Tversky index parameter as described above; in the asymmetric
            form it weights the q-grams found only in src
        beta : float
            Tversky index parameter as described above; in the asymmetric
            form it weights the q-grams found only in tar
        bias : float
            The symmetric Tversky index bias parameter; None (the default)
            selects the asymmetric form

        Returns
        -------
        float
            Tversky similarity. 1.0 is returned when src equals tar, and 0.0
            when either input (or its q-gram profile) is empty or when the
            denominator of the index is zero (for example, disjoint q-gram
            sets with alpha == beta == 0).

        Raises
        ------
        ValueError
            Unsupported weight assignment; alpha and beta must be greater than
            or equal to 0.

        Examples
        --------
        >>> cmp = Tversky()
        >>> cmp.sim('cat', 'hat')
        0.3333333333333333
        >>> cmp.sim('Niall', 'Neil')
        0.2222222222222222
        >>> cmp.sim('aluminum', 'Catalan')
        0.0625
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        """
        if alpha < 0 or beta < 0:
            raise ValueError(
                'Unsupported weight assignment; alpha and beta '
                'must be greater than or equal to 0.'
            )

        # Trivial cases that need no tokenization.
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        # _get_qgrams is inherited; its results are used here as Counter-like
        # multisets (they must support ``&`` and ``.values()``).
        q_src, q_tar = self._get_qgrams(src, tar, qval)
        if not q_src or not q_tar:
            return 0.0

        # Multiset cardinalities: shared q-grams and those unique to each side.
        common = sum((q_src & q_tar).values())
        src_only = sum(q_src.values()) - common
        tar_only = sum(q_tar.values()) - common

        if bias is None:
            # Classic (asymmetric) Tversky index.
            numerator = common
            denominator = common + alpha * src_only + beta * tar_only
        else:
            # Symmetric variant: the smaller and larger unique counts replace
            # the src/tar roles, so argument order no longer matters.
            smaller, larger = sorted((src_only, tar_only))
            numerator = common + bias
            denominator = (
                beta * (alpha * smaller + (1 - alpha) * larger) + numerator
            )

        # Guard against 0/0 (e.g. nothing shared and zero weights), which
        # previously escaped as ZeroDivisionError.
        if denominator == 0:
            return 0.0
        return numerator / denominator
144
145
146 1
def sim_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Compute the Tversky index (a similarity) of two strings.

    Convenience function that builds a :py:class:`Tversky` object and
    delegates to :py:meth:`Tversky.sim`, forwarding every argument unchanged.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version
    alpha : float
        Tversky index parameter (see :py:class:`Tversky`)
    beta : float
        Tversky index parameter (see :py:class:`Tversky`)
    bias : float
        The symmetric Tversky index bias parameter

    Returns
    -------
    float
        Tversky similarity

    Examples
    --------
    >>> sim_tversky('cat', 'hat')
    0.3333333333333333
    >>> sim_tversky('Niall', 'Neil')
    0.2222222222222222
    >>> sim_tversky('aluminum', 'Catalan')
    0.0625
    >>> sim_tversky('ATCG', 'TAGC')
    0.0

    """
    measure = Tversky()
    return measure.sim(src, tar, qval, alpha, beta, bias)
184
185
186 1
def dist_tversky(src, tar, qval=2, alpha=1, beta=1, bias=None):
    """Compute the Tversky distance between two strings.

    Convenience function that builds a :py:class:`Tversky` object and
    delegates to its ``dist`` method (inherited by :py:class:`Tversky`; not
    defined in this module), forwarding every argument unchanged.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version
    alpha : float
        Tversky index parameter (see :py:class:`Tversky`)
    beta : float
        Tversky index parameter (see :py:class:`Tversky`)
    bias : float
        The symmetric Tversky index bias parameter

    Returns
    -------
    float
        Tversky distance

    Examples
    --------
    >>> dist_tversky('cat', 'hat')
    0.6666666666666667
    >>> dist_tversky('Niall', 'Neil')
    0.7777777777777778
    >>> dist_tversky('aluminum', 'Catalan')
    0.9375
    >>> dist_tversky('ATCG', 'TAGC')
    1.0

    """
    measure = Tversky()
    return measure.dist(src, tar, qval, alpha, beta, bias)
224
225
226
if __name__ == '__main__':
    # Executing the module as a script runs the examples in its docstrings.
    from doctest import testmod

    testmod()
230