Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._jaccard   A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 229
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 26
dl 0
loc 229
ccs 19
cts 19
cp 1
rs 10
c 0
b 0
f 0
wmc 6

2 Methods

Rating   Name   Duplication   Size   Complexity  
A Jaccard.tanimoto_coeff() 0 38 2
A Jaccard.sim() 0 31 1

3 Functions

Rating   Name   Duplication   Size   Complexity  
A sim_jaccard() 0 32 1
A dist_jaccard() 0 32 1
A tanimoto() 0 32 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._jaccard.
20
21
Jaccard similarity coefficient, distance, & Tanimoto coefficient
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from math import log
32
33 1
from ._tversky import Tversky
34
35 1
__all__ = ['Jaccard', 'dist_jaccard', 'sim_jaccard', 'tanimoto']
36
37
38 1
class Jaccard(Tversky):
    r"""Jaccard similarity.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901` is :math:`sim_{Jaccard}(X, Y) =
    \frac{|X \cap Y|}{|X \cup Y|}`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958`
    and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.
    """

    def sim(self, src, tar, qval=2):
        r"""Return the Jaccard similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison
        qval : int
            The length of each q-gram; 0 for non-q-gram version

        Returns
        -------
        float
            Jaccard similarity

        Examples
        --------
        >>> cmp = Jaccard()
        >>> cmp.sim('cat', 'hat')
        0.3333333333333333
        >>> cmp.sim('Niall', 'Neil')
        0.2222222222222222
        >>> cmp.sim('aluminum', 'Catalan')
        0.0625
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        """
        # Name the class explicitly. super(self.__class__, self) resolves
        # relative to the *runtime* class, so a subclass that inherits this
        # method would look up Jaccard.sim again and recurse without end.
        #
        # The two trailing 1s are presumably the Tversky alpha and beta
        # weights (the class docstring equates Jaccard with Tversky at
        # alpha = beta = 1) -- confirm against Tversky.sim's signature.
        return super(Jaccard, self).sim(src, tar, qval, 1, 1)

    def tanimoto_coeff(self, src, tar, qval=2):
        """Return the Tanimoto coefficient of two strings.

        The value returned is the base-2 logarithm of the similarity,
        :math:`log_{2} sim_{Tanimoto}(X, Y)`. This is the negation of the
        Tanimoto distance :cite:`Tanimoto:1958`, which is
        :math:`-log_{2} sim_{Tanimoto}(X, Y)`.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison
        qval : int
            The length of each q-gram; 0 for non-q-gram version

        Returns
        -------
        float
            Base-2 logarithm of the similarity, or negative infinity when
            the similarity is 0

        Examples
        --------
        >>> cmp = Jaccard()
        >>> cmp.tanimoto_coeff('cat', 'hat')
        -1.5849625007211563
        >>> cmp.tanimoto_coeff('Niall', 'Neil')
        -2.1699250014423126
        >>> cmp.tanimoto_coeff('aluminum', 'Catalan')
        -4.0
        >>> cmp.tanimoto_coeff('ATCG', 'TAGC')
        -inf

        """
        coeff = self.sim(src, tar, qval)
        if coeff != 0:
            return log(coeff, 2)

        # log(0) is undefined in math.log, so report negative infinity
        # explicitly for completely dissimilar inputs.
        return float('-inf')
121
122
123 1
def sim_jaccard(src, tar, qval=2):
    """Compute the Jaccard similarity of two strings.

    Convenience function that builds a :py:class:`Jaccard` instance and
    delegates to :py:meth:`Jaccard.sim`.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version

    Returns
    -------
    float
        Jaccard similarity

    Examples
    --------
    >>> sim_jaccard('cat', 'hat')
    0.3333333333333333
    >>> sim_jaccard('Niall', 'Neil')
    0.2222222222222222
    >>> sim_jaccard('aluminum', 'Catalan')
    0.0625
    >>> sim_jaccard('ATCG', 'TAGC')
    0.0

    """
    measure = Jaccard()
    return measure.sim(src, tar, qval)
155
156
157 1
def dist_jaccard(src, tar, qval=2):
    """Compute the Jaccard distance between two strings.

    Convenience function that builds a :py:class:`Jaccard` instance and
    delegates to :py:meth:`Jaccard.dist`.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version

    Returns
    -------
    float
        Jaccard distance

    Examples
    --------
    >>> dist_jaccard('cat', 'hat')
    0.6666666666666667
    >>> dist_jaccard('Niall', 'Neil')
    0.7777777777777778
    >>> dist_jaccard('aluminum', 'Catalan')
    0.9375
    >>> dist_jaccard('ATCG', 'TAGC')
    1.0

    """
    measure = Jaccard()
    return measure.dist(src, tar, qval)
189
190
191 1
def tanimoto(src, tar, qval=2):
    """Compute the Tanimoto coefficient of two strings.

    Convenience function that builds a :py:class:`Jaccard` instance and
    delegates to :py:meth:`Jaccard.tanimoto_coeff`.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison
    qval : int
        The length of each q-gram; 0 for non-q-gram version

    Returns
    -------
    float
        The value produced by :py:meth:`Jaccard.tanimoto_coeff`

    Examples
    --------
    >>> tanimoto('cat', 'hat')
    -1.5849625007211563
    >>> tanimoto('Niall', 'Neil')
    -2.1699250014423126
    >>> tanimoto('aluminum', 'Catalan')
    -4.0
    >>> tanimoto('ATCG', 'TAGC')
    -inf

    """
    measure = Jaccard()
    return measure.tanimoto_coeff(src, tar, qval)
223
224
225
if __name__ == '__main__':
    # Run this module's docstring examples as tests when executed directly.
    from doctest import testmod

    testmod()
229