Completed
Pull Request — master (#225)
by Chris
09:15
created

GoodmanKruskalTauB.sim()   A

Complexity

Conditions 5

Size

Total Lines 51
Code Lines 16

Duplication

Lines 51
Ratio 100 %

Code Coverage

Tests 15
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 16
dl 51
loc 51
ccs 15
cts 15
cp 1
rs 9.1333
c 0
b 0
f 0
cc 5
nop 3
crap 5

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method's body into a new, well-named method.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._goodman_kruskal_tau_b.
20
21
Goodman & Kruskal's Tau B similarity
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._token_distance import _TokenDistance
32
33 1
__all__ = ['GoodmanKruskalTauB']
34
35
36 1 View Code Duplication
class GoodmanKruskalTauB(_TokenDistance):
    r"""Goodman & Kruskal's Tau B similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    :math:`\tau_b` similarity :cite:`Goodman:1954` is defined as

        .. math::

            sim_{GK_{\tau_b}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+
            \frac{\frac{|Y \setminus X|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}
            {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n and every term first turned into a proportion by dividing it
    by n, the same quantity reads

        .. math::

            sim_{GK_{\tau_b}} =
            \frac{
            \frac{a^2 + b^2}{a+b} +
            \frac{c^2 + d^2}{c+d} -
            ((a+c)^2 + (b+d)^2)}
            {1 - ((a+c)^2 + (b+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Set up a GoodmanKruskalTauB instance.

        Every argument is forwarded unchanged to the parent class
        initializer; this class only changes the default of ``normalizer``
        to ``'proportional'``.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See the :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a consequence, the set
            type. See the :ref:`intersection_type <intersection_type>`
            description in :py:class:`_TokenDistance` for details.
        normalizer : str
            Selects the normalization type. See the
            :ref:`normalizer <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            For the ``fuzzy`` variant, similarities above this value count
            as members of the intersection.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauB, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Compute Goodman & Kruskal's Tau B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau B similarity; 0.0 when either the
            source-side total (a+b) or the target-side total (a+c) is zero,
            or when the numerator does not exceed 1e-14

        Examples
        --------
        >>> cmp = GoodmanKruskalTauB()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.2346006486710202
        >>> cmp.sim('aluminum', 'Catalan')
        0.06533810992392582
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells of the 2x2 table: a = in both, b = source only,
        # c = target only, d = in neither.
        a, b, c, d = (
            self._intersection_card(),
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
        )

        # An empty source-side or target-side total makes the measure
        # meaningless (and would divide by zero below).
        if not (a + b and a + c):
            return 0.0

        src_term = (a * a + b * b) / (a + b)

        # The second term is only divided when its sum of squares is
        # non-zero, which also avoids dividing by c + d == 0.
        squares = c * c + d * d
        non_src_term = squares / (c + d) if squares else squares

        tar_sq = (a + c) ** 2
        non_tar_sq = (b + d) ** 2

        numerator = src_term + non_src_term - tar_sq - non_tar_sq
        # Numerators at or below 1e-14 are reported as 0.0 (presumably to
        # absorb floating-point noise around zero).
        if numerator > 1e-14:
            return numerator / (1 - tar_sq - non_tar_sq)
        return 0.0  # pragma: no cover
173
174
175
if __name__ == '__main__':
    # When run as a script, check the examples embedded in the docstrings.
    from doctest import testmod

    testmod()
179