Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._digby.Digby.corr()   A

Complexity

Conditions 5

Size

Total Lines 47
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 5.009

Importance

Changes 0
Metric Value
eloc 14
dl 0
loc 47
ccs 13
cts 14
cp 0.9286
rs 9.2333
c 0
b 0
f 0
cc 5
nop 3
crap 5.009
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018-2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._digby.
20
21
Digby correlation
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._token_distance import _TokenDistance
32
33 1
# Names exported by ``from <this module> import *``.
__all__ = ['Digby']
34
35
36 1
class Digby(_TokenDistance):
    r"""Digby correlation.

    For two sets X and Y and a population N, Digby's approximation of the
    tetrachoric correlation coefficient
    :cite:`Digby:1983` is

        .. math::

            corr_{Digby}(X, Y) =
            \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4}-
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}
            {(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4} +
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Digby} =
            \frac{(ad)^\frac{3}{4}-(bc)^\frac{3}{4}}
            {(ad)^\frac{3}{4}+(bc)^\frac{3}{4}}

    Note that the exponent applies to each whole product (``ad`` and
    ``bc``), not only to the second factor.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        **kwargs
    ):
        """Initialize Digby instance.

        All arguments are forwarded unchanged to the parent class
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Two-argument super() keeps the class usable on Python 2, which the
        # module's ``from __future__`` imports indicate is still supported.
        super(Digby, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src, tar):
        """Return the Digby correlation of two strings.

        Special cases, checked in this order before any tokenization:
        equal inputs yield 1.0, and if either input is empty (falsy) the
        result is -1.0. After tokenization, if the numerator of the
        formula is zero, 0.0 is returned without performing the division.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby correlation

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.corr('cat', 'hat')
        0.9774244829419212
        >>> cmp.corr('Niall', 'Neil')
        0.9491281473458171
        >>> cmp.corr('aluminum', 'Catalan')
        0.7541039303781305
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return -1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Each product is raised to the 3/4 power as a whole; compute both
        # terms once and reuse them for numerator and denominator.
        ad_term = (a * d) ** 0.75
        bc_term = (b * c) ** 0.75

        num = ad_term - bc_term
        if num:
            return num / (ad_term + bc_term)
        # A zero numerator includes the case where both terms are zero, so
        # returning here also avoids dividing zero by zero.
        return 0.0

    def sim(self, src, tar):
        """Return the Digby similarity of two strings.

        This is the Digby correlation shifted and scaled by
        ``(1 + corr) / 2``, so a correlation of -1.0 maps to 0.0 and a
        correlation of 1.0 maps to 1.0.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby similarity

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.sim('cat', 'hat')
        0.9887122414709606
        >>> cmp.sim('Niall', 'Neil')
        0.9745640736729085
        >>> cmp.sim('aluminum', 'Catalan')
        0.8770519651890653
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        return (1 + self.corr(src, tar)) / 2
190
191
192
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this module's
    # docstrings as doctests.
    import doctest

    doctest.testmod()
196