abydos.distance._digby.Digby.sim()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1.0156

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 32
ccs 3
cts 4
cp 0.75
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1.0156
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._digby.
18
19 1
Digby correlation
20
"""
21
22
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
23
24 1
from ._token_distance import _TokenDistance
25
from ..tokenizer import _Tokenizer
26
27
__all__ = ['Digby']
28
29
30
class Digby(_TokenDistance):
    r"""Digby correlation.

    For two sets X and Y and a population N, Digby's approximation of the
    tetrachoric correlation coefficient
    :cite:`Digby:1983` is

        .. math::

            corr_{Digby}(X, Y) =
            \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4}-
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}
            {(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4} +
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Digby} =
            \frac{(ad)^\frac{3}{4}-(bc)^\frac{3}{4}}
            {(ad)^\frac{3}{4}+(bc)^\frac{3}{4}}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Digby instance.

        All arguments are forwarded unchanged to the ``_TokenDistance``
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Digby correlation of two strings.

        Equal inputs short-circuit to 1.0 and, otherwise, an empty input
        short-circuits to -1.0, both before any tokenization takes place.
        When the two 3/4-power terms are equal (numerator of zero), 0.0 is
        returned without performing the division.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby correlation

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.corr('cat', 'hat')
        0.9774244829419212
        >>> cmp.corr('Niall', 'Neil')
        0.9491281473458171
        >>> cmp.corr('aluminum', 'Catalan')
        0.7541039303781305
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return -1.0

        self._tokenize(src, tar)

        # The four cells a, b, c, d used in the class-level formula, taken
        # from the cardinality helpers inherited from _TokenDistance.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Each 3/4-power term appears in both the numerator and the
        # denominator, so compute it once.
        ad_term = (a * d) ** 0.75
        bc_term = (b * c) ** 0.75

        num = ad_term - bc_term
        if num:
            return num / (ad_term + bc_term)
        # Equal terms (including both being zero): skip the division, which
        # would be 0/0 in the all-zero case.
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Digby similarity of two strings.

        The similarity is the Digby correlation rescaled as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby similarity

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.sim('cat', 'hat')
        0.9887122414709606
        >>> cmp.sim('Niall', 'Neil')
        0.9745640736729085
        >>> cmp.sim('aluminum', 'Catalan')
        0.8770519651890653
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        return (1 + self.corr(src, tar)) / 2
186
187
188
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
192