Issues (140)

abydos/distance/_dispersion.py (1 issue)

1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._dispersion.
18
19 1
Dispersion correlation
20
"""
21
22
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
23
24 1
from ._token_distance import _TokenDistance
25
from ..tokenizer import _Tokenizer
26
27
__all__ = ['Dispersion']
28
29
30 View Code Duplication
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    Given two sets X and Y drawn from a population N, the dispersion
    correlation :cite:`IBM:2017` is defined as

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    Expressed with the cells of a
    :ref:`2x2 confusion table <confusion_table>`, where a+b+c+d=n,
    the same quantity is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    # NOTE(review): the code-review tool flagged this class as duplicated
    # elsewhere in the project -- confirm before refactoring.

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        Every argument is passed through unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a result, the set type.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. When this is given and tokenizer=None,
            the instance uses the QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Confusion-table cells: a (shared), b (source only),
        # c (target only), d (in neither), and the population size n.
        shared = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()
        population = self._population_unique_card()

        numerator = shared * neither - src_only * tar_only
        # A zero numerator short-circuits before the division, so no
        # division takes place in that case.
        return 0.0 if numerator == 0.0 else numerator / population ** 2

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        The similarity is the correlation rescaled as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
180
181
182
if __name__ == '__main__':
    # Running the module as a script executes the docstring examples.
    from doctest import testmod

    testmod()
186