abydos.distance._scott_pi.ScottPi.corr()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 50
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 3

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 50
ccs 11
cts 11
cp 1
rs 9.6
c 0
b 0
f 0
cc 3
nop 3
crap 3
1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._scott_pi.
18
19 1
Scott's Pi correlation
20
"""
21
22
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
23
24 1
from ._token_distance import _TokenDistance
25
from ..tokenizer import _Tokenizer
26
27
__all__ = ['ScottPi']
28
29
30
class ScottPi(_TokenDistance):
    r"""Scott's Pi correlation.

    For two sets X and Y and a population N, Scott's :math:`\pi` correlation
    :cite:`Scott:1955` is

        .. math::

            corr_{Scott_\pi}(X, Y) = \pi =
            \frac{p_o - p_e^\pi}{1 - p_e^\pi}

    where

        .. math::

            \begin{array}{ll}
            p_o &= \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}

            p_e^\pi &= \Big(\frac{|X| + |Y|}{2 \cdot |N|}\Big)^2 +
            \Big(\frac{|N \setminus X| + |N \setminus Y|}{2 \cdot |N|}\Big)^2
            \end{array}


    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            \begin{array}{ll}
            p_o &= \frac{a+d}{n}

            p_e^\pi &= \Big(\frac{2a+b+c}{2n}\Big)^2 +
            \Big(\frac{2d+b+c}{2n}\Big)^2
            \end{array}


    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ScottPi instance.

        All arguments are forwarded unchanged to the parent class
        initializer; this class adds no state of its own.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Scott's Pi correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Scott's Pi correlation. This is 1.0 when ``src == tar`` and 0.0
            when the population is empty or when observed agreement equals
            expected agreement.

        Examples
        --------
        >>> cmp = ScottPi()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589733
        >>> cmp.corr('Niall', 'Neil')
        0.35914053833129245
        >>> cmp.corr('aluminum', 'Catalan')
        0.10798833377524023
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237489689


        .. versionadded:: 0.4.0

        """
        # Identical inputs are perfectly correlated; skip tokenization.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring,
        # obtained from helpers inherited from _TokenDistance.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = a + b + c + d

        # An empty population gives no evidence of agreement either way, and
        # every ratio below would divide by zero, so report no correlation.
        if not n:
            return 0.0

        # Observed agreement and agreement expected by chance.
        po = (a + d) / n
        pe = ((2 * a + b + c) / (2 * n)) ** 2 + (
            (2 * d + b + c) / (2 * n)
        ) ** 2

        # For non-negative counts, pe == 1 only when everything falls in a or
        # everything falls in d, in which case po == 1 as well; the equality
        # check therefore also keeps the denominator from being zero.
        if po != pe:
            return (po - pe) / (1 - pe)
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Scott's Pi similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Scott's Pi similarity

        Examples
        --------
        >>> cmp = ScottPi()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6795702691656462
        >>> cmp.sim('aluminum', 'Catalan')
        0.5539941668876202
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125517


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
202
203
204
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in the docstrings.
    from doctest import testmod

    testmod()
208