Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._scott_pi.ScottPi.sim()   A

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 32
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._scott_pi.

Scott's Pi correlation
"""

# Python 2/3 compatibility. ``division`` matters in this module:
# ScottPi.corr divides cardinalities with ``/`` and relies on true division.
from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from ._token_distance import _TokenDistance

# Names exported by ``from ... import *``.
__all__ = ['ScottPi']


class ScottPi(_TokenDistance):
    r"""Scott's Pi correlation.

    For two sets X and Y and a population N, Scott's :math:`\pi` correlation
    :cite:`Scott:1955` is

        .. math::

            corr_{Scott_\pi}(X, Y) = \pi =
            \frac{p_o - p_e^\pi}{1 - p_e^\pi}

    where

        .. math::

            \begin{array}{ll}
            p_o &= \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}

            p_e^\pi &= \Big(\frac{|X| + |Y|}{2 \cdot |N|}\Big)^2 +
            \Big(\frac{|N \setminus X| + |N \setminus Y|}{2 \cdot |N|}\Big)^2
            \end{array}


    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            \begin{array}{ll}
            p_o &= \frac{a+d}{n}

            p_e^\pi &= \Big(\frac{2a+b+c}{2n}\Big)^2 +
            \Big(\frac{2d+b+c}{2n}\Big)^2
            \end{array}


    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        **kwargs
    ):
        """Initialize ScottPi instance.

        All arguments are forwarded unchanged to the ``_TokenDistance``
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Two-argument super() keeps the class usable on Python 2 as well.
        super(ScottPi, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src, tar):
        """Return the Scott's Pi correlation of two strings.

        Identical inputs short-circuit to 1.0 without being tokenized.
        When the observed agreement equals the expected agreement, 0.0 is
        returned.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Scott's Pi correlation

        Examples
        --------
        >>> cmp = ScottPi()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589733
        >>> cmp.corr('Niall', 'Neil')
        0.35914053833129245
        >>> cmp.corr('aluminum', 'Catalan')
        0.10798833377524023
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237489689


        .. versionadded:: 0.4.0

        """
        # Equal inputs are treated as perfectly correlated; no tokenization.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a, b, c, d follow the 2x2 confusion table naming used in the class
        # docstring; the values come from the _TokenDistance helpers.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = a + b + c + d

        # NOTE(review): n is assumed to be non-zero; a zero total would raise
        # ZeroDivisionError below -- confirm _TokenDistance guarantees this.
        po = (a + d) / n
        pe = ((2 * a + b + c) / (2 * n)) ** 2 + (
            (2 * d + b + c) / (2 * n)
        ) ** 2

        # For non-negative counts, pe is exactly 1 only when everything falls
        # in a or everything falls in d, in which case po is 1 too. Testing
        # po != pe therefore also keeps the denominator (1 - pe) non-zero.
        if po != pe:
            return (po - pe) / (1 - pe)
        return 0.0

    def sim(self, src, tar):
        """Return the Scott's Pi similarity of two strings.

        This is the correlation from :py:meth:`corr` rescaled linearly as
        ``(1 + corr) / 2``, so a correlation of -1 maps to 0.0, 0 maps to
        0.5 and +1 maps to 1.0.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Scott's Pi similarity

        Examples
        --------
        >>> cmp = ScottPi()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6795702691656462
        >>> cmp.sim('aluminum', 'Catalan')
        0.5539941668876202
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125517


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()