PearsonChiSquared.sim()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 32
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._pearson_chi_squared.
18
19 1
Pearson's Chi-Squared similarity
20
"""
21
22
from math import copysign
23
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
24 1
25
from ._token_distance import _TokenDistance
26
from ..tokenizer import _Tokenizer
27
28
# Public names exported by a star-import of this module.
__all__ = ['PearsonChiSquared']
29
30
31 1
class PearsonChiSquared(_TokenDistance):
    r"""Pearson's Chi-Squared similarity.

    For two sets X and Y and a population N, Pearson's :math:`\chi^2`
    similarity :cite:`Pearson:1913` is

        .. math::

            sim_{PearsonChiSquared}(X, Y) =
            \frac{|N| \cdot (|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)^2}
            {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}

    This is also Pearson I similarity.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{PearsonChiSquared} =
            \frac{n(ad-bc)^2}{(a+b)(a+c)(b+d)(c+d)}

    Three related values are exposed:

    - :py:meth:`sim_score` returns the raw :math:`\chi^2` value above.
    - :py:meth:`corr` divides that score by a+b+c+d and gives it the sign
      of ad-bc.
    - :py:meth:`sim` rescales the correlation as (1 + corr) / 2.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize PearsonChiSquared instance.

        All arguments are forwarded unchanged to the parent class
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return Pearson's Chi-Squared similarity of two strings.

        Identical inputs score the unique population cardinality; if either
        input is empty (and they differ) the score is 0.0.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Pearson's Chi-Squared similarity

        Examples
        --------
        >>> cmp = PearsonChiSquared()
        >>> cmp.sim_score('cat', 'hat')
        193.99489809335964
        >>> cmp.sim_score('Niall', 'Neil')
        101.99771068526542
        >>> cmp.sim_score('aluminum', 'Catalan')
        9.19249664336649
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.032298410951138765


        .. versionadded:: 0.4.0

        """
        # Always tokenize first: corr() reads the cardinalities of this same
        # pair after calling sim_score().
        self._tokenize(src, tar)

        # Trivial cases need no confusion-table arithmetic.
        if src == tar:
            return float(self._population_unique_card())
        if not src or not tar:
            return 0.0

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        num = n * (a * d - b * c) ** 2
        if not num:
            # Zero numerator: return 0.0 without evaluating the denominator.
            return 0.0

        ab = self._src_card()
        ac = self._tar_card()
        return num / (ab * ac * (b + d) * (c + d))

    def corr(self, src: str, tar: str) -> float:
        """Return Pearson's Chi-Squared correlation of two strings.

        This is the similarity score divided by a+b+c+d, carrying the sign
        of ad-bc. Identical inputs return 1.0 and a zero score returns 0.0.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Pearson's Chi-Squared correlation

        Examples
        --------
        >>> cmp = PearsonChiSquared()
        >>> cmp.corr('cat', 'hat')
        0.2474424720578567
        >>> cmp.corr('Niall', 'Neil')
        0.1300991207720222
        >>> cmp.corr('aluminum', 'Catalan')
        0.011710186806836291
        >>> cmp.corr('ATCG', 'TAGC')
        -4.1196952743799446e-05


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        # sim_score() tokenizes src and tar, so the cardinalities read below
        # describe this same pair.
        score = self.sim_score(src, tar)
        if not score:
            # A zero score stays zero; returning here also avoids dividing
            # by an empty population (a + b + c + d == 0).
            return 0.0

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        return copysign(score / (a + b + c + d), a * d - b * c)

    def sim(self, src: str, tar: str) -> float:
        """Return Pearson's normalized Chi-Squared similarity of two strings.

        The value is the correlation rescaled as (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Pearson's Chi-Squared similarity

        Examples
        --------
        >>> cmp = PearsonChiSquared()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.623721236029
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.565049560386
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.505855093403
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.499979401524


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
234
235
236
if __name__ == '__main__':
    # Run this module's docstring examples when executed as a script.
    from doctest import testmod

    testmod()
240