Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._koppen_i.KoppenI.corr()   A

Complexity

Conditions 3

Size

Total Lines 46
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 3.004

Importance

Changes 0
Metric Value
eloc 13
dl 0
loc 46
ccs 12
cts 13
cp 0.9231
rs 9.75
c 0
b 0
f 0
cc 3
nop 3
crap 3.004
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018-2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._koppen_i.
20
21
Köppen I correlation
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._token_distance import _TokenDistance
32
33 1
__all__ = ['KoppenI']
34
35
36 1
class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    For two sets X and Y and an alphabet N, provided that :math:`|X| = |Y|`,
    Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    To support cases where :math:`|X| \neq |Y|`, this class implements a slight
    variation, while still providing the expected results when
    :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    Notes
    -----
    In the usual case all of the above values should be proportional to the
    total number of samples n. I.e., a, b, c, d, & n should all be divided by
    n prior to calculating the coefficient. This class's default normalizer
    is, accordingly, 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize KoppenI instance.

        All arguments are forwarded unchanged to the parent class
        initializer; the only difference from a bare call is that the
        normalizer defaults to ``'proportional'`` here.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(KoppenI, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src, tar):
        """Return the Köppen I correlation of two strings.

        Equal inputs short-circuit to 1.0 without being tokenized. Otherwise
        the inputs are tokenized and the confusion-table form of the formula
        given in the class docstring is evaluated. If the denominator of
        that formula is zero, 0.0 is returned.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        # a, b, c, d are the cells of the 2x2 confusion table referred to in
        # the class docstring, as reported by the parent class's helpers.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # (2a+b+c)/2 * (2d+b+c)/2, i.e. the denominator of the formula.
        abac_dbdc_mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4

        # Guard on the denominator rather than the numerator, so that a zero
        # product can never reach the division below. A zero numerator over
        # a non-zero denominator still evaluates to 0.0.
        if not abac_dbdc_mean_prod:
            return 0.0
        return (abac_dbdc_mean_prod - (b + c) / 2) / abac_dbdc_mean_prod

    def sim(self, src, tar):
        """Return the Köppen I similarity of two strings.

        The similarity is derived from the correlation returned by
        :py:meth:`corr` as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
214
215
216
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
220