abydos.distance._koppen_i.KoppenI.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 53
Code Lines 15

Duplication

Lines 53
Ratio 100 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 15
dl 53
loc 53
ccs 2
cts 2
cp 1
rs 9.65
c 0
b 0
f 0
cc 1
nop 6
crap 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

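Commonly applied refactorings include Extract Method; when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object are also useful.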

1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._koppen_i.
18
19 1
Köppen I correlation
20
"""
21
22
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
23
24 1
from ._token_distance import _TokenDistance
25
from ..tokenizer import _Tokenizer
26
27
__all__ = ['KoppenI']
28
29
30 View Code Duplication
class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    Given two sets X and Y over an alphabet N, and assuming
    :math:`|X| = |Y|`, Köppen I correlation
    :cite:`Koppen:1870,Goodman:1959` is defined as

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    So that inputs with :math:`|X| \neq |Y|` are also handled, this class
    computes a symmetrized variant that reduces to the formula above
    whenever :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    Notes
    -----
    Ordinarily every value above is taken as a proportion of the total
    sample count n, i.e. a, b, c, d, & n are each divided by n before the
    coefficient is evaluated. For that reason the default normalizer of
    this class is 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        Every argument is forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, consequently, the set type).
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Selects the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        # Equal inputs correlate perfectly; no tokenization is needed.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # The a, b, c, d terms of the class-level formula, fetched in the
        # same order as before.
        both, src_only, tar_only, neither = (
            self._intersection_card(),
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
        )

        # Denominator of the coefficient: (2a+b+c)/2 * (2d+b+c)/2.
        mean_product = (
            (2 * both + src_only + tar_only)
            * (2 * neither + src_only + tar_only)
            / 4
        )
        numerator = mean_product - (src_only + tar_only) / 2

        # A zero numerator short-circuits to 0.0 without dividing.
        if not numerator:
            return 0.0
        return numerator / mean_product

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The similarity is computed from the correlation as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
210
211
212
if __name__ == '__main__':
    # Executed directly as a script: run the doctest examples embedded in
    # this module's docstrings.
    import doctest

    doctest.testmod()
216