abydos.distance._koppen_i.KoppenI.sim() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.distance._koppen_i.KoppenI.sim() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._koppen_i

Complexity

Conditions

Size

Total Lines	32
Code Lines	2

Duplication

Lines	32
Ratio	100 %

Code Coverage

Tests	3
CRAP Score	1.0156

Importance

Changes

Metric	Value
eloc	2
dl	32
loc	32
ccs	3
cts	4
cp	0.75
rs	10
c	0
b	0
f	0
cc	1
nop	3
crap	1.0156

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._koppen_i.

Köppen I correlation
"""

from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union

from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer

__all__ = ['KoppenI']


class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    Given two sets X and Y drawn from an alphabet N, and assuming
    :math:`|X| = |Y|`, Köppen I correlation
    :cite:`Koppen:1870,Goodman:1959` is defined as

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    So that inputs with :math:`|X| \neq |Y|` can also be compared, this
    class implements a symmetrized variant that reduces to the formula above
    whenever :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, the correlation is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    Notes
    -----
    The values above are normally meant to be proportions of the total
    sample count n, i.e. a, b, c, d, & n are each divided by n before the
    coefficient is computed. For that reason the default normalizer of this
    class is 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        All arguments are passed through unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type, which also determines the set type:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            The normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Equal inputs short-circuit to 1.0 without being tokenized. A zero
        numerator yields 0.0, so the division is never attempted in that
        case.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells a, b, c, d of the 2x2 confusion table.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        # Twice the mean set size (2a+b+c) and twice the mean complement
        # size (2d+b+c); dividing their product by 4 gives the product of
        # the two means.
        doubled_mean_size = 2 * both + src_only + tar_only
        doubled_mean_complement = 2 * neither + src_only + tar_only
        mean_product = doubled_mean_size * doubled_mean_complement / 4

        numerator = mean_product - (src_only + tar_only) / 2
        if not numerator:
            return 0.0
        return numerator / mean_product

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled as
        (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0

# When executed directly as a script, run the doctest examples embedded in
# this module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()


1			# Copyright 2018-2020 by Christopher C. Little.
2			# This file is part of Abydos.
3			#
4			# Abydos is free software: you can redistribute it and/or modify
5			# it under the terms of the GNU General Public License as published by
6			# the Free Software Foundation, either version 3 of the License, or
7			# (at your option) any later version.
8			#
9			# Abydos is distributed in the hope that it will be useful,
10			# but WITHOUT ANY WARRANTY; without even the implied warranty of
11			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12			# GNU General Public License for more details.
13			#
14			# You should have received a copy of the GNU General Public License
15			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17			"""abydos.distance._koppen_i.
18
19	1		Köppen I correlation
20			"""
21
22			from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
23
24	1		from ._token_distance import _TokenDistance
25			from ..tokenizer import _Tokenizer
26
27			__all__ = ['KoppenI']
28
29
30		View Code Duplication	class KoppenI(_TokenDistance):
			0 ignored issues – show Duplication introduced 2019-02-15 07:24 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
31	1		r"""Köppen I correlation.
32
33	1		For two sets X and Y and an alphabet N, provided that :math:`\|X\| = \|Y\|`,
34			Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is
35
36	1		.. math::
37
38			corr_{KoppenI}(X, Y) =
39			\frac{\|X\| \cdot \|N \setminus X\| - \|X \setminus Y\|}
40			{\|X\| \cdot \|N \setminus X\|}
41
42			To support cases where :math:`\|X\| \neq \|Y\|`, this class implements a slight
43			variation, while still providing the expected results when
44			:math:`\|X\| = \|Y\|`:
45
46			.. math::
47
48			corr_{KoppenI}(X, Y) =
49			\frac{\frac{\|X\|+\|Y\|}{2} \cdot
50			\frac{\|N \setminus X\|+\|N \setminus Y\|}{2}-
51			\frac{\|X \triangle Y\|}{2}}
52			{\frac{\|X\|+\|Y\|}{2} \cdot
53			\frac{\|N \setminus X\|+\|N \setminus Y\|}{2}}
54
55			In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
56			this is
57
58			.. math::
59
60			sim_{KoppenI} =
61			\frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
62			\frac{b+c}{2}}
63			{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}
64
65			Notes
66			-----
67			In the usual case all of the above values should be proportional to the
68			total number of samples n. I.e., a, b, c, d, & n should all be divided by
69			n prior to calculating the coefficient. This class's default normalizer
70			is, accordingly, 'proportional'.
71
72			.. versionadded:: 0.4.0
73
74			"""
75
76			def __init__(
77			self,
78			alphabet: Optional[
79			Union[TCounter[str], Sequence[str], Set[str], int]
80			] = None,
81			tokenizer: Optional[_Tokenizer] = None,
82	1		intersection_type: str = 'crisp',
83			normalizer: str = 'proportional',
84			**kwargs: Any
85			) -> None:
86			"""Initialize KoppenI instance.
87
88			Parameters
89			----------
90			alphabet : Counter, collection, int, or None
91			This represents the alphabet of possible tokens.
92			See :ref:`alphabet <alphabet>` description in
93			:py:class:`_TokenDistance` for details.
94			tokenizer : _Tokenizer
95			A tokenizer instance from the :py:mod:`abydos.tokenizer` package
96			intersection_type : str
97			Specifies the intersection type, and set type as a result:
98			See :ref:`intersection_type <intersection_type>` description in
99			:py:class:`_TokenDistance` for details.
100			normalizer : str
101			Specifies the normalization type. See :ref:`normalizer <alphabet>`
102			description in :py:class:`_TokenDistance` for details.
103			**kwargs
104			Arbitrary keyword arguments
105
106			Other Parameters
107			----------------
108			qval : int
109			The length of each q-gram. Using this parameter and tokenizer=None
110			will cause the instance to use the QGram tokenizer with this
111			q value.
112			metric : _Distance
113			A string distance measure class for use in the ``soft`` and
114			``fuzzy`` variants.
115			threshold : float
116			A threshold value, similarities above which are counted as
117			members of the intersection for the ``fuzzy`` variant.
118
119
120			.. versionadded:: 0.4.0
121
122			"""
123			super(KoppenI, self).__init__(
124			alphabet=alphabet,
125			tokenizer=tokenizer,
126			intersection_type=intersection_type,
127	1		normalizer=normalizer,
128			**kwargs
129			)
130
131			def corr(self, src: str, tar: str) -> float:
132			"""Return the Köppen I correlation of two strings.
133
134			Parameters
135	1		----------
136			src : str
137			Source string (or QGrams/Counter objects) for comparison
138			tar : str
139			Target string (or QGrams/Counter objects) for comparison
140
141			Returns
142			-------
143			float
144			Köppen I correlation
145
146			Examples
147			--------
148			>>> cmp = KoppenI()
149			>>> cmp.corr('cat', 'hat')
150			0.49615384615384617
151			>>> cmp.corr('Niall', 'Neil')
152			0.3575056927658083
153			>>> cmp.corr('aluminum', 'Catalan')
154			0.1068520131813188
155			>>> cmp.corr('ATCG', 'TAGC')
156			-0.006418485237483896
157
158
159			.. versionadded:: 0.4.0
160
161			"""
162			if src == tar:
163			return 1.0
164			self._tokenize(src, tar)
165
166	1		a = self._intersection_card()
167	1		b = self._src_only_card()
168	1		c = self._tar_only_card()
169			d = self._total_complement_card()
170	1
171	1		abac_dbdc_mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4
172	1
173	1		num = abac_dbdc_mean_prod - (b + c) / 2
174			if num:
175	1		return num / abac_dbdc_mean_prod
176			return 0.0
177	1
178	1		def sim(self, src: str, tar: str) -> float:
179	1		"""Return the Köppen I similarity of two strings.
180
181			Parameters
182	1		----------
183			src : str
184			Source string (or QGrams/Counter objects) for comparison
185			tar : str
186			Target string (or QGrams/Counter objects) for comparison
187
188			Returns
189			-------
190			float
191			Köppen I similarity
192
193			Examples
194			--------
195			>>> cmp = KoppenI()
196			>>> cmp.sim('cat', 'hat')
197			0.7480769230769231
198			>>> cmp.sim('Niall', 'Neil')
199			0.6787528463829041
200			>>> cmp.sim('aluminum', 'Catalan')
201			0.5534260065906594
202			>>> cmp.sim('ATCG', 'TAGC')
203			0.49679075738125805
204
205
206			.. versionadded:: 0.4.0
207
208			"""
209			return (1.0 + self.corr(src, tar)) / 2.0
210
211
212			if __name__ == '__main__':
213	1		import doctest
214
215			doctest.testmod()
216

chrislit / abydos

abydos.distance._koppen_i.KoppenI.sim() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

abydos.distance._koppen_i.KoppenI.sim() A
last analyzed 2020-12-31 20:10 UTC