Code Duplication - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Code Duplication Length = 149-180 lines in 3 locations

abydos/distance/_koppen_i.py 1 location


__all__ = ['KoppenI']


class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    For two sets X and Y and an alphabet N, provided that :math:`|X| = |Y|`,
    Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    To support cases where :math:`|X| \neq |Y|`, this class implements a slight
    variation, while still providing the expected results when
    :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    The similarity returned by :py:meth:`sim` is this correlation rescaled
    as :math:`(1 + corr_{KoppenI}) / 2`.

    Notes
    -----
    In the usual case all of the above values should be proportional to the
    total number of samples n. I.e., a, b, c, d, & n should all be divided by
    n prior to calculating the coefficient. This class's default normalizer
    is, accordingly, 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # All configuration is handled by the base class; this subclass only
        # changes the default normalizer to 'proportional'.
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        # Identical inputs are defined to correlate perfectly; no
        # tokenization is performed in that case.
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Product of the two means ((2a+b+c)/2) * ((2d+b+c)/2); it is both
        # the denominator and the leading term of the numerator.
        abac_dbdc_mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4

        num = abac_dbdc_mean_prod - (b + c) / 2
        # A zero numerator yields 0.0 directly, skipping the division.
        if num:
            return num / abac_dbdc_mean_prod
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':

abydos/distance/_gilbert.py 1 location


__all__ = ['Gilbert']


class Gilbert(_TokenDistance):
    r"""Gilbert correlation.

    For two sets X and Y and a population N, the Gilbert correlation
    :cite:`Gilbert:1884` is

        .. math::

            corr_{Gilbert}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 -
            |(N \setminus X) \setminus Y|^2}

    For lack of access to the original, this formula is based on the concurring
    formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Gilbert} =
            \frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}

    The similarity returned by :py:meth:`sim` is this correlation rescaled
    as :math:`(1 + corr_{Gilbert}) / 2`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Gilbert instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # All configuration is forwarded unchanged to the base class.
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gilbert correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert correlation

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.corr('cat', 'hat')
        0.3310580204778157
        >>> cmp.corr('Niall', 'Neil')
        0.21890122402504983
        >>> cmp.corr('aluminum', 'Catalan')
        0.057094811018577836
        >>> cmp.corr('ATCG', 'TAGC')
        -0.003198976327575176


        .. versionadded:: 0.4.0

        """
        # Identical inputs are defined to correlate perfectly; no
        # tokenization is performed in that case.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells a, b, c of the 2x2 confusion table plus the population
        # size n (d is not needed explicitly in this form).
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        n = self._population_unique_card()

        # With n = a+b+c+d (assumed to be what _population_unique_card()
        # yields), a*n - (a+b)(a+c) expands to ad - bc and the denominator
        # below is half of n^2-a^2+b^2+c^2-d^2, so the factor of 2 in the
        # docstring formula cancels.
        num = a * n - (a + b) * (a + c)
        # A zero numerator yields 0.0 directly, skipping the division.
        if num:
            return num / (n * (a + b + c) - (a + b) * (a + c))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Gilbert similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert similarity

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.sim('cat', 'hat')
        0.6655290102389079
        >>> cmp.sim('Niall', 'Neil')
        0.6094506120125249
        >>> cmp.sim('aluminum', 'Catalan')
        0.5285474055092889
        >>> cmp.sim('ATCG', 'TAGC')
        0.4984005118362124


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':

abydos/distance/_maxwell_pilliner.py 1 location


__all__ = ['MaxwellPilliner']


class MaxwellPilliner(_TokenDistance):
    r"""Maxwell & Pilliner correlation.

    For two sets X and Y and a population N, Maxwell & Pilliner correlation
    :cite:`Maxwell:1968` is

        .. math::

            corr_{MaxwellPilliner}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{MaxwellPilliner} =
            \frac{2(ad-bc)}{(a+b)(c+d)+(a+c)(b+d)}

    The similarity returned by :py:meth:`sim` is this correlation rescaled
    as :math:`(1 + corr_{MaxwellPilliner}) / 2`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize MaxwellPilliner instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # All configuration is forwarded unchanged to the base class.
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner correlation

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.35921989956790845
        >>> cmp.corr('aluminum', 'Catalan')
        0.10803030303030303
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Identical inputs are not special-cased here; they are scored by
        # the formula like any other pair.
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        num = a * d - b * c
        # A zero numerator yields 0.0 directly, skipping the division.
        if num:
            return 2 * num / ((a + b) * (c + d) + (a + c) * (b + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner similarity

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6796099497839543
        >>> cmp.sim('aluminum', 'Catalan')
        0.5540151515151515
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':

		@@ 30-209 (lines=180) @@
27		__all__ = ['KoppenI']
28
29
30		class KoppenI(_TokenDistance):
31		r"""Köppen I correlation.
32
33		For two sets X and Y and an alphabet N, provided that :math:`\|X\| = \|Y\|`,
34		Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is
35
36		.. math::
37
38		corr_{KoppenI}(X, Y) =
39		\frac{\|X\| \cdot \|N \setminus X\| - \|X \setminus Y\|}
40		{\|X\| \cdot \|N \setminus X\|}
41
42		To support cases where :math:`\|X\| \neq \|Y\|`, this class implements a slight
43		variation, while still providing the expected results when
44		:math:`\|X\| = \|Y\|`:
45
46		.. math::
47
48		corr_{KoppenI}(X, Y) =
49		\frac{\frac{\|X\|+\|Y\|}{2} \cdot
50		\frac{\|N \setminus X\|+\|N \setminus Y\|}{2}-
51		\frac{\|X \triangle Y\|}{2}}
52		{\frac{\|X\|+\|Y\|}{2} \cdot
53		\frac{\|N \setminus X\|+\|N \setminus Y\|}{2}}
54
55		In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
56		this is
57
58		.. math::
59
60		corr_{KoppenI} =
61		\frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
62		\frac{b+c}{2}}
63		{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}
64
65		Notes
66		-----
67		In the usual case all of the above values should be proportional to the
68		total number of samples n. I.e., a, b, c, d, & n should all be divided by
69		n prior to calculating the coefficient. This class's default normalizer
70		is, accordingly, 'proportional'.
71
72		.. versionadded:: 0.4.0
73
74		"""
75
76		def __init__(
77		self,
78		alphabet: Optional[
79		Union[TCounter[str], Sequence[str], Set[str], int]
80		] = None,
81		tokenizer: Optional[_Tokenizer] = None,
82		intersection_type: str = 'crisp',
83		normalizer: str = 'proportional',
84		**kwargs: Any
85		) -> None:
86		"""Initialize KoppenI instance.
87
88		Parameters
89		----------
90		alphabet : Counter, collection, int, or None
91		This represents the alphabet of possible tokens.
92		See :ref:`alphabet <alphabet>` description in
93		:py:class:`_TokenDistance` for details.
94		tokenizer : _Tokenizer
95		A tokenizer instance from the :py:mod:`abydos.tokenizer` package
96		intersection_type : str
97		Specifies the intersection type, and set type as a result:
98		See :ref:`intersection_type <intersection_type>` description in
99		:py:class:`_TokenDistance` for details.
100		normalizer : str
101		Specifies the normalization type. See :ref:`normalizer <alphabet>`
102		description in :py:class:`_TokenDistance` for details.
103		**kwargs
104		Arbitrary keyword arguments
105
106		Other Parameters
107		----------------
108		qval : int
109		The length of each q-gram. Using this parameter and tokenizer=None
110		will cause the instance to use the QGram tokenizer with this
111		q value.
112		metric : _Distance
113		A string distance measure class for use in the ``soft`` and
114		``fuzzy`` variants.
115		threshold : float
116		A threshold value, similarities above which are counted as
117		members of the intersection for the ``fuzzy`` variant.
118
119
120		.. versionadded:: 0.4.0
121
122		"""
123		super(KoppenI, self).__init__(
124		alphabet=alphabet,
125		tokenizer=tokenizer,
126		intersection_type=intersection_type,
127		normalizer=normalizer,
128		**kwargs
129		)
130
131		def corr(self, src: str, tar: str) -> float:
132		"""Return the Köppen I correlation of two strings.
133
134		Parameters
135		----------
136		src : str
137		Source string (or QGrams/Counter objects) for comparison
138		tar : str
139		Target string (or QGrams/Counter objects) for comparison
140
141		Returns
142		-------
143		float
144		Köppen I correlation
145
146		Examples
147		--------
148		>>> cmp = KoppenI()
149		>>> cmp.corr('cat', 'hat')
150		0.49615384615384617
151		>>> cmp.corr('Niall', 'Neil')
152		0.3575056927658083
153		>>> cmp.corr('aluminum', 'Catalan')
154		0.1068520131813188
155		>>> cmp.corr('ATCG', 'TAGC')
156		-0.006418485237483896
157
158
159		.. versionadded:: 0.4.0
160
161		"""
162		if src == tar:
163		return 1.0
164		self._tokenize(src, tar)
165
166		a = self._intersection_card()
167		b = self._src_only_card()
168		c = self._tar_only_card()
169		d = self._total_complement_card()
170
171		abac_dbdc_mean_prod = (2 * a + b + c) * (2 * d + b + c) / 4
172
173		num = abac_dbdc_mean_prod - (b + c) / 2
174		if num:
175		return num / abac_dbdc_mean_prod
176		return 0.0
177
178		def sim(self, src: str, tar: str) -> float:
179		"""Return the Köppen I similarity of two strings.
180
181		Parameters
182		----------
183		src : str
184		Source string (or QGrams/Counter objects) for comparison
185		tar : str
186		Target string (or QGrams/Counter objects) for comparison
187
188		Returns
189		-------
190		float
191		Köppen I similarity
192
193		Examples
194		--------
195		>>> cmp = KoppenI()
196		>>> cmp.sim('cat', 'hat')
197		0.7480769230769231
198		>>> cmp.sim('Niall', 'Neil')
199		0.6787528463829041
200		>>> cmp.sim('aluminum', 'Catalan')
201		0.5534260065906594
202		>>> cmp.sim('ATCG', 'TAGC')
203		0.49679075738125805
204
205
206		.. versionadded:: 0.4.0
207
208		"""
209		return (1.0 + self.corr(src, tar)) / 2.0
210
211
212		if __name__ == '__main__':

		@@ 30-185 (lines=156) @@
27		__all__ = ['Gilbert']
28
29
30		class Gilbert(_TokenDistance):
31		r"""Gilbert correlation.
32
33		For two sets X and Y and a population N, the Gilbert correlation
34		:cite:`Gilbert:1884` is
35
36		.. math::
37
38		corr_{Gilbert}(X, Y) =
39		\frac{2(\|X \cap Y\| \cdot \|(N \setminus X) \setminus Y\| -
40		\|X \setminus Y\| \cdot \|Y \setminus X\|)}
41		{\|N\|^2 - \|X \cap Y\|^2 + \|X \setminus Y\|^2 + \|Y \setminus X\|^2 -
42		\|(N \setminus X) \setminus Y\|^2}
43
44		For lack of access to the original, this formula is based on the concurring
45		formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`.
46
47		In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
48		this is
49
50		.. math::
51
52		corr_{Gilbert} =
53		\frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}
54
55		.. versionadded:: 0.4.0
56		"""
57
58		def __init__(
59		self,
60		alphabet: Optional[
61		Union[TCounter[str], Sequence[str], Set[str], int]
62		] = None,
63		tokenizer: Optional[_Tokenizer] = None,
64		intersection_type: str = 'crisp',
65		**kwargs: Any
66		) -> None:
67		"""Initialize Gilbert instance.
68
69		Parameters
70		----------
71		alphabet : Counter, collection, int, or None
72		This represents the alphabet of possible tokens.
73		See :ref:`alphabet <alphabet>` description in
74		:py:class:`_TokenDistance` for details.
75		tokenizer : _Tokenizer
76		A tokenizer instance from the :py:mod:`abydos.tokenizer` package
77		intersection_type : str
78		Specifies the intersection type, and set type as a result:
79		See :ref:`intersection_type <intersection_type>` description in
80		:py:class:`_TokenDistance` for details.
81		**kwargs
82		Arbitrary keyword arguments
83
84		Other Parameters
85		----------------
86		qval : int
87		The length of each q-gram. Using this parameter and tokenizer=None
88		will cause the instance to use the QGram tokenizer with this
89		q value.
90		metric : _Distance
91		A string distance measure class for use in the ``soft`` and
92		``fuzzy`` variants.
93		threshold : float
94		A threshold value, similarities above which are counted as
95		members of the intersection for the ``fuzzy`` variant.
96
97
98		.. versionadded:: 0.4.0
99
100		"""
101		super(Gilbert, self).__init__(
102		alphabet=alphabet,
103		tokenizer=tokenizer,
104		intersection_type=intersection_type,
105		**kwargs
106		)
107
108		def corr(self, src: str, tar: str) -> float:
109		"""Return the Gilbert correlation of two strings.
110
111		Parameters
112		----------
113		src : str
114		Source string (or QGrams/Counter objects) for comparison
115		tar : str
116		Target string (or QGrams/Counter objects) for comparison
117
118		Returns
119		-------
120		float
121		Gilbert correlation
122
123		Examples
124		--------
125		>>> cmp = Gilbert()
126		>>> cmp.corr('cat', 'hat')
127		0.3310580204778157
128		>>> cmp.corr('Niall', 'Neil')
129		0.21890122402504983
130		>>> cmp.corr('aluminum', 'Catalan')
131		0.057094811018577836
132		>>> cmp.corr('ATCG', 'TAGC')
133		-0.003198976327575176
134
135
136		.. versionadded:: 0.4.0
137
138		"""
139		if src == tar:
140		return 1.0
141
142		self._tokenize(src, tar)
143
144		a = self._intersection_card()
145		b = self._src_only_card()
146		c = self._tar_only_card()
147		n = self._population_unique_card()
148
149		num = a * n - (a + b) * (a + c)
150		if num:
151		return num / (n * (a + b + c) - (a + b) * (a + c))
152		return 0.0
153
154		def sim(self, src: str, tar: str) -> float:
155		"""Return the Gilbert similarity of two strings.
156
157		Parameters
158		----------
159		src : str
160		Source string (or QGrams/Counter objects) for comparison
161		tar : str
162		Target string (or QGrams/Counter objects) for comparison
163
164		Returns
165		-------
166		float
167		Gilbert similarity
168
169		Examples
170		--------
171		>>> cmp = Gilbert()
172		>>> cmp.sim('cat', 'hat')
173		0.6655290102389079
174		>>> cmp.sim('Niall', 'Neil')
175		0.6094506120125249
176		>>> cmp.sim('aluminum', 'Catalan')
177		0.5285474055092889
178		>>> cmp.sim('ATCG', 'TAGC')
179		0.4984005118362124
180
181
182		.. versionadded:: 0.4.0
183
184		"""
185		return (1.0 + self.corr(src, tar)) / 2.0
186
187
188		if __name__ == '__main__':

		@@ 30-178 (lines=149) @@
27		__all__ = ['MaxwellPilliner']
28
29
30		class MaxwellPilliner(_TokenDistance):
31		r"""Maxwell & Pilliner correlation.
32
33		For two sets X and Y and a population N, Maxwell & Pilliner correlation
34		:cite:`Maxwell:1968` is
35
36		.. math::
37
38		corr_{MaxwellPilliner}(X, Y) =
39		\frac{2(\|X \cap Y\| \cdot \|(N \setminus X) \setminus Y\| -
40		\|X \setminus Y\| \cdot \|Y \setminus X\|)}
41		{\|X\| \cdot \|N \setminus X\| + \|Y\| \cdot \|N \setminus Y\|}
42
43		In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
44		this is
45
46		.. math::
47
48		corr_{MaxwellPilliner} =
49		\frac{2(ad-bc)}{(a+b)(c+d)+(a+c)(b+d)}
50
51		.. versionadded:: 0.4.0
52		"""
53
54		def __init__(
55		self,
56		alphabet: Optional[
57		Union[TCounter[str], Sequence[str], Set[str], int]
58		] = None,
59		tokenizer: Optional[_Tokenizer] = None,
60		intersection_type: str = 'crisp',
61		**kwargs: Any
62		) -> None:
63		"""Initialize MaxwellPilliner instance.
64
65		Parameters
66		----------
67		alphabet : Counter, collection, int, or None
68		This represents the alphabet of possible tokens.
69		See :ref:`alphabet <alphabet>` description in
70		:py:class:`_TokenDistance` for details.
71		tokenizer : _Tokenizer
72		A tokenizer instance from the :py:mod:`abydos.tokenizer` package
73		intersection_type : str
74		Specifies the intersection type, and set type as a result:
75		See :ref:`intersection_type <intersection_type>` description in
76		:py:class:`_TokenDistance` for details.
77		**kwargs
78		Arbitrary keyword arguments
79
80		Other Parameters
81		----------------
82		qval : int
83		The length of each q-gram. Using this parameter and tokenizer=None
84		will cause the instance to use the QGram tokenizer with this
85		q value.
86		metric : _Distance
87		A string distance measure class for use in the ``soft`` and
88		``fuzzy`` variants.
89		threshold : float
90		A threshold value, similarities above which are counted as
91		members of the intersection for the ``fuzzy`` variant.
92
93
94		.. versionadded:: 0.4.0
95
96		"""
97		super(MaxwellPilliner, self).__init__(
98		alphabet=alphabet,
99		tokenizer=tokenizer,
100		intersection_type=intersection_type,
101		**kwargs
102		)
103
104		def corr(self, src: str, tar: str) -> float:
105		"""Return the Maxwell & Pilliner correlation of two strings.
106
107		Parameters
108		----------
109		src : str
110		Source string (or QGrams/Counter objects) for comparison
111		tar : str
112		Target string (or QGrams/Counter objects) for comparison
113
114		Returns
115		-------
116		float
117		Maxwell & Pilliner correlation
118
119		Examples
120		--------
121		>>> cmp = MaxwellPilliner()
122		>>> cmp.corr('cat', 'hat')
123		0.49743589743589745
124		>>> cmp.corr('Niall', 'Neil')
125		0.35921989956790845
126		>>> cmp.corr('aluminum', 'Catalan')
127		0.10803030303030303
128		>>> cmp.corr('ATCG', 'TAGC')
129		-0.006418485237483954
130
131
132		.. versionadded:: 0.4.0
133
134		"""
135		self._tokenize(src, tar)
136
137		a = self._intersection_card()
138		b = self._src_only_card()
139		c = self._tar_only_card()
140		d = self._total_complement_card()
141
142		num = a * d - b * c
143		if num:
144		return 2 * num / ((a + b) * (c + d) + (a + c) * (b + d))
145		return 0.0
146
147		def sim(self, src: str, tar: str) -> float:
148		"""Return the Maxwell & Pilliner similarity of two strings.
149
150		Parameters
151		----------
152		src : str
153		Source string (or QGrams/Counter objects) for comparison
154		tar : str
155		Target string (or QGrams/Counter objects) for comparison
156
157		Returns
158		-------
159		float
160		Maxwell & Pilliner similarity
161
162		Examples
163		--------
164		>>> cmp = MaxwellPilliner()
165		>>> cmp.sim('cat', 'hat')
166		0.7487179487179487
167		>>> cmp.sim('Niall', 'Neil')
168		0.6796099497839543
169		>>> cmp.sim('aluminum', 'Catalan')
170		0.5540151515151515
171		>>> cmp.sim('ATCG', 'TAGC')
172		0.496790757381258
173
174
175		.. versionadded:: 0.4.0
176
177		"""
178		return (1.0 + self.corr(src, tar)) / 2.0
179
180
181		if __name__ == '__main__':

chrislit / abydos

Code Duplication Length = 149-180 lines in 3 locations

abydos/distance/_koppen_i.py 1 location

abydos/distance/_gilbert.py 1 location

abydos/distance/_maxwell_pilliner.py 1 location