| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauB'] |
|
| 34 | ||
| 35 | ||
| 36 | class GoodmanKruskalTauB(_TokenDistance): |
|
| 37 | r"""Goodman & Kruskal's Tau B similarity. |
|
| 38 | ||
| 39 | For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_b` |
|
| 40 | similarity :cite:`Goodman:1954` is |
|
| 41 | ||
| 42 | .. math:: |
|
| 43 | ||
| 44 | sim_{GK_{\tau_b}}(X, Y) = |
|
| 45 | \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + |
|
| 46 | \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+ |
|
| 47 | \frac{\frac{|Y \setminus X|}{|N|}^2 + |
|
| 48 | \frac{|(N \setminus X) \setminus Y|}{|N|}^2} |
|
| 49 | {\frac{|N \setminus X|}{|N|}} - |
|
| 50 | (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} |
|
| 51 | {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} |
|
| 52 | ||
| 53 | In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n, |
|
| 54 | after each term has been converted to a proportion by dividing by n, this |
|
| 55 | is |
|
| 56 | ||
| 57 | .. math:: |
|
| 58 | ||
| 59 | sim_{GK_{\tau_b}} = |
|
| 60 | \frac{ |
|
| 61 | \frac{a^2 + b^2}{a+b} + |
|
| 62 | \frac{c^2 + d^2}{c+d} - |
|
| 63 | ((a+c)^2 + (b+d)^2)} |
|
| 64 | {1 - ((a+c)^2 + (b+d)^2)} |
|
| 65 | ||
| 66 | .. versionadded:: 0.4.0 |
|
| 67 | """ |
|
| 68 | ||
| 69 | def __init__( |
|
| 70 | self, |
|
| 71 | alphabet=None, |
|
| 72 | tokenizer=None, |
|
| 73 | intersection_type='crisp', |
|
| 74 | normalizer='proportional', |
|
| 75 | **kwargs |
|
| 76 | ): |
|
| 77 | """Initialize GoodmanKruskalTauB instance. |
|
| 78 | ||
| 79 | Parameters |
|
| 80 | ---------- |
|
| 81 | alphabet : Counter, collection, int, or None |
|
| 82 | This represents the alphabet of possible tokens. |
|
| 83 | See :ref:`alphabet <alphabet>` description in |
|
| 84 | :py:class:`_TokenDistance` for details. |
|
| 85 | tokenizer : _Tokenizer |
|
| 86 | A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
|
| 87 | intersection_type : str |
|
| 88 | Specifies the intersection type, and set type as a result: |
|
| 89 | See :ref:`intersection_type <intersection_type>` description in |
|
| 90 | :py:class:`_TokenDistance` for details. |
|
| 91 | normalizer : str |
|
| 92 | Specifies the normalization type. See :ref:`normalizer <normalizer>` |
|
| 93 | description in :py:class:`_TokenDistance` for details. |
|
| 94 | **kwargs |
|
| 95 | Arbitrary keyword arguments |
|
| 96 | ||
| 97 | Other Parameters |
|
| 98 | ---------------- |
|
| 99 | qval : int |
|
| 100 | The length of each q-gram. Using this parameter and tokenizer=None |
|
| 101 | will cause the instance to use the QGram tokenizer with this |
|
| 102 | q value. |
|
| 103 | metric : _Distance |
|
| 104 | A string distance measure class for use in the ``soft`` and |
|
| 105 | ``fuzzy`` variants. |
|
| 106 | threshold : float |
|
| 107 | A threshold value, similarities above which are counted as |
|
| 108 | members of the intersection for the ``fuzzy`` variant. |
|
| 109 | ||
| 110 | ||
| 111 | .. versionadded:: 0.4.0 |
|
| 112 | ||
| 113 | """ |
|
| 114 | super(GoodmanKruskalTauB, self).__init__( |
|
| 115 | alphabet=alphabet, |
|
| 116 | tokenizer=tokenizer, |
|
| 117 | intersection_type=intersection_type, |
|
| 118 | normalizer=normalizer, |
|
| 119 | **kwargs |
|
| 120 | ) |
|
| 121 | ||
| 122 | def sim(self, src, tar): |
|
| 123 | """Return Goodman & Kruskal's Tau B similarity of two strings. |
|
| 124 | ||
| 125 | Parameters |
|
| 126 | ---------- |
|
| 127 | src : str |
|
| 128 | Source string (or QGrams/Counter objects) for comparison |
|
| 129 | tar : str |
|
| 130 | Target string (or QGrams/Counter objects) for comparison |
|
| 131 | ||
| 132 | Returns |
|
| 133 | ------- |
|
| 134 | float |
|
| 135 | Goodman & Kruskal's Tau B similarity |
|
| 136 | ||
| 137 | Examples |
|
| 138 | -------- |
|
| 139 | >>> cmp = GoodmanKruskalTauB() |
|
| 140 | >>> cmp.sim('cat', 'hat') |
|
| 141 | 0.3304969657208484 |
|
| 142 | >>> cmp.sim('Niall', 'Neil') |
|
| 143 | 0.2346006486710202 |
|
| 144 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 145 | 0.06533810992392582 |
|
| 146 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 147 | 4.119695274745721e-05 |
|
| 148 | ||
| 149 | ||
| 150 | .. versionadded:: 0.4.0 |
|
| 151 | ||
| 152 | """ |
|
| 153 | self._tokenize(src, tar) |
|
| 154 | ||
| 155 | a = self._intersection_card() |
|
| 156 | b = self._src_only_card() |
|
| 157 | c = self._tar_only_card() |
|
| 158 | d = self._total_complement_card() |
|
| 159 | ||
| 160 | if a + b == 0 or a + c == 0: |
|
| 161 | return 0.0 |
|
| 162 | ||
| 163 | fp = (a * a + b * b) / (a + b) |
|
| 164 | ||
| 165 | sp = c * c + d * d |
|
| 166 | if sp: |
|
| 167 | sp /= c + d |
|
| 168 | ||
| 169 | num = fp + sp - (a + c) ** 2 - (b + d) ** 2 |
|
| 170 | if num > 1e-14: |
|
| 171 | return num / (1 - (a + c) ** 2 - (b + d) ** 2) |
|
| 172 | return 0.0 # pragma: no cover |
|
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': |
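The confusion-table form of Goodman & Kruskal's tau_b in the docstring above maps directly onto the four values `a`, `b`, `c`, `d` that `sim` computes. Below is a minimal standalone sketch of that arithmetic, assuming the four values are already proportions summing to 1 (the abydos class derives them from `_TokenDistance` cardinalities under the `proportional` normalizer, so this is an illustration, not the library's code path):

```python
# Sketch of tau_b from 2x2 confusion-table proportions a, b, c, d,
# where a + b + c + d == 1 (an assumption of this example, matching the
# 'proportional' normalizer described in the docstring above).

def gk_tau_b(a, b, c, d):
    """Goodman & Kruskal's tau_b from confusion-table proportions."""
    fp = (a * a + b * b) / (a + b) if a + b else 0.0
    sp = (c * c + d * d) / (c + d) if c + d else 0.0
    # (a+c) and (b+d) are the marginals of the target set Y.
    marg = (a + c) ** 2 + (b + d) ** 2
    num = fp + sp - marg
    # Clamp values within floating-point noise of zero, as the listing does.
    return num / (1.0 - marg) if num > 1e-14 else 0.0


# Purely illustrative values (not abydos doctest output):
print(gk_tau_b(0.25, 0.25, 0.25, 0.25))  # 0.0 -- an uninformative table
```

The `num > 1e-14` guard mirrors the listing: tiny negative round-off is reported as 0.0 rather than as a spurious negative similarity.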
|
| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauA'] |
|
| 34 | ||
| 35 | ||
| 36 | class GoodmanKruskalTauA(_TokenDistance): |
|
| 37 | r"""Goodman & Kruskal's Tau A similarity. |
|
| 38 | ||
| 39 | For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_a` |
|
| 40 | similarity :cite:`Goodman:1954`, by analogy with :math:`\tau_b`, is |
|
| 41 | ||
| 42 | .. math:: |
|
| 43 | ||
| 44 | sim_{GK_{\tau_a}}(X, Y) = |
|
| 45 | \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + |
|
| 46 | \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+ |
|
| 47 | \frac{\frac{|X \setminus Y|}{|N|}^2 + |
|
| 48 | \frac{|(N \setminus X) \setminus Y|}{|N|}^2} |
|
| 49 | {\frac{|N \setminus Y|}{|N|}} - |
|
| 50 | (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} |
|
| 51 | {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} |
|
| 52 | ||
| 53 | In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n, |
|
| 54 | after each term has been converted to a proportion by dividing by n, this |
|
| 55 | is |
|
| 56 | ||
| 57 | .. math:: |
|
| 58 | ||
| 59 | sim_{GK_{\tau_a}} = |
|
| 60 | \frac{ |
|
| 61 | \frac{a^2 + c^2}{a+c} + |
|
| 62 | \frac{b^2 + d^2}{b+d} - |
|
| 63 | ((a+b)^2 + (c+d)^2)} |
|
| 64 | {1 - ((a+b)^2 + (c+d)^2)} |
|
| 65 | ||
| 66 | .. versionadded:: 0.4.0 |
|
| 67 | """ |
|
| 68 | ||
| 69 | def __init__( |
|
| 70 | self, |
|
| 71 | alphabet=None, |
|
| 72 | tokenizer=None, |
|
| 73 | intersection_type='crisp', |
|
| 74 | normalizer='proportional', |
|
| 75 | **kwargs |
|
| 76 | ): |
|
| 77 | """Initialize GoodmanKruskalTauA instance. |
|
| 78 | ||
| 79 | Parameters |
|
| 80 | ---------- |
|
| 81 | alphabet : Counter, collection, int, or None |
|
| 82 | This represents the alphabet of possible tokens. |
|
| 83 | See :ref:`alphabet <alphabet>` description in |
|
| 84 | :py:class:`_TokenDistance` for details. |
|
| 85 | tokenizer : _Tokenizer |
|
| 86 | A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
|
| 87 | intersection_type : str |
|
| 88 | Specifies the intersection type, and set type as a result: |
|
| 89 | See :ref:`intersection_type <intersection_type>` description in |
|
| 90 | :py:class:`_TokenDistance` for details. |
|
| 91 | normalizer : str |
|
| 92 | Specifies the normalization type. See :ref:`normalizer <normalizer>` |
|
| 93 | description in :py:class:`_TokenDistance` for details. |
|
| 94 | **kwargs |
|
| 95 | Arbitrary keyword arguments |
|
| 96 | ||
| 97 | Other Parameters |
|
| 98 | ---------------- |
|
| 99 | qval : int |
|
| 100 | The length of each q-gram. Using this parameter and tokenizer=None |
|
| 101 | will cause the instance to use the QGram tokenizer with this |
|
| 102 | q value. |
|
| 103 | metric : _Distance |
|
| 104 | A string distance measure class for use in the ``soft`` and |
|
| 105 | ``fuzzy`` variants. |
|
| 106 | threshold : float |
|
| 107 | A threshold value, similarities above which are counted as |
|
| 108 | members of the intersection for the ``fuzzy`` variant. |
|
| 109 | ||
| 110 | ||
| 111 | .. versionadded:: 0.4.0 |
|
| 112 | ||
| 113 | """ |
|
| 114 | super(GoodmanKruskalTauA, self).__init__( |
|
| 115 | alphabet=alphabet, |
|
| 116 | tokenizer=tokenizer, |
|
| 117 | intersection_type=intersection_type, |
|
| 118 | normalizer=normalizer, |
|
| 119 | **kwargs |
|
| 120 | ) |
|
| 121 | ||
| 122 | def sim(self, src, tar): |
|
| 123 | """Return Goodman & Kruskal's Tau A similarity of two strings. |
|
| 124 | ||
| 125 | Parameters |
|
| 126 | ---------- |
|
| 127 | src : str |
|
| 128 | Source string (or QGrams/Counter objects) for comparison |
|
| 129 | tar : str |
|
| 130 | Target string (or QGrams/Counter objects) for comparison |
|
| 131 | ||
| 132 | Returns |
|
| 133 | ------- |
|
| 134 | float |
|
| 135 | Goodman & Kruskal's Tau A similarity |
|
| 136 | ||
| 137 | Examples |
|
| 138 | -------- |
|
| 139 | >>> cmp = GoodmanKruskalTauA() |
|
| 140 | >>> cmp.sim('cat', 'hat') |
|
| 141 | 0.3304969657208484 |
|
| 142 | >>> cmp.sim('Niall', 'Neil') |
|
| 143 | 0.22137604585914503 |
|
| 144 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 145 | 0.05991264724130685 |
|
| 146 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 147 | 4.119695274745721e-05 |
|
| 148 | ||
| 149 | ||
| 150 | .. versionadded:: 0.4.0 |
|
| 151 | ||
| 152 | """ |
|
| 153 | self._tokenize(src, tar) |
|
| 154 | ||
| 155 | a = self._intersection_card() |
|
| 156 | b = self._src_only_card() |
|
| 157 | c = self._tar_only_card() |
|
| 158 | d = self._total_complement_card() |
|
| 159 | ||
| 160 | if a + b == 0 or a + c == 0: |
|
| 161 | return 0.0 |
|
| 162 | ||
| 163 | fp = (a * a + c * c) / (a + c) |
|
| 164 | ||
| 165 | sp = b * b + d * d |
|
| 166 | if sp: |
|
| 167 | sp /= b + d |
|
| 168 | ||
| 169 | num = fp + sp - (a + b) ** 2 - (c + d) ** 2 |
|
| 170 | if num > 1e-14: |
|
| 171 | return num / (1 - (a + b) ** 2 - (c + d) ** 2) |
|
| 172 | return 0.0 # pragma: no cover |
|
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': |
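The tau_a variant above differs from tau_b only in which marginal of the confusion table it conditions on: the roles of `b` and `c` are swapped, and the subtracted marginal term uses (a+b) and (c+d) instead of (a+c) and (b+d). A hedged sketch under the same assumption as before (proportions summing to 1; not the abydos code path):

```python
# Sketch of tau_a from 2x2 confusion-table proportions a, b, c, d,
# where a + b + c + d == 1 (an assumption of this example, matching the
# 'proportional' normalizer described in the docstring above).

def gk_tau_a(a, b, c, d):
    """Goodman & Kruskal's tau_a from confusion-table proportions."""
    fp = (a * a + c * c) / (a + c) if a + c else 0.0
    sp = (b * b + d * d) / (b + d) if b + d else 0.0
    # (a+b) and (c+d) are the marginals of the source set X.
    marg = (a + b) ** 2 + (c + d) ** 2
    num = fp + sp - marg
    return num / (1.0 - marg) if num > 1e-14 else 0.0


# Purely illustrative: all mass on the diagonal gives perfect association.
print(gk_tau_a(0.5, 0.0, 0.0, 0.5))  # 1.0
```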
|