Code Duplication    Length = 137-137 lines in 2 locations

abydos/distance/_goodman_kruskal_tau_b.py 1 location

@@ 36-172 (lines=137) @@
33
# Names exported when this module is star-imported.
__all__ = ['GoodmanKruskalTauB']
34
35
36
class GoodmanKruskalTauB(_TokenDistance):
    r"""Goodman & Kruskal's Tau B similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    :math:`\tau_b` similarity :cite:`Goodman:1954` is defined as

        .. math::

            sim_{GK_{\tau_b}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+
            \frac{\frac{|Y \setminus X|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}
            {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n and every cell first divided by n so that it is a proportion,
    the same quantity reads

        .. math::

            sim_{GK_{\tau_b}} =
            \frac{
            \frac{a^2 + b^2}{a+b} +
            \frac{c^2 + d^2}{c+d} -
            ((a+c)^2 + (b+d)^2)}
            {1 - ((a+c)^2 + (b+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauB instance.

        Every argument is forwarded unchanged to the parent initializer;
        the only difference from the parent's signature visible here is
        that ``normalizer`` defaults to ``'proportional'``.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type, and thereby the set type:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Selects the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauB, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau B similarity; 0.0 when either input
            contributes no tokens or when the numerator does not exceed
            1e-14

        Examples
        --------
        >>> cmp = GoodmanKruskalTauB()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.2346006486710202
        >>> cmp.sim('aluminum', 'Catalan')
        0.06533810992392582
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        src_total = a + b
        tar_total = a + c
        if src_total == 0 or tar_total == 0:
            return 0.0

        # (a^2 + b^2) / (a + b): the source-row term of the formula.
        src_row_term = (a * a + b * b) / src_total

        # (c^2 + d^2) / (c + d), left as zero when the sum of squares is
        # zero so that an empty row never triggers a division by zero.
        rest_row_sq = c * c + d * d
        rest_row_term = rest_row_sq / (c + d) if rest_row_sq else rest_row_sq

        # Squared column marginals shared by numerator and denominator.
        tar_col_sq = tar_total ** 2
        rest_col_sq = (b + d) ** 2

        numerator = src_row_term + rest_row_term - tar_col_sq - rest_col_sq
        # Numerators at or below 1e-14 are reported as 0.0 (presumably to
        # absorb floating-point noise -- confirm with the author).
        if numerator > 1e-14:
            return numerator / (1 - tar_col_sq - rest_col_sq)
        return 0.0  # pragma: no cover
173
174
175
if __name__ == '__main__':

abydos/distance/_goodman_kruskal_tau_a.py 1 location

@@ 36-172 (lines=137) @@
33
# Names exported when this module is star-imported.
__all__ = ['GoodmanKruskalTauA']
34
35
36
class GoodmanKruskalTauA(_TokenDistance):
    r"""Goodman & Kruskal's Tau A similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    :math:`\tau_a` similarity :cite:`Goodman:1954`, by analogy with
    :math:`\tau_b`, is defined as

        .. math::

            sim_{GK_{\tau_a}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+
            \frac{\frac{|X \setminus Y|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}
            {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n and every cell first divided by n so that it is a proportion,
    the same quantity reads

        .. math::

            sim_{GK_{\tau_a}} =
            \frac{
            \frac{a^2 + c^2}{a+c} +
            \frac{b^2 + d^2}{b+d} -
            ((a+b)^2 + (c+d)^2)}
            {1 - ((a+b)^2 + (c+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauA instance.

        Every argument is forwarded unchanged to the parent initializer;
        the only difference from the parent's signature visible here is
        that ``normalizer`` defaults to ``'proportional'``.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type, and thereby the set type:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Selects the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauA, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau A similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau A similarity; 0.0 when either input
            contributes no tokens or when the numerator does not exceed
            1e-14

        Examples
        --------
        >>> cmp = GoodmanKruskalTauA()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.22137604585914503
        >>> cmp.sim('aluminum', 'Catalan')
        0.05991264724130685
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table described in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        src_total = a + b
        tar_total = a + c
        if src_total == 0 or tar_total == 0:
            return 0.0

        # (a^2 + c^2) / (a + c): the target-column term of the formula.
        tar_col_term = (a * a + c * c) / tar_total

        # (b^2 + d^2) / (b + d), left as zero when the sum of squares is
        # zero so that an empty column never triggers a division by zero.
        rest_col_sq = b * b + d * d
        rest_col_term = rest_col_sq / (b + d) if rest_col_sq else rest_col_sq

        # Squared row marginals shared by numerator and denominator.
        src_row_sq = src_total ** 2
        rest_row_sq = (c + d) ** 2

        numerator = tar_col_term + rest_col_term - src_row_sq - rest_row_sq
        # Numerators at or below 1e-14 are reported as 0.0 (presumably to
        # absorb floating-point noise -- confirm with the author).
        if numerator > 1e-14:
            return numerator / (1 - src_row_sq - rest_row_sq)
        return 0.0  # pragma: no cover
173
174
175
if __name__ == '__main__':