@@ 36-172 (lines=137) @@ | ||
33 | __all__ = ['GoodmanKruskalTauB'] |
|
34 | ||
35 | ||
class GoodmanKruskalTauB(_TokenDistance):
    r"""Goodman & Kruskal's Tau B similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    :math:`\tau_b` similarity :cite:`Goodman:1954` is defined as

        .. math::

            sim_{GK_{\tau_b}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+
            \frac{\frac{|Y \setminus X|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}
            {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}

    Written in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n and every cell first turned into a proportion by dividing it
    by n, the same quantity reads

        .. math::

            sim_{GK_{\tau_b}} =
            \frac{
            \frac{a^2 + b^2}{a+b} +
            \frac{c^2 + d^2}{c+d} -
            ((a+c)^2 + (b+d)^2)}
            {1 - ((a+c)^2 + (b+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauB instance.

        All arguments are handed, unchanged, to the :py:class:`_TokenDistance`
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, with it, the set type).
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Selects the normalization type; this measure defaults to
            ``'proportional'``. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value; similarities above it are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        parent = super(GoodmanKruskalTauB, self)
        parent.__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau B similarity. The result is 0.0 when
            either the source total (a+b) or the target total (a+c) is
            zero, and also when the numerator does not exceed 1e-14.

        Examples
        --------
        >>> cmp = GoodmanKruskalTauB()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.2346006486710202
        >>> cmp.sim('aluminum', 'Catalan')
        0.06533810992392582
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells a, b, c, d of the 2x2 table from the class docstring.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        src_total = both + src_only
        tar_total = both + tar_only
        if src_total == 0 or tar_total == 0:
            return 0.0

        # (a^2 + b^2) / (a + b)
        within_src = (both * both + src_only * src_only) / src_total

        # (c^2 + d^2) / (c + d); the division is skipped when the sum of
        # squares is zero, which also avoids dividing by c + d == 0.
        outside_src = tar_only * tar_only + neither * neither
        if outside_src:
            outside_src = outside_src / (tar_only + neither)

        # Squared marginals (a + c)^2 and (b + d)^2, shared by the
        # numerator and the denominator.
        tar_sq = tar_total ** 2
        non_tar_sq = (src_only + neither) ** 2

        num = within_src + outside_src - tar_sq - non_tar_sq
        if num > 1e-14:
            return num / (1 - tar_sq - non_tar_sq)
        return 0.0  # pragma: no cover
|
173 | ||
174 | ||
175 | if __name__ == '__main__': |
@@ 36-172 (lines=137) @@ | ||
33 | __all__ = ['GoodmanKruskalTauA'] |
|
34 | ||
35 | ||
class GoodmanKruskalTauA(_TokenDistance):
    r"""Goodman & Kruskal's Tau A similarity.

    For two sets X and Y and a population N, Goodman & Kruskal's
    :math:`\tau_a` similarity :cite:`Goodman:1954`, by analogy with
    :math:`\tau_b`, is

        .. math::

            sim_{GK_{\tau_a}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+
            \frac{\frac{|X \setminus Y|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus Y|}{|N|}} -
            (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}
            {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n,
    this is

        .. math::

            sim_{GK_{\tau_a}} =
            \frac{
            \frac{a^2 + c^2}{a+c} +
            \frac{b^2 + d^2}{b+d} -
            ((a+b)^2 + (c+d)^2)}
            {1 - ((a+b)^2 + (c+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauA instance.

        All arguments are handed, unchanged, to the :py:class:`_TokenDistance`
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type; this measure defaults to
            ``'proportional'``. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauA, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau A similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau A similarity. The result is 0.0 when
            either the source total (a+b) or the target total (a+c) is
            zero, and also when the numerator does not exceed 1e-14.

        Examples
        --------
        >>> cmp = GoodmanKruskalTauA()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.22137604585914503
        >>> cmp.sim('aluminum', 'Catalan')
        0.05991264724130685
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells a, b, c, d of the 2x2 table from the class docstring.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        src_total = both + src_only
        tar_total = both + tar_only
        if src_total == 0 or tar_total == 0:
            return 0.0

        # (a^2 + c^2) / (a + c)
        within_tar = (both * both + tar_only * tar_only) / tar_total

        # (b^2 + d^2) / (b + d); the division is skipped when the sum of
        # squares is zero, which also avoids dividing by b + d == 0.
        outside_tar = src_only * src_only + neither * neither
        if outside_tar:
            outside_tar = outside_tar / (src_only + neither)

        # Squared marginals (a + b)^2 and (c + d)^2, shared by the
        # numerator and the denominator.
        src_sq = src_total ** 2
        non_src_sq = (tar_only + neither) ** 2

        num = within_tar + outside_tar - src_sq - non_src_sq
        if num > 1e-14:
            return num / (1 - src_sq - non_src_sq)
        return 0.0  # pragma: no cover
|
173 | ||
174 | ||
175 | if __name__ == '__main__': |