| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauB'] |
|
| 34 | ||
| 35 | ||
| 36 | class GoodmanKruskalTauB(_TokenDistance): |
|
| 37 | r"""Goodman & Kruskal's Tau B similarity. |
|
| 38 | ||
| 39 | For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_b` |
|
| 40 | similarity :cite:`Goodman:1954` is |
|
| 41 | ||
| 42 | .. math:: |
|
| 43 | ||
| 44 | sim_{GK_{\tau_b}}(X, Y) = |
|
| 45 | \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + |
|
| 46 | \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+ |
|
| 47 | \frac{\frac{|Y \setminus X|}{|N|}^2 + |
|
| 48 | \frac{|(N \setminus X) \setminus Y|}{|N|}^2} |
|
| 49 | {\frac{|N \setminus X|}{|N|}} - |
|
| 50 | (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} |
|
| 51 | {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)} |
|
| 52 | ||
| 53 | In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n, |
|
| 54 | after each term has been converted to a proportion by dividing by n, this |
|
| 55 | is |
|
| 56 | ||
| 57 | .. math:: |
|
| 58 | ||
| 59 | sim_{GK_{\tau_b}} = |
|
| 60 | \frac{ |
|
| 61 | \frac{a^2 + b^2}{a+b} + |
|
| 62 | \frac{c^2 + d^2}{c+d} - |
|
| 63 | ((a+c)^2 + (b+d)^2)} |
|
| 64 | {1 - ((a+c)^2 + (b+d)^2)} |
|
| 65 | ||
| 66 | .. versionadded:: 0.4.0 |
|
| 67 | """ |
|
| 68 | ||
| 69 | def __init__( |
|
| 70 | self, |
|
| 71 | alphabet=None, |
|
| 72 | tokenizer=None, |
|
| 73 | intersection_type='crisp', |
|
| 74 | normalizer='proportional', |
|
| 75 | **kwargs |
|
| 76 | ): |
|
| 77 | """Initialize GoodmanKruskalTauB instance. |
|
| 78 | ||
| 79 | Parameters |
|
| 80 | ---------- |
|
| 81 | alphabet : Counter, collection, int, or None |
|
| 82 | This represents the alphabet of possible tokens. |
|
| 83 | See :ref:`alphabet <alphabet>` description in |
|
| 84 | :py:class:`_TokenDistance` for details. |
|
| 85 | tokenizer : _Tokenizer |
|
| 86 | A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
|
| 87 | intersection_type : str |
|
| 88 | Specifies the intersection type, and set type as a result: |
|
| 89 | See :ref:`intersection_type <intersection_type>` description in |
|
| 90 | :py:class:`_TokenDistance` for details. |
|
| 91 | normalizer : str |
|
| 92 | Specifies the normalization type. See :ref:`normalizer <normalizer>` |
|
| 93 | description in :py:class:`_TokenDistance` for details. |
|
| 94 | **kwargs |
|
| 95 | Arbitrary keyword arguments |
|
| 96 | ||
| 97 | Other Parameters |
|
| 98 | ---------------- |
|
| 99 | qval : int |
|
| 100 | The length of each q-gram. Using this parameter and tokenizer=None |
|
| 101 | will cause the instance to use the QGram tokenizer with this |
|
| 102 | q value. |
|
| 103 | metric : _Distance |
|
| 104 | A string distance measure class for use in the ``soft`` and |
|
| 105 | ``fuzzy`` variants. |
|
| 106 | threshold : float |
|
| 107 | A threshold value, similarities above which are counted as |
|
| 108 | members of the intersection for the ``fuzzy`` variant. |
|
| 109 | ||
| 110 | ||
| 111 | .. versionadded:: 0.4.0 |
|
| 112 | ||
| 113 | """ |
|
| 114 | super(GoodmanKruskalTauB, self).__init__( |
|
| 115 | alphabet=alphabet, |
|
| 116 | tokenizer=tokenizer, |
|
| 117 | intersection_type=intersection_type, |
|
| 118 | normalizer=normalizer, |
|
| 119 | **kwargs |
|
| 120 | ) |
|
| 121 | ||
| 122 | def sim(self, src, tar): |
|
| 123 | """Return Goodman & Kruskal's Tau B similarity of two strings. |
|
| 124 | ||
| 125 | Parameters |
|
| 126 | ---------- |
|
| 127 | src : str |
|
| 128 | Source string (or QGrams/Counter objects) for comparison |
|
| 129 | tar : str |
|
| 130 | Target string (or QGrams/Counter objects) for comparison |
|
| 131 | ||
| 132 | Returns |
|
| 133 | ------- |
|
| 134 | float |
|
| 135 | Goodman & Kruskal's Tau B similarity |
|
| 136 | ||
| 137 | Examples |
|
| 138 | -------- |
|
| 139 | >>> cmp = GoodmanKruskalTauB() |
|
| 140 | >>> cmp.sim('cat', 'hat') |
|
| 141 | 0.3304969657208484 |
|
| 142 | >>> cmp.sim('Niall', 'Neil') |
|
| 143 | 0.2346006486710202 |
|
| 144 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 145 | 0.06533810992392582 |
|
| 146 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 147 | 4.119695274745721e-05 |
|
| 148 | ||
| 149 | ||
| 150 | .. versionadded:: 0.4.0 |
|
| 151 | ||
| 152 | """ |
|
| 153 | self._tokenize(src, tar) |
|
| 154 | ||
| 155 | a = self._intersection_card() |
|
| 156 | b = self._src_only_card() |
|
| 157 | c = self._tar_only_card() |
|
| 158 | d = self._total_complement_card() |
|
| 159 | ||
| 160 | if a + b == 0 or a + c == 0: |
|
| 161 | return 0.0 |
|
| 162 | ||
| 163 | fp = (a * a + b * b) / (a + b) |
|
| 164 | ||
| 165 | sp = c * c + d * d |
|
| 166 | if sp: |
|
| 167 | sp /= c + d |
|
| 168 | ||
| 169 | num = fp + sp - (a + c) ** 2 - (b + d) ** 2 |
|
| 170 | if num > 1e-14: |
|
| 171 | return num / (1 - (a + c) ** 2 - (b + d) ** 2) |
|
| 172 | return 0.0 # pragma: no cover |
|
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': |
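The confusion-table form of Goodman & Kruskal's tau_b in the docstring above maps directly onto the four values `a`, `b`, `c`, `d` that `sim` computes. Below is a minimal standalone sketch of that arithmetic, assuming the four values are already proportions summing to 1 (the abydos class derives them from `_TokenDistance` cardinalities under the `proportional` normalizer, so this is an illustration, not the library's code path):

```python
# Sketch of tau_b from 2x2 confusion-table proportions a, b, c, d,
# where a + b + c + d == 1 (an assumption of this example, matching the
# 'proportional' normalizer described in the docstring above).

def gk_tau_b(a, b, c, d):
    """Goodman & Kruskal's tau_b from confusion-table proportions."""
    fp = (a * a + b * b) / (a + b) if a + b else 0.0
    sp = (c * c + d * d) / (c + d) if c + d else 0.0
    # (a+c) and (b+d) are the marginals of the target set Y.
    marg = (a + c) ** 2 + (b + d) ** 2
    num = fp + sp - marg
    # Clamp values within floating-point noise of zero, as the listing does.
    return num / (1.0 - marg) if num > 1e-14 else 0.0


# Purely illustrative values (not abydos doctest output):
print(gk_tau_b(0.25, 0.25, 0.25, 0.25))  # 0.0 -- an uninformative table
```

The `num > 1e-14` guard mirrors the listing: tiny negative round-off is reported as 0.0 rather than as a spurious negative similarity.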
|
| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauA'] |
|
| 34 | ||
| 35 | ||
| 36 | class GoodmanKruskalTauA(_TokenDistance): |
|
| 37 | r"""Goodman & Kruskal's Tau A similarity. |
|
| 38 | ||
| 39 | For two sets X and Y and a population N, Goodman & Kruskal's :math:`\tau_a` |
|
| 40 | similarity :cite:`Goodman:1954`, by analogy with :math:`\tau_b`, is |
|
| 41 | ||
| 42 | .. math:: |
|
| 43 | ||
| 44 | sim_{GK_{\tau_a}}(X, Y) = |
|
| 45 | \frac{\frac{\frac{|X \cap Y|}{|N|}^2 + |
|
| 46 | \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+ |
|
| 47 | \frac{\frac{|X \setminus Y|}{|N|}^2 + |
|
| 48 | \frac{|(N \setminus X) \setminus Y|}{|N|}^2} |
|
| 49 | {\frac{|N \setminus Y|}{|N|}} - |
|
| 50 | (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} |
|
| 51 | {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)} |
|
| 52 | ||
| 53 | In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n, |
|
| 54 | after each term has been converted to a proportion by dividing by n, this |
|
| 55 | is |
|
| 56 | ||
| 57 | .. math:: |
|
| 58 | ||
| 59 | sim_{GK_{\tau_a}} = |
|
| 60 | \frac{ |
|
| 61 | \frac{a^2 + c^2}{a+c} + |
|
| 62 | \frac{b^2 + d^2}{b+d} - |
|
| 63 | ((a+b)^2 + (c+d)^2)} |
|
| 64 | {1 - ((a+b)^2 + (c+d)^2)} |
|
| 65 | ||
| 66 | .. versionadded:: 0.4.0 |
|
| 67 | """ |
|
| 68 | ||
| 69 | def __init__( |
|
| 70 | self, |
|
| 71 | alphabet=None, |
|
| 72 | tokenizer=None, |
|
| 73 | intersection_type='crisp', |
|
| 74 | normalizer='proportional', |
|
| 75 | **kwargs |
|
| 76 | ): |
|
| 77 | """Initialize GoodmanKruskalTauA instance. |
|
| 78 | ||
| 79 | Parameters |
|
| 80 | ---------- |
|
| 81 | alphabet : Counter, collection, int, or None |
|
| 82 | This represents the alphabet of possible tokens. |
|
| 83 | See :ref:`alphabet <alphabet>` description in |
|
| 84 | :py:class:`_TokenDistance` for details. |
|
| 85 | tokenizer : _Tokenizer |
|
| 86 | A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
|
| 87 | intersection_type : str |
|
| 88 | Specifies the intersection type, and set type as a result: |
|
| 89 | See :ref:`intersection_type <intersection_type>` description in |
|
| 90 | :py:class:`_TokenDistance` for details. |
|
| 91 | normalizer : str |
|
| 92 | Specifies the normalization type. See :ref:`normalizer <normalizer>` |
|
| 93 | description in :py:class:`_TokenDistance` for details. |
|
| 94 | **kwargs |
|
| 95 | Arbitrary keyword arguments |
|
| 96 | ||
| 97 | Other Parameters |
|
| 98 | ---------------- |
|
| 99 | qval : int |
|
| 100 | The length of each q-gram. Using this parameter and tokenizer=None |
|
| 101 | will cause the instance to use the QGram tokenizer with this |
|
| 102 | q value. |
|
| 103 | metric : _Distance |
|
| 104 | A string distance measure class for use in the ``soft`` and |
|
| 105 | ``fuzzy`` variants. |
|
| 106 | threshold : float |
|
| 107 | A threshold value, similarities above which are counted as |
|
| 108 | members of the intersection for the ``fuzzy`` variant. |
|
| 109 | ||
| 110 | ||
| 111 | .. versionadded:: 0.4.0 |
|
| 112 | ||
| 113 | """ |
|
| 114 | super(GoodmanKruskalTauA, self).__init__( |
|
| 115 | alphabet=alphabet, |
|
| 116 | tokenizer=tokenizer, |
|
| 117 | intersection_type=intersection_type, |
|
| 118 | normalizer=normalizer, |
|
| 119 | **kwargs |
|
| 120 | ) |
|
| 121 | ||
| 122 | def sim(self, src, tar): |
|
| 123 | """Return Goodman & Kruskal's Tau A similarity of two strings. |
|
| 124 | ||
| 125 | Parameters |
|
| 126 | ---------- |
|
| 127 | src : str |
|
| 128 | Source string (or QGrams/Counter objects) for comparison |
|
| 129 | tar : str |
|
| 130 | Target string (or QGrams/Counter objects) for comparison |
|
| 131 | ||
| 132 | Returns |
|
| 133 | ------- |
|
| 134 | float |
|
| 135 | Goodman & Kruskal's Tau A similarity |
|
| 136 | ||
| 137 | Examples |
|
| 138 | -------- |
|
| 139 | >>> cmp = GoodmanKruskalTauA() |
|
| 140 | >>> cmp.sim('cat', 'hat') |
|
| 141 | 0.3304969657208484 |
|
| 142 | >>> cmp.sim('Niall', 'Neil') |
|
| 143 | 0.22137604585914503 |
|
| 144 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 145 | 0.05991264724130685 |
|
| 146 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 147 | 4.119695274745721e-05 |
|
| 148 | ||
| 149 | ||
| 150 | .. versionadded:: 0.4.0 |
|
| 151 | ||
| 152 | """ |
|
| 153 | self._tokenize(src, tar) |
|
| 154 | ||
| 155 | a = self._intersection_card() |
|
| 156 | b = self._src_only_card() |
|
| 157 | c = self._tar_only_card() |
|
| 158 | d = self._total_complement_card() |
|
| 159 | ||
| 160 | if a + b == 0 or a + c == 0: |
|
| 161 | return 0.0 |
|
| 162 | ||
| 163 | fp = (a * a + c * c) / (a + c) |
|
| 164 | ||
| 165 | sp = b * b + d * d |
|
| 166 | if sp: |
|
| 167 | sp /= b + d |
|
| 168 | ||
| 169 | num = fp + sp - (a + b) ** 2 - (c + d) ** 2 |
|
| 170 | if num > 1e-14: |
|
| 171 | return num / (1 - (a + b) ** 2 - (c + d) ** 2) |
|
| 172 | return 0.0 # pragma: no cover |
|
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': |
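The tau_a variant above differs from tau_b only in which marginal of the confusion table it conditions on: the roles of `b` and `c` are swapped, and the subtracted marginal term uses (a+b) and (c+d) instead of (a+c) and (b+d). A hedged sketch under the same assumption as before (proportions summing to 1; not the abydos code path):

```python
# Sketch of tau_a from 2x2 confusion-table proportions a, b, c, d,
# where a + b + c + d == 1 (an assumption of this example, matching the
# 'proportional' normalizer described in the docstring above).

def gk_tau_a(a, b, c, d):
    """Goodman & Kruskal's tau_a from confusion-table proportions."""
    fp = (a * a + c * c) / (a + c) if a + c else 0.0
    sp = (b * b + d * d) / (b + d) if b + d else 0.0
    # (a+b) and (c+d) are the marginals of the source set X.
    marg = (a + b) ** 2 + (c + d) ** 2
    num = fp + sp - marg
    return num / (1.0 - marg) if num > 1e-14 else 0.0


# Purely illustrative: all mass on the diagonal gives perfect association.
print(gk_tau_a(0.5, 0.0, 0.0, 0.5))  # 1.0
```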
|