| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauB'] | |
| 34 | ||
| 35 | ||
class GoodmanKruskalTauB(_TokenDistance):
    r"""Goodman & Kruskal's Tau B similarity.

    For two sets X and Y and a population N, Goodman & Kruskal's
    :math:`\tau_b` similarity :cite:`Goodman:1954` is

        .. math::

            sim_{GK_{\tau_b}}(X, Y) =
            \frac{
            \frac{\left(\frac{|X \cap Y|}{|N|}\right)^2 +
            \left(\frac{|X \setminus Y|}{|N|}\right)^2}{\frac{|X|}{|N|}} +
            \frac{\left(\frac{|Y \setminus X|}{|N|}\right)^2 +
            \left(\frac{|(N \setminus X) \setminus Y|}{|N|}\right)^2}
            {\frac{|N \setminus X|}{|N|}} -
            \left(\left(\frac{|Y|}{|N|}\right)^2 +
            \left(\frac{|N \setminus Y|}{|N|}\right)^2\right)}
            {1 - \left(\left(\frac{|Y|}{|N|}\right)^2 +
            \left(\frac{|N \setminus Y|}{|N|}\right)^2\right)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n, this
    is

        .. math::

            sim_{GK_{\tau_b}} =
            \frac{
            \frac{a^2 + b^2}{a+b} +
            \frac{c^2 + d^2}{c+d} -
            ((a+c)^2 + (b+d)^2)}
            {1 - ((a+c)^2 + (b+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauB instance.

        All arguments are forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
            Defaults to ``'proportional'``; the formula used by :meth:`sim`
            is stated in terms of proportions.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauB, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau B similarity; 0.0 when either of the
            sums a+b or a+c is zero, or when the numerator does not exceed
            1e-14

        Examples
        --------
        >>> cmp = GoodmanKruskalTauB()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.2346006486710202
        >>> cmp.sim('aluminum', 'Catalan')
        0.06533810992392582
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # The four cells a, b, c, d of the 2x2 table, in that order.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        # Same test as "a + b == 0 or a + c == 0".
        if not (both + src_only and both + tar_only):
            return 0.0

        # (a^2 + b^2) / (a + b)
        first_term = (both * both + src_only * src_only) / (both + src_only)

        # (c^2 + d^2) / (c + d); the division is skipped when the sum of
        # squares is zero, which also avoids dividing by zero.
        second_term = tar_only * tar_only + neither * neither
        if second_term:
            second_term = second_term / (tar_only + neither)

        tar_sq = (both + tar_only) ** 2
        non_tar_sq = (src_only + neither) ** 2

        # Subtraction order is kept left-to-right so the floating-point
        # result is unchanged.
        num = first_term + second_term - tar_sq - non_tar_sq
        if not num > 1e-14:
            return 0.0  # pragma: no cover
        return num / (1 - tar_sq - non_tar_sq)
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': | |
| @@ 36-172 (lines=137) @@ | ||
| 33 | __all__ = ['GoodmanKruskalTauA'] | |
| 34 | ||
| 35 | ||
class GoodmanKruskalTauA(_TokenDistance):
    r"""Goodman & Kruskal's Tau A similarity.

    For two sets X and Y and a population N, Goodman & Kruskal's
    :math:`\tau_a` similarity :cite:`Goodman:1954`, by analogy with
    :math:`\tau_b`, is

        .. math::

            sim_{GK_{\tau_a}}(X, Y) =
            \frac{
            \frac{\left(\frac{|X \cap Y|}{|N|}\right)^2 +
            \left(\frac{|Y \setminus X|}{|N|}\right)^2}{\frac{|Y|}{|N|}} +
            \frac{\left(\frac{|X \setminus Y|}{|N|}\right)^2 +
            \left(\frac{|(N \setminus X) \setminus Y|}{|N|}\right)^2}
            {\frac{|N \setminus Y|}{|N|}} -
            \left(\left(\frac{|X|}{|N|}\right)^2 +
            \left(\frac{|N \setminus X|}{|N|}\right)^2\right)}
            {1 - \left(\left(\frac{|X|}{|N|}\right)^2 +
            \left(\frac{|N \setminus X|}{|N|}\right)^2\right)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n, this
    is

        .. math::

            sim_{GK_{\tau_a}} =
            \frac{
            \frac{a^2 + c^2}{a+c} +
            \frac{b^2 + d^2}{b+d} -
            ((a+b)^2 + (c+d)^2)}
            {1 - ((a+b)^2 + (c+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        normalizer='proportional',
        **kwargs
    ):
        """Initialize GoodmanKruskalTauA instance.

        All arguments are forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
            Defaults to ``'proportional'``; the formula used by :meth:`sim`
            is stated in terms of proportions.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(GoodmanKruskalTauA, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src, tar):
        """Return Goodman & Kruskal's Tau A similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau A similarity; 0.0 when either of the
            sums a+b or a+c is zero, or when the numerator does not exceed
            1e-14

        Examples
        --------
        >>> cmp = GoodmanKruskalTauA()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.22137604585914503
        >>> cmp.sim('aluminum', 'Catalan')
        0.05991264724130685
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # The four cells of the 2x2 table used by the class-level formula.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Bail out before dividing by a + c below; a + b == 0 is treated the
        # same way.
        if a + b == 0 or a + c == 0:
            return 0.0

        # First term: (a^2 + c^2) / (a + c)
        fp = (a * a + c * c) / (a + c)

        # Second term: (b^2 + d^2) / (b + d). The division is skipped when
        # the sum of squares is zero, which also avoids dividing by zero.
        sp = b * b + d * d
        if sp:
            sp /= b + d

        # Subtract the squared marginals (a+b)^2 and (c+d)^2.
        num = fp + sp - (a + b) ** 2 - (c + d) ** 2
        # Values at or below 1e-14 are reported as 0.0 (presumably to absorb
        # floating-point rounding noise).
        if num > 1e-14:
            return num / (1 - (a + b) ** 2 - (c + d) ** 2)
        return 0.0  # pragma: no cover
| 173 | ||
| 174 | ||
| 175 | if __name__ == '__main__': | |