| @@ 30-209 (lines=180) @@ | ||
| 27 | __all__ = ['KoppenI'] |
|
| 28 | ||
| 29 | ||
class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    Given two sets X and Y drawn from an alphabet N, and provided that
    :math:`|X| = |Y|`, Köppen I correlation
    :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    So that inputs with :math:`|X| \neq |Y|` are also supported, this class
    implements a slight variation that still yields the expected results
    whenever :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    Notes
    -----
    In the usual case all of the above values should be proportional to the
    total number of samples n. I.e., a, b, c, d, & n should all be divided by
    n prior to calculating the coefficient. This class's default normalizer
    is, accordingly, 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        # Equal inputs are reported as perfectly correlated without
        # tokenizing.
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        # Confusion-table cells a, b, c, d of the class docstring.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        # (2a+b+c)/2 * (2d+b+c)/2, evaluated as a single product over 4.
        mean_product = (
            (2 * both + src_only + tar_only)
            * (2 * neither + src_only + tar_only)
            / 4
        )

        numerator = mean_product - (src_only + tar_only) / 2
        # A zero numerator yields 0.0 directly, skipping the division.
        return numerator / mean_product if numerator else 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` mapped
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
| 210 | ||
| 211 | ||
| 212 | if __name__ == '__main__': |
|
| @@ 30-185 (lines=156) @@ | ||
| 27 | __all__ = ['Gilbert'] |
|
| 28 | ||
| 29 | ||
class Gilbert(_TokenDistance):
    r"""Gilbert correlation.

    Given two sets X and Y and a population N, the Gilbert correlation
    :cite:`Gilbert:1884` is

        .. math::

            corr_{Gilbert}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 -
            |(N \setminus X) \setminus Y|^2}

    For lack of access to the original, this formula is based on the concurring
    formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Gilbert} =
            \frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Gilbert instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gilbert correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert correlation

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.corr('cat', 'hat')
        0.3310580204778157
        >>> cmp.corr('Niall', 'Neil')
        0.21890122402504983
        >>> cmp.corr('aluminum', 'Catalan')
        0.057094811018577836
        >>> cmp.corr('ATCG', 'TAGC')
        -0.003198976327575176


        .. versionadded:: 0.4.0

        """
        # Equal inputs are reported as perfectly correlated without
        # tokenizing.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells a, b, c and the population size n.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        population = self._population_unique_card()

        # (a+b)(a+c) appears in both numerator and denominator.
        marginal_product = (both + src_only) * (both + tar_only)

        # With n = a+b+c+d, a*n - (a+b)(a+c) equals ad - bc, and the
        # denominator below equals half of n^2 - a^2 + b^2 + c^2 - d^2,
        # so the quotient matches the formula in the class docstring.
        numerator = both * population - marginal_product
        if not numerator:
            return 0.0
        return numerator / (
            population * (both + src_only + tar_only) - marginal_product
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Gilbert similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` mapped
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert similarity

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.sim('cat', 'hat')
        0.6655290102389079
        >>> cmp.sim('Niall', 'Neil')
        0.6094506120125249
        >>> cmp.sim('aluminum', 'Catalan')
        0.5285474055092889
        >>> cmp.sim('ATCG', 'TAGC')
        0.4984005118362124


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
| 186 | ||
| 187 | ||
| 188 | if __name__ == '__main__': |
|
| @@ 30-178 (lines=149) @@ | ||
| 27 | __all__ = ['MaxwellPilliner'] |
|
| 28 | ||
| 29 | ||
class MaxwellPilliner(_TokenDistance):
    r"""Maxwell & Pilliner correlation.

    Given two sets X and Y and a population N, Maxwell & Pilliner correlation
    :cite:`Maxwell:1968` is

        .. math::

            corr_{MaxwellPilliner}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{MaxwellPilliner} =
            \frac{2(ad-bc)}{(a+b)(c+d)+(a+c)(b+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize MaxwellPilliner instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner correlation

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.35921989956790845
        >>> cmp.corr('aluminum', 'Catalan')
        0.10803030303030303
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Confusion-table cells a, b, c, d of the class docstring.
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        # ad - bc; a zero value yields 0.0 directly, skipping the division.
        cross_diff = both * neither - src_only * tar_only
        if not cross_diff:
            return 0.0

        # (a+b)(c+d) + (a+c)(b+d)
        marginal_sum = (both + src_only) * (tar_only + neither) + (
            both + tar_only
        ) * (src_only + neither)
        return 2 * cross_diff / marginal_sum

    def sim(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` mapped
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner similarity

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6796099497839543
        >>> cmp.sim('aluminum', 'Catalan')
        0.5540151515151515
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
| 179 | ||
| 180 | ||
| 181 | if __name__ == '__main__': |
|