@@ 30-209 (lines=180) @@ | ||
27 | __all__ = ['KoppenI'] |
|
28 | ||
29 | ||
class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    For two sets X and Y and an alphabet N, provided that :math:`|X| = |Y|`,
    Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    To support cases where :math:`|X| \neq |Y|`, this class implements a slight
    variation, while still providing the expected results when
    :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    The similarity returned by :py:meth:`sim` is this correlation rescaled
    as :math:`(1 + corr_{KoppenI}) / 2`.

    Notes
    -----
    In the usual case all of the above values should be proportional to the
    total number of samples n. I.e., a, b, c, d, & n should all be divided by
    n prior to calculating the coefficient. This class's default normalizer
    is, accordingly, 'proportional'.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        All arguments are forwarded unchanged to the parent initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer <alphabet>`
            description in :py:class:`_TokenDistance` for details.
            Defaults to ``'proportional'`` for this measure.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Equal inputs short-circuit to 1.0 before any tokenization takes
        place. Otherwise, 0.0 is returned whenever the numerator of the
        coefficient is zero.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table used in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # ((2a+b+c)/2) * ((2d+b+c)/2): appears in both the numerator and
        # the denominator of the coefficient.
        mean_product = (2 * a + b + c) * (2 * d + b + c) / 4
        numerator = mean_product - (b + c) / 2

        # A zero numerator yields 0.0 directly, which also means a zero
        # mean_product is never divided by when b + c is zero.
        if not numerator:
            return 0.0
        return numerator / mean_product

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
210 | ||
211 | ||
212 | if __name__ == '__main__': |
@@ 30-185 (lines=156) @@ | ||
27 | __all__ = ['Gilbert'] |
|
28 | ||
29 | ||
class Gilbert(_TokenDistance):
    r"""Gilbert correlation.

    For two sets X and Y and a population N, the Gilbert correlation
    :cite:`Gilbert:1884` is

        .. math::

            corr_{Gilbert}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 -
            |(N \setminus X) \setminus Y|^2}

    For lack of access to the original, this formula is based on the concurring
    formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Gilbert} =
            \frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Gilbert instance.

        All arguments are forwarded unchanged to the parent initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gilbert correlation of two strings.

        Equal inputs short-circuit to 1.0 before any tokenization takes
        place. Otherwise, 0.0 is returned whenever the numerator of the
        coefficient is zero.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert correlation

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.corr('cat', 'hat')
        0.3310580204778157
        >>> cmp.corr('Niall', 'Neil')
        0.21890122402504983
        >>> cmp.corr('aluminum', 'Catalan')
        0.057094811018577836
        >>> cmp.corr('ATCG', 'TAGC')
        -0.003198976327575176


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells and population size from the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        n = self._population_unique_card()

        # With n = a+b+c+d (as the class docstring assumes), the numerator
        # a*n - (a+b)(a+c) reduces to ad - bc, and the denominator
        # n(a+b+c) - (a+b)(a+c) is half of n^2 - a^2 + b^2 + c^2 - d^2,
        # so the ratio equals the documented 2(ad-bc)/(...) form.
        marginal_product = (a + b) * (a + c)
        numerator = a * n - marginal_product

        if not numerator:
            return 0.0
        return numerator / (n * (a + b + c) - marginal_product)

    def sim(self, src: str, tar: str) -> float:
        """Return the Gilbert similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert similarity

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.sim('cat', 'hat')
        0.6655290102389079
        >>> cmp.sim('Niall', 'Neil')
        0.6094506120125249
        >>> cmp.sim('aluminum', 'Catalan')
        0.5285474055092889
        >>> cmp.sim('ATCG', 'TAGC')
        0.4984005118362124


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
186 | ||
187 | ||
188 | if __name__ == '__main__': |
@@ 30-178 (lines=149) @@ | ||
27 | __all__ = ['MaxwellPilliner'] |
|
28 | ||
29 | ||
class MaxwellPilliner(_TokenDistance):
    r"""Maxwell & Pilliner correlation.

    For two sets X and Y and a population N, Maxwell & Pilliner correlation
    :cite:`Maxwell:1968` is

        .. math::

            corr_{MaxwellPilliner}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{MaxwellPilliner} =
            \frac{2(ad-bc)}{(a+b)(c+d)+(a+c)(b+d)}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize MaxwellPilliner instance.

        All arguments are forwarded unchanged to the parent initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner correlation of two strings.

        Returns 0.0 whenever the numerator ``ad - bc`` is zero, so the
        denominator is only evaluated for a non-zero numerator.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner correlation

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.35921989956790845
        >>> cmp.corr('aluminum', 'Catalan')
        0.10803030303030303
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table used in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        cross_diff = a * d - b * c
        if not cross_diff:
            return 0.0

        # (a+b)(c+d) + (a+c)(b+d), matching the class docstring.
        return 2 * cross_diff / ((a + b) * (c + d) + (a + c) * (b + d))

    def sim(self, src: str, tar: str) -> float:
        """Return the Maxwell & Pilliner similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` rescaled
        linearly as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Maxwell & Pilliner similarity

        Examples
        --------
        >>> cmp = MaxwellPilliner()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6796099497839543
        >>> cmp.sim('aluminum', 'Catalan')
        0.5540151515151515
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
179 | ||
180 | ||
181 | if __name__ == '__main__': |