1 | # Copyright 2018-2020 by Christopher C. Little. |
||
2 | # This file is part of Abydos. |
||
3 | # |
||
4 | # Abydos is free software: you can redistribute it and/or modify |
||
5 | # it under the terms of the GNU General Public License as published by |
||
6 | # the Free Software Foundation, either version 3 of the License, or |
||
7 | # (at your option) any later version. |
||
8 | # |
||
9 | # Abydos is distributed in the hope that it will be useful, |
||
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
12 | # GNU General Public License for more details. |
||
13 | # |
||
14 | # You should have received a copy of the GNU General Public License |
||
15 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
16 | |||
17 | """abydos.distance._gilbert. |
||
18 | |||
19 | 1 | Gilbert correlation |
|
20 | """ |
||
21 | |||
22 | from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union |
||
23 | |||
24 | 1 | from ._token_distance import _TokenDistance |
|
25 | from ..tokenizer import _Tokenizer |
||
26 | |||
# Public API of this module: only the Gilbert class is exported.
__all__ = ['Gilbert']
||
28 | |||
29 | |||
class Gilbert(_TokenDistance):
    r"""Gilbert correlation.

    For two sets X and Y and a population N, the Gilbert correlation
    :cite:`Gilbert:1884` is

        .. math::

            corr_{Gilbert}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 -
            |(N \setminus X) \setminus Y|^2}

    For lack of access to the original, this formula is based on the concurring
    formulae presented in :cite:`Peirce:1884` and :cite:`Doolittle:1884`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Gilbert} =
            \frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Gilbert instance.

        All arguments are forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gilbert correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator of the formula is zero

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.corr('cat', 'hat')
        0.3310580204778157
        >>> cmp.corr('Niall', 'Neil')
        0.21890122402504983
        >>> cmp.corr('aluminum', 'Catalan')
        0.057094811018577836
        >>> cmp.corr('ATCG', 'TAGC')
        -0.003198976327575176


        .. versionadded:: 0.4.0

        """
        # Identical inputs are defined as perfectly correlated; this also
        # skips tokenization entirely.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a, b, c and n follow the confusion-table naming used in the class
        # docstring; d is never computed explicitly (d = n - a - b - c).
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        n = self._population_unique_card()

        # With d = n - a - b - c, the numerator expands to
        #     a*n - (a+b)*(a+c) = ad - bc
        # and the denominator below expands to
        #     n*(a+b+c) - (a+b)*(a+c) = (n^2 - a^2 + b^2 + c^2 - d^2) / 2,
        # so the ratio equals 2(ad-bc) / (n^2 - a^2 + b^2 + c^2 - d^2).
        num = a * n - (a + b) * (a + c)
        if num:
            return num / (n * (a + b + c) - (a + b) * (a + c))
        # A zero numerator yields 0.0 directly, which also avoids evaluating
        # 0/0 when a, b and c are all zero.
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Gilbert similarity of two strings.

        The similarity is the correlation from :py:meth:`corr`, linearly
        rescaled as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert similarity

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.sim('cat', 'hat')
        0.6655290102389079
        >>> cmp.sim('Niall', 'Neil')
        0.6094506120125249
        >>> cmp.sim('aluminum', 'Catalan')
        0.5285474055092889
        >>> cmp.sim('ATCG', 'TAGC')
        0.4984005118362124


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
||
186 | |||
187 | |||
if __name__ == '__main__':
    # When executed as a script, run the doctest examples found in this
    # module's docstrings.
    from doctest import testmod

    testmod()
||
192 |