# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._dispersion.

Dispersion correlation
"""
||
21 | |||
22 | from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union |
||
23 | |||
24 | 1 | from ._token_distance import _TokenDistance |
|
25 | from ..tokenizer import _Tokenizer |
||
26 | |||
# Names exported by a star-import of this module.
__all__ = ['Dispersion']
||
28 | |||
29 | |||
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    Given two sets X and Y drawn from a population N, the dispersion
    correlation :cite:`IBM:2017` is defined as

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Set up a Dispersion instance.

        All arguments are passed through unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, with it, the set type. See
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            For the ``fuzzy`` variant, similarities above this value are
            counted as members of the intersection.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Compute the Dispersion correlation between two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # The four cells of the 2x2 confusion table, plus the population size.
        shared = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()
        population = self._population_unique_card()

        numerator = shared * neither - src_only * tar_only
        # Divide only for a non-zero numerator; a zero numerator yields 0.0
        # without ever using the population size as a divisor.
        if numerator != 0.0:
            return numerator / population ** 2
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Compute the Dispersion similarity between two strings.

        The similarity is the correlation from :py:meth:`corr`, shifted
        and halved: ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
180 | |||
181 | |||
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()