# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
|
1 |
|
"""abydos.distance._cao. |
20
|
|
|
|
21
|
|
|
Cao's CY dissimilarity. |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from math import log10 |
32
|
|
|
|
33
|
1 |
|
from ._token_distance import _TokenDistance |
34
|
|
|
|
35
|
1 |
|
__all__ = ['Cao'] |
36
|
|
|
|
37
|
|
|
|
38
|
1 |
|
class Cao(_TokenDistance):
    r"""Cao's CY dissimilarity.

    Given :math:`X_{ij}` (the number of individuals of species :math:`j` in
    sample :math:`i`), :math:`X_{kj}` (the number of individuals of species
    :math:`j` in sample :math:`k`), and :math:`N` (the total number of species
    present in both samples), Cao dissimilarity (CYd) :cite:`Cao:1997` is:

    .. math::

        dist_{Cao}(X, Y) =
        CYd = \frac{1}{N}\sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
        {X_{ij}+X_{kj}}\Bigg)

    In the above formula, whenever :math:`X_{ij} = 0` or :math:`X_{kj} = 0`,
    the value 0.1 is substituted.

    Since this measure ranges from 0 to :math:`\infty`, a similarity measure,
    CYs, ranging from 0 to 1 was also developed.

    .. math::

        sim_{Cao}(X, Y) = CYs = 1 - \frac{Observed~CYd}{Maximum~CYd}

    where

    .. math::

        Observed~CYd = \sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
        {X_{ij}+X_{kj}}\Bigg)

    and with :math:`a` (the number of species present in both samples),
    :math:`b` (the number of species present in sample :math:`i` only), and
    :math:`c` (the number of species present in sample :math:`j` only),

    .. math::

        Maximum~CYd = D_1 + D_2 + D_3

    with

    .. math::

        D_1 = \sum_{j=1}^b \Bigg(\frac{(X_{ij} + 0.1) log_{10} \big(
        \frac{X_{ij}+0.1}{2}\big)-X_{ij}log_{10}0.1-0.1log_{10}X_{ij}}
        {X_{ij}+0.1}\Bigg)

        D_2 = \sum_{j=1}^c \Bigg(\frac{(X_{kj} + 0.1) log_{10} \big(
        \frac{X_{kj}+0.1}{2}\big)-X_{kj}log_{10}0.1-0.1log_{10}X_{kj}}
        {X_{kj}+0.1}\Bigg)

        D_3 = \sum_{j=1}^a \frac{a}{2} \Bigg(\frac{(D_i + 1) log_{10}
        \big(\frac{D_i+1}{2}\big)-log_{10}D_i}{D_i+1} + \frac{(D_k + 1) log_{10}
        \big(\frac{D_k+1}{2}\big)-log_{10}D_k}{D_k+1}\Bigg)

    with

    .. math::

        D_i = \frac{\sum X_{ij} - \frac{a}{2}}{\frac{a}{2}}

        D_k = \frac{\sum X_{kj} - \frac{a}{2}}{\frac{a}{2}}

    for

    .. math::

        X_{ij} \geq 1

        X_{kj} \geq 1

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs):
        """Initialize Cao instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(Cao, self).__init__(**kwargs)

    @staticmethod
    def _cy_term(src_tok, tar_tok):
        """Return a single species' contribution to the CYd sum.

        This is the summand of the CYd formula for one species, with the
        0.1 substitution for an absent species already applied by the
        caller (both arguments must be positive so ``log10`` is defined).

        Parameters
        ----------
        src_tok : float
            Count of the species in the source sample (or 0.1 if absent)
        tar_tok : float
            Count of the species in the target sample (or 0.1 if absent)

        Returns
        -------
        float
            The species' term of the CYd summation


        .. versionadded:: 0.4.1

        """
        tok_sum = src_tok + tar_tok
        return (
            tok_sum * log10(tok_sum / 2)
            - src_tok * log10(tar_tok)
            - tar_tok * log10(src_tok)
        ) / tok_sum

    def sim(self, src, tar):
        """Return Cao's CY similarity (CYs) of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY similarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        self._tokenize(src, tar)

        alphabet = self._total().keys()
        # a/2, where a is the number of species present in both samples;
        # if no species is shared, CYs is defined as 0.
        in_both_samples_half = len(self._intersection().keys()) / 2
        if not in_both_samples_half:
            return 0.0

        observed_cyd = 0
        maximum_cyd = 0
        for symbol in alphabet:
            observed_cyd += self._cy_term(
                max(0.1, self._src_tokens[symbol]),
                max(0.1, self._tar_tokens[symbol]),
            )

            # Species present in only one sample contribute the D_1 (tar
            # absent) and D_2 (src absent) terms of Maximum CYd, with 0.1
            # substituted for the absent count.
            if self._tar_tokens[symbol] == 0:
                maximum_cyd += self._cy_term(self._src_tokens[symbol], 0.1)
            elif self._src_tokens[symbol] == 0:
                maximum_cyd += self._cy_term(0.1, self._tar_tokens[symbol])

        # D_3 term: mean excess abundance of shared species in each sample.
        d_i = 0
        d_k = 0
        for symbol in self._intersection().keys():
            d_i += self._src_tokens[symbol]
            d_k += self._tar_tokens[symbol]
        d_i = (d_i - in_both_samples_half) / in_both_samples_half
        d_k = (d_k - in_both_samples_half) / in_both_samples_half

        maximum_cyd += in_both_samples_half * (
            ((d_i + 1) * log10((d_i + 1) / 2) - log10(d_i)) / (d_i + 1)
            + ((d_k + 1) * log10((d_k + 1) / 2) - log10(d_k)) / (d_k + 1)
        )

        # Clamp to [0, 1] to guard against rounding outside the range.
        return max(0.0, min(1.0, 1 - (observed_cyd / maximum_cyd)))

    def dist_abs(self, src, tar):
        """Return Cao's CY dissimilarity (CYd) of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY dissimilarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.dist_abs('cat', 'hat')
        0.3247267992925765
        >>> cmp.dist_abs('Niall', 'Neil')
        0.4132886536450973
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.5530666041976232
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.6494535985851531


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        alphabet = self._total().keys()

        score = 0
        for symbol in alphabet:
            # Substitute 0.1 for any species absent from one sample.
            score += self._cy_term(
                max(0.1, self._src_tokens[symbol]),
                max(0.1, self._tar_tokens[symbol]),
            )

        return score / sum(self._total().values())
263
|
|
|
|
264
|
|
|
|
265
|
|
|
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest

    doctest.testmod()
269
|
|
|
|