Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._cao.Cao.sim()   C

Complexity

Conditions 9

Size

Total Lines 84
Code Lines 46

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 9

Importance

Changes 0
Metric Value
eloc 46
dl 0
loc 84
ccs 30
cts 30
cp 1
rs 6.4339
c 0
b 0
f 0
cc 9
nop 3
crap 9

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._cao.
20
21
Cao's CY dissimilarity.
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from math import log10
32
33 1
from ._token_distance import _TokenDistance
34
35 1
__all__ = ['Cao']
36
37
38 1
class Cao(_TokenDistance):
39
    r"""Cao's CY dissimilarity.
40
41
    Given :math:`X_{ij}` (the number of individuals of speecies :math:`j` in
42
    sample :math:`i`), :math:`X_{kj}` (the number of individuals of speecies
43
    :math:`j` in sample :math:`k`), and :math:`N` (the total number of speecies
44
    present in both samples), Cao dissimilarity (CYd) :cite:`Cao:1997` is:
45
46
    .. math::
47
48
        dist_{Cao}(X, Y) =
49
        CYd = \frac{1}{N}\sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
50
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
51
        {X_{ij}+X_{kj}}\Bigg)
52
53
    In the above formula, whenever :math:`X_{ij} = 0` or :math:`X_{kj} = 0`,
54
    the value 0.1 is substituted.
55
56
    Since this measure ranges from 0 to :math:`\infty`, a similarity measure,
57
    CYs, ranging from 0 to 1 was also developed.
58
59
    .. math::
60
61
        sim_{Cao}(X, Y) = CYs = 1 - \frac{Observed~CYd}{Maximum~CYd}
62
63
    where
64
65
    .. math::
66
67
        Observed~CYd = \sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
68
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
69
        {X_{ij}+X_{kj}}\Bigg)
70
71
    and with :math:`a` (the number of species present in both samples),
72
    :math:`b` (the number of species present in sample :math:`i` only), and
73
    :math:`c` (the number of species present in sample :math:`j` only),
74
75
    .. math::
76
77
        Maximum~CYd = D_1 + D_2 + D_3
78
79
    with
80
81
    .. math::
82
83
        D_1 = \sum_{j=1}^b \Bigg(\frac{(X_{ij} + 0.1) log_{10} \big(
84
        \frac{X_{ij}+0.1}{2}\big)-X_{ij}log_{10}0.1-0.1log_{10}X_{ij}}
85
        {X_{ij}+0.1}\Bigg)
86
87
        D_2 = \sum_{j=1}^c \Bigg(\frac{(X_{kj} + 0.1) log_{10} \big(
88
        \frac{X_{kj}+0.1}{2}\big)-X_{kj}log_{10}0.1-0.1log_{10}X_{kj}}
89
        {X_{kj}+0.1}\Bigg)
90
91
        D_1 = \sum_{j=1}^a \frac{a}{2} \Bigg(\frac{(D_i + 1) log_{10}
92
        \big(\frac{D_i+1}{2}\big)-log_{10}D_i}{D_i+1} + \frac{(D_k + 1) log_{10}
93
        \big(\frac{D_k+1}{2}\big)-log_{10}D_k}{D_k+1}\Bigg)
94
95
    with
96
97
    .. math::
98
99
        D_i = \frac{\sum X_{ij} - \frac{a}{2}}{\frac{a}{2}}
100
101
        D_k = \frac{\sum X_{kj} - \frac{a}{2}}{\frac{a}{2}}
102
103
    for
104
105
    .. math::
106
107
        X_{ij} \geq 1
108
109
        X_{kj} \geq 1
110
111
    .. versionadded:: 0.4.1
112
    """
113
114 1
    def __init__(self, **kwargs):
115
        """Initialize Cao instance.
116
117
        Parameters
118
        ----------
119
        **kwargs
120
            Arbitrary keyword arguments
121
122
123
        .. versionadded:: 0.4.1
124
125
        """
126 1
        super(Cao, self).__init__(**kwargs)
127
128 1
    def sim(self, src, tar):
129
        """Return Cao's CY similarity (CYs) of two strings.
130
131
        Parameters
132
        ----------
133
        src : str
134
            Source string for comparison
135
        tar : str
136
            Target string for comparison
137
138
        Returns
139
        -------
140
        float
141
            Cao's CY similarity
142
143
        Examples
144
        --------
145
        >>> cmp = Cao()
146
        >>> cmp.sim('cat', 'hat')
147
        0.0
148
        >>> cmp.sim('Niall', 'Neil')
149
        0.0
150
        >>> cmp.sim('aluminum', 'Catalan')
151
        0.0
152
        >>> cmp.sim('ATCG', 'TAGC')
153
        0.0
154
155
156
        .. versionadded:: 0.4.1
157
158
        """
159 1
        if src == tar:
160 1
            return 1.0
161 1
        if not src or not tar:
162 1
            return 0.0
163
164 1
        self._tokenize(src, tar)
165
166 1
        alphabet = self._total().keys()
167 1
        in_both_samples_half = len(self._intersection().keys()) / 2
168 1
        if not in_both_samples_half:
169 1
            return 0.0
170
171 1
        observed_cyd = 0
172 1
        maximum_cyd = 0
173 1
        for symbol in alphabet:
174 1
            src_tok = max(0.1, self._src_tokens[symbol])
175 1
            tar_tok = max(0.1, self._tar_tokens[symbol])
176 1
            tok_sum = src_tok + tar_tok
177 1
            observed_cyd += (
178
                tok_sum * log10(tok_sum / 2)
179
                - src_tok * log10(tar_tok)
180
                - tar_tok * log10(src_tok)
181
            ) / tok_sum
182
183 1
            if self._tar_tokens[symbol] == 0:
184 1
                maximum_cyd += (
185
                    (self._src_tokens[symbol] + 0.1)
186
                    * log10((self._src_tokens[symbol] + 0.1) / 2)
187
                    - self._src_tokens[symbol] * log10(0.1)
188
                    - 0.1 * log10(self._src_tokens[symbol])
189
                ) / (self._src_tokens[symbol] + 0.1)
190 1
            elif self._src_tokens[symbol] == 0:
191 1
                maximum_cyd += (
192
                    (self._tar_tokens[symbol] + 0.1)
193
                    * log10((self._tar_tokens[symbol] + 0.1) / 2)
194
                    - self._tar_tokens[symbol] * log10(0.1)
195
                    - 0.1 * log10(self._tar_tokens[symbol])
196
                ) / (self._tar_tokens[symbol] + 0.1)
197
198 1
        d_i = 0
199 1
        d_k = 0
200 1
        for symbol in self._intersection().keys():
201 1
            d_i += self._src_tokens[symbol]
202 1
            d_k += self._tar_tokens[symbol]
203 1
        d_i = (d_i - in_both_samples_half) / in_both_samples_half
204 1
        d_k = (d_k - in_both_samples_half) / in_both_samples_half
205
206 1
        maximum_cyd += in_both_samples_half * (
207
            ((d_i + 1) * log10((d_i + 1) / 2) - log10(d_i)) / (d_i + 1)
208
            + ((d_k + 1) * log10((d_k + 1) / 2) - log10(d_k)) / (d_k + 1)
209
        )
210
211 1
        return max(0.0, min(1.0, 1 - (observed_cyd / maximum_cyd)))
212
213 1
    def dist_abs(self, src, tar):
214
        """Return Cao's CY dissimilarity (CYd) of two strings.
215
216
        Parameters
217
        ----------
218
        src : str
219
            Source string for comparison
220
        tar : str
221
            Target string for comparison
222
223
        Returns
224
        -------
225
        float
226
            Cao's CY dissimilarity
227
228
        Examples
229
        --------
230
        >>> cmp = Cao()
231
        >>> cmp.dist_abs('cat', 'hat')
232
        0.3247267992925765
233
        >>> cmp.dist_abs('Niall', 'Neil')
234
        0.4132886536450973
235
        >>> cmp.dist_abs('aluminum', 'Catalan')
236
        0.5530666041976232
237
        >>> cmp.dist_abs('ATCG', 'TAGC')
238
        0.6494535985851531
239
240
241
        .. versionadded:: 0.4.1
242
243
        """
244 1
        if src == tar:
245 1
            return 0.0
246
247 1
        self._tokenize(src, tar)
248
249 1
        alphabet = self._total().keys()
250
251 1
        score = 0
252 1
        for symbol in alphabet:
253 1
            src_tok = max(0.1, self._src_tokens[symbol])
254 1
            tar_tok = max(0.1, self._tar_tokens[symbol])
255 1
            tok_sum = src_tok + tar_tok
256 1
            score += (
257
                tok_sum * log10(tok_sum / 2)
258
                - src_tok * log10(tar_tok)
259
                - tar_tok * log10(src_tok)
260
            ) / tok_sum
261
262 1
        return score / sum(self._total().values())
263
264
265
if __name__ == '__main__':
266
    import doctest
267
268
    doctest.testmod()
269