# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
|
1 |
|
"""abydos.distance._cao. |
20
|
|
|
|
21
|
|
|
Cao's CY dissimilarity. |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from math import log10 |
32
|
|
|
|
33
|
1 |
|
from ._token_distance import _TokenDistance |
34
|
|
|
|
35
|
1 |
|
__all__ = ['Cao'] |
36
|
|
|
|
37
|
|
|
|
38
|
1 |
|
class Cao(_TokenDistance):
    r"""Cao's CY dissimilarity.

    Given :math:`X_{ij}` (the number of individuals of species :math:`j` in
    sample :math:`i`), :math:`X_{kj}` (the number of individuals of species
    :math:`j` in sample :math:`k`), and :math:`N` (the total number of species
    present in both samples), Cao dissimilarity (CYd) :cite:`Cao:1997` is:

    .. math::

        dist_{Cao}(X, Y) =
        CYd = \frac{1}{N}\sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
        {X_{ij}+X_{kj}}\Bigg)

    In the above formula, whenever :math:`X_{ij} = 0` or :math:`X_{kj} = 0`,
    the value 0.1 is substituted.

    Since this measure ranges from 0 to :math:`\infty`, a similarity measure,
    CYs, ranging from 0 to 1 was also developed.

    .. math::

        sim_{Cao}(X, Y) = CYs = 1 - \frac{Observed~CYd}{Maximum~CYd}

    where

    .. math::

        Observed~CYd = \sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
        \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
        {X_{ij}+X_{kj}}\Bigg)

    and with :math:`a` (the number of species present in both samples),
    :math:`b` (the number of species present in sample :math:`i` only), and
    :math:`c` (the number of species present in sample :math:`j` only),

    .. math::

        Maximum~CYd = D_1 + D_2 + D_3

    with

    .. math::

        D_1 = \sum_{j=1}^b \Bigg(\frac{(X_{ij} + 0.1) log_{10} \big(
        \frac{X_{ij}+0.1}{2}\big)-X_{ij}log_{10}0.1-0.1log_{10}X_{ij}}
        {X_{ij}+0.1}\Bigg)

        D_2 = \sum_{j=1}^c \Bigg(\frac{(X_{kj} + 0.1) log_{10} \big(
        \frac{X_{kj}+0.1}{2}\big)-X_{kj}log_{10}0.1-0.1log_{10}X_{kj}}
        {X_{kj}+0.1}\Bigg)

        D_3 = \sum_{j=1}^a \frac{a}{2} \Bigg(\frac{(D_i + 1) log_{10}
        \big(\frac{D_i+1}{2}\big)-log_{10}D_i}{D_i+1} + \frac{(D_k + 1) log_{10}
        \big(\frac{D_k+1}{2}\big)-log_{10}D_k}{D_k+1}\Bigg)

    with

    .. math::

        D_i = \frac{\sum X_{ij} - \frac{a}{2}}{\frac{a}{2}}

        D_k = \frac{\sum X_{kj} - \frac{a}{2}}{\frac{a}{2}}

    for

    .. math::

        X_{ij} \geq 1

        X_{kj} \geq 1

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs):
        """Initialize Cao instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(Cao, self).__init__(**kwargs)

    @staticmethod
    def _cy_term(src_tok, tar_tok):
        """Return a single species' contribution to the CYd sum.

        This is the summand of the CYd formula for one species, with the
        0.1 substitution for an absent species already applied by the
        caller (both arguments must be positive so ``log10`` is defined).

        Parameters
        ----------
        src_tok : float
            Count of the species in the source sample (or 0.1 if absent)
        tar_tok : float
            Count of the species in the target sample (or 0.1 if absent)

        Returns
        -------
        float
            The species' term of the CYd summation


        .. versionadded:: 0.4.1

        """
        tok_sum = src_tok + tar_tok
        return (
            tok_sum * log10(tok_sum / 2)
            - src_tok * log10(tar_tok)
            - tar_tok * log10(src_tok)
        ) / tok_sum

    def sim(self, src, tar):
        """Return Cao's CY similarity (CYs) of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY similarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        self._tokenize(src, tar)

        alphabet = self._total().keys()
        # a/2, where a is the number of species present in both samples;
        # if no species is shared, CYs is defined as 0.
        in_both_samples_half = len(self._intersection().keys()) / 2
        if not in_both_samples_half:
            return 0.0

        observed_cyd = 0
        maximum_cyd = 0
        for symbol in alphabet:
            observed_cyd += self._cy_term(
                max(0.1, self._src_tokens[symbol]),
                max(0.1, self._tar_tokens[symbol]),
            )

            # Species present in only one sample contribute the D_1 (tar
            # absent) and D_2 (src absent) terms of Maximum CYd, with 0.1
            # substituted for the absent count.
            if self._tar_tokens[symbol] == 0:
                maximum_cyd += self._cy_term(self._src_tokens[symbol], 0.1)
            elif self._src_tokens[symbol] == 0:
                maximum_cyd += self._cy_term(0.1, self._tar_tokens[symbol])

        # D_3 term: mean excess abundance of shared species in each sample.
        d_i = 0
        d_k = 0
        for symbol in self._intersection().keys():
            d_i += self._src_tokens[symbol]
            d_k += self._tar_tokens[symbol]
        d_i = (d_i - in_both_samples_half) / in_both_samples_half
        d_k = (d_k - in_both_samples_half) / in_both_samples_half

        maximum_cyd += in_both_samples_half * (
            ((d_i + 1) * log10((d_i + 1) / 2) - log10(d_i)) / (d_i + 1)
            + ((d_k + 1) * log10((d_k + 1) / 2) - log10(d_k)) / (d_k + 1)
        )

        # Clamp to [0, 1] to guard against rounding outside the range.
        return max(0.0, min(1.0, 1 - (observed_cyd / maximum_cyd)))

    def dist_abs(self, src, tar):
        """Return Cao's CY dissimilarity (CYd) of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY dissimilarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.dist_abs('cat', 'hat')
        0.3247267992925765
        >>> cmp.dist_abs('Niall', 'Neil')
        0.4132886536450973
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.5530666041976232
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.6494535985851531


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        alphabet = self._total().keys()

        score = 0
        for symbol in alphabet:
            # Substitute 0.1 for any species absent from one sample.
            score += self._cy_term(
                max(0.1, self._src_tokens[symbol]),
                max(0.1, self._tar_tokens[symbol]),
            )

        return score / sum(self._total().values())
263
|
|
|
|
264
|
|
|
|
265
|
|
|
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest

    doctest.testmod()
269
|
|
|
|