abydos.distance._chao_jaccard.ChaoJaccard.sim() - Code Metrics - Inspection of "Merge pull request #240 from chrislit/0.4.1" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 643512...2b6b3e )

by Chris

created 2020-01-06 02:41 UTC

abydos.distance._chao_jaccard.ChaoJaccard.sim() A

↳ Parent: abydos.distance._chao_jaccard

Complexity

Conditions

Size

Total Lines	34
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
eloc	2
dl	0
loc	34
ccs	2
cts	2
cp	1
rs	10
c	0
b	0
f	0
cc	1
nop	3
crap	1

# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._chao_jaccard.

Chao's Jaccard similarity
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter

try:
    from random import choices
except ImportError:  # pragma: no cover
    from random import choice

    def choices(population, k=1):
        """Stand-in for random.choices on Python < 3.6.

        Draws k items from population, one at a time and with replacement;
        weighting is not supported by this fallback.
        """
        drawn = []
        for _ in range(k):
            drawn.append(choice(population))
        return drawn



from ._token_distance import _TokenDistance

__all__ = ['ChaoJaccard']


class ChaoJaccard(_TokenDistance):
    r"""Chao's Jaccard similarity.

    Chao's Jaccard similarity :cite:`Chao:2004`

    The score is built from two abundance estimates, U-hat & V-hat, which
    are computed on random re-samples (drawn with replacement) of the source
    and target token lists. Results therefore vary between calls unless the
    :mod:`random` module is seeded first.

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs):
        """Initialize ChaoJaccard instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(ChaoJaccard, self).__init__(**kwargs)

    def sim(self, src, tar):
        """Return normalized Chao's Jaccard similarity of two strings.

        The raw score from :meth:`sim_score` is clamped to the range
        [0.0, 1.0].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        return max(0.0, min(1.0, self.sim_score(src, tar)))

    def sim_score(self, src, tar):
        """Return Chao's Jaccard similarity of two strings.

        The score is U*V / (U + V - U*V), where U & V are the estimates
        returned by :meth:`_get_estimates`. It is 0.0 when the two strings
        share no tokens or when either estimate is 0.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim_score('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim_score('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)
        self._intersection()

        # Nothing in common: skip the (random) estimation entirely.
        if self._intersection_card() == 0:
            return 0.0

        u_hat, v_hat = self._get_estimates(src, tar)

        num = u_hat * v_hat
        if num:
            # Both estimates lie in (0, 1] here, so the denominator
            # u + v - u*v = u + v*(1 - u) is strictly positive.
            return num / (u_hat + v_hat - num)
        return 0.0

    def _get_estimates(self, src, tar):
        """Get the estimates U-hat & V-hat used for Chao's measures.

        Both token lists are re-sampled with replacement (n draws from the
        source, m draws from the target, where n & m are the source and
        target cardinalities), and the adjusted estimators are computed on
        the tokens shared by the two samples::

            U-hat = sum(X_i/n) + (m-1)/m * f+1/(2*f+2) * sum(X_i/n * [Y_i=1])
            V-hat = sum(Y_i/m) + (n-1)/n * f1+/(2*f2+) * sum(Y_i/m * [X_i=1])

        where X_i & Y_i are the sampled counts of shared token i. A zero
        f+2 or f2+ is replaced by 1, and an estimate greater than 1 is
        replaced by 1. Without that cap the correction term can push an
        estimate above 1, which makes U + V - U*V zero or negative in the
        similarity formula.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, float)
            The estimates U-hat & V-hat, each in the range [0.0, 1.0]

        .. versionadded:: 0.4.1

        """
        src_card = self._src_card()  # n
        tar_card = self._tar_card()  # m

        tokenizer = self.params['tokenizer']
        src_token_list = tokenizer.tokenize(src).get_list()
        tar_token_list = tokenizer.tokenize(tar).get_list()

        # Source is sampled before target so that seeded runs stay
        # reproducible.
        src_sampled = Counter(choices(src_token_list, k=src_card))
        tar_sampled = Counter(choices(tar_token_list, k=tar_card))
        shared = src_sampled & tar_sampled

        # No shared tokens in the samples: every sum below would be zero.
        # A non-empty intersection also guarantees that both cardinalities
        # are at least 1, so the divisions below are safe.
        if not shared:
            return 0.0, 0.0

        # Every token in `shared` occurs at least once in both samples, so
        # only the singleton/doubleton condition on one side needs testing.
        # f_1_plus / f_2_plus: shared tokens seen once / twice in source.
        f_1_plus = sum(src_sampled[tok] == 1 for tok in shared)
        f_2_plus = sum(src_sampled[tok] == 2 for tok in shared) or 1
        # f_plus_1 / f_plus_2: shared tokens seen once / twice in target.
        f_plus_1 = sum(tar_sampled[tok] == 1 for tok in shared)
        f_plus_2 = sum(tar_sampled[tok] == 2 for tok in shared) or 1

        u_hat = sum(src_sampled[tok] / src_card for tok in shared)
        u_hat += (
            (tar_card - 1)
            / tar_card
            * f_plus_1
            / (2 * f_plus_2)
            * sum(
                src_sampled[tok] / src_card
                for tok in shared
                if tar_sampled[tok] == 1
            )
        )

        v_hat = sum(tar_sampled[tok] / tar_card for tok in shared)
        v_hat += (
            (src_card - 1)
            / src_card
            * f_1_plus
            / (2 * f_2_plus)
            * sum(
                tar_sampled[tok] / tar_card
                for tok in shared
                if src_sampled[tok] == 1
            )
        )

        # Cap the estimates at 1 (see docstring).
        return min(1.0, u_hat), min(1.0, v_hat)


if __name__ == '__main__':
    # Run the examples embedded in this module's docstrings when the file
    # is executed directly.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2019 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._chao_jaccard.
20
21		Chao's Jaccard similarity
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from collections import Counter
32
33	1	try:
34	1	from random import choices
35		except ImportError: # pragma: no cover
36		from random import choice
37
38		def choices(population, k=1):
39		"""Quick implementation of choices for Python < 3.6."""
40		return [choice(population) for _ in range(k)]
		Issue (0 ignored) — Comprehensibility / Best Practice, introduced 2020-01-06 02:51 UTC: The variable `choice` does not seem to be defined. (False positive: `choice` is imported by `from random import choice` on listing line 36, inside the same `except ImportError` fallback.)
41
42
43	1	from ._token_distance import _TokenDistance
44
45	1	__all__ = ['ChaoJaccard']
46
47
48	1	class ChaoJaccard(_TokenDistance):
49		r"""Chao's Jaccard similarity.
50
51		Chao's Jaccard similarity :cite:`Chao:2004`
52
53		.. versionadded:: 0.4.1
54		"""
55
56	1	def __init__(self, **kwargs):
57		"""Initialize ChaoJaccard instance.
58
59		Parameters
60		----------
61		**kwargs
62		Arbitrary keyword arguments
63
64
65		.. versionadded:: 0.4.1
66
67		"""
68	1	super(ChaoJaccard, self).__init__(**kwargs)
69
70	1	def sim(self, src, tar):
71		"""Return normalized Chao's Jaccard similarity of two strings.
72
73		Parameters
74		----------
75		src : str
76		Source string for comparison
77		tar : str
78		Target string for comparison
79
80		Returns
81		-------
82		float
83		Normalized Chao's Jaccard similarity
84
85		Examples
86		--------
87		>>> import random
88		>>> random.seed(0)
89		>>> cmp = ChaoJaccard()
90		>>> cmp.sim('cat', 'hat')
91		0.22448979591836735
92		>>> cmp.sim('Niall', 'Neil')
93		0.1619047619047619
94		>>> cmp.sim('aluminum', 'Catalan')
95		0.0
96		>>> cmp.sim('ATCG', 'TAGC')
97		0.0
98
99
100		.. versionadded:: 0.4.1
101
102		"""
103	1	return max(0.0, min(1.0, self.sim_score(src, tar)))
104
105	1	def sim_score(self, src, tar):
106		"""Return Chao's Jaccard similarity of two strings.
107
108		Parameters
109		----------
110		src : str
111		Source string for comparison
112		tar : str
113		Target string for comparison
114
115		Returns
116		-------
117		float
118		Chao's Jaccard similarity
119
120		Examples
121		--------
122		>>> import random
123		>>> random.seed(0)
124		>>> cmp = ChaoJaccard()
125		>>> cmp.sim_score('cat', 'hat')
126		0.22448979591836735
127		>>> cmp.sim_score('Niall', 'Neil')
128		0.1619047619047619
129		>>> cmp.sim_score('aluminum', 'Catalan')
130		0.0
131		>>> cmp.sim_score('ATCG', 'TAGC')
132		0.0
133
134
135		.. versionadded:: 0.4.1
136
137		"""
138	1	self._tokenize(src, tar)
139	1	self._intersection()
140
141	1	if self._intersection_card() == 0:
142	1	return 0.0
143
144	1	u_hat, v_hat = self._get_estimates(src, tar)
145
146	1	num = u_hat * v_hat
147	1	if num:
148	1	return num / (u_hat + v_hat - u_hat * v_hat)
149	1	return 0.0
150
151	1	def _get_estimates(self, src, tar):
152		"""Get the estimates U-hat & V-hat used for Chao's measures.
153
154		Parameters
155		----------
156		src : str
157		Source string for comparison
158		tar : str
159		Target string for comparison
160
161		Returns
162		-------
163		tuple(float, float)
164		The estimates U-hat & V-hat
165
166		.. versionadded:: 0.4.1
167
168		"""
169	1	src_card = self._src_card() # n
170	1	tar_card = self._tar_card() # m
171
172	1	src_token_list = self.params['tokenizer'].tokenize(src).get_list()
173	1	tar_token_list = self.params['tokenizer'].tokenize(tar).get_list()
174
175	1	src_sampled = Counter(choices(src_token_list, k=src_card))
176	1	tar_sampled = Counter(choices(tar_token_list, k=tar_card))
177	1	sample_intersection = src_sampled & tar_sampled
178
179	1	f_1_plus = sum(
180		1 if src_sampled[tok] == 1 and tar_sampled[tok] >= 1 else 0
181		for tok in sample_intersection
182		)
183	1	f_2_plus = sum(
184		1 if src_sampled[tok] == 2 and tar_sampled[tok] >= 1 else 0
185		for tok in sample_intersection
186		)
187	1	if not f_2_plus:
188	1	f_2_plus = 1
189
190	1	f_plus_1 = sum(
191		1 if src_sampled[tok] >= 1 and tar_sampled[tok] == 1 else 0
192		for tok in sample_intersection
193		)
194	1	f_plus_2 = sum(
195		1 if src_sampled[tok] >= 1 and tar_sampled[tok] == 2 else 0
196		for tok in sample_intersection
197		)
198	1	if not f_plus_2:
199	1	f_plus_2 = 1
200
201	1	u_hat = 0
202	1	if src_card:
203	1	u_hat += sum(
204		src_sampled[tok] / src_card
205		for tok in sample_intersection.keys()
206		)
207	1	if tar_card:
208	1	u_hat += (
209		(tar_card - 1)
210		/ tar_card
211		* f_plus_1
212		/ (2 * f_plus_2)
213		* sum(
214		src_sampled[tok] / src_card * (tar_sampled[tok] == 1)
215		for tok in sample_intersection.keys()
216		)
217		)
218
219	1	v_hat = 0
220	1	if tar_card:
221	1	v_hat += sum(
222		tar_sampled[tok] / tar_card
223		for tok in sample_intersection.keys()
224		)
225	1	if src_card:
226	1	v_hat += (
227		(src_card - 1)
228		/ src_card
229		* f_1_plus
230		/ (2 * f_2_plus)
231		* sum(
232		tar_sampled[tok] / tar_card * (src_sampled[tok] == 1)
233		for tok in sample_intersection.keys()
234		)
235		)
236
237	1	return u_hat, v_hat
238
239
240		if __name__ == '__main__':
241		import doctest
242
243		doctest.testmod()
244

chrislit / abydos

Push — master ( 643512...2b6b3e )

abydos.distance._chao_jaccard.ChaoJaccard.sim() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like