Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

ChaoJaccard._get_estimates()   C

Complexity

Conditions 11

Size

Total Lines 87
Code Lines 53

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 11

Importance

Changes 0
Metric Value
eloc 53
dl 0
loc 87
ccs 27
cts 27
cp 1
rs 5.3181
c 0
b 0
f 0
cc 11
nop 3
crap 11

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.distance._chao_jaccard.ChaoJaccard._get_estimates() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._chao_jaccard.
20
21
Chao's Jaccard similarity
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import Counter
32
33 1
try:
    from random import choices
except ImportError:  # pragma: no cover
    # random.choices is unavailable on this interpreter, so provide a minimal
    # stand-in built on random.choice.
    from random import choice

    def choices(population, k=1):
        """Quick implementation of choices for Python < 3.6.

        Parameters
        ----------
        population : sequence
            The sequence to draw from
        k : int
            The number of independent draws to make

        Returns
        -------
        list
            A list of ``k`` elements, each obtained from a separate call to
            ``random.choice(population)``

        """
        return [choice(population) for _ in range(k)]
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable choice does not seem to be defined.
Loading history...
41
42
43 1
from ._token_distance import _TokenDistance
44
45 1
__all__ = ['ChaoJaccard']
46
47
48 1
class ChaoJaccard(_TokenDistance):
    r"""Chao's Jaccard similarity.

    Chao's Jaccard similarity :cite:`Chao:2004`

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs):
        """Initialize ChaoJaccard instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(ChaoJaccard, self).__init__(**kwargs)

    def sim(self, src, tar):
        """Return normalized Chao's Jaccard similarity of two strings.

        The raw score from :meth:`sim_score` is clamped to the range
        [0.0, 1.0].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        return max(0.0, min(1.0, self.sim_score(src, tar)))

    def sim_score(self, src, tar):
        """Return Chao's Jaccard similarity of two strings.

        The score is ``U*V / (U + V - U*V)``, where U and V are the
        estimates returned by :meth:`_get_estimates`. Because those
        estimates are built from random samples, results are only
        reproducible when the ``random`` module is seeded.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim_score('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim_score('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)
        # NOTE(review): the return value is discarded; the call is kept in
        # case the base class relies on it -- confirm against _TokenDistance.
        self._intersection()

        # No shared tokens at all: skip the random sampling entirely.
        if self._intersection_card() == 0:
            return 0.0

        u_hat, v_hat = self._get_estimates(src, tar)

        num = u_hat * v_hat
        if num:
            return num / (u_hat + v_hat - num)
        return 0.0

    def _get_estimates(self, src, tar):
        """Get the estimates U-hat & V-hat used for Chao's measures.

        Both strings are re-tokenized with the configured tokenizer, and
        each token list is resampled with replacement to its own
        cardinality. The two estimates are then computed symmetrically
        from the sampled token counts.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, float)
            The estimates U-hat & V-hat

        .. versionadded:: 0.4.1

        """
        src_card = self._src_card()  # n
        tar_card = self._tar_card()  # m

        tokenizer = self.params['tokenizer']
        src_token_list = tokenizer.tokenize(src).get_list()
        tar_token_list = tokenizer.tokenize(tar).get_list()

        # Draw the source sample before the target sample: the order of the
        # two random draws determines the result under a fixed seed.
        src_sampled = Counter(choices(src_token_list, k=src_card))
        tar_sampled = Counter(choices(tar_token_list, k=tar_card))
        sample_intersection = src_sampled & tar_sampled

        u_hat = self._chao_estimate(
            src_sampled, src_card, tar_sampled, tar_card, sample_intersection
        )
        v_hat = self._chao_estimate(
            tar_sampled, tar_card, src_sampled, src_card, sample_intersection
        )
        return u_hat, v_hat

    @staticmethod
    def _chao_estimate(own, own_card, other, other_card, shared):
        """Compute one of the two symmetric estimates (U-hat or V-hat).

        Parameters
        ----------
        own : Counter
            Sampled token counts of the side being estimated
        own_card : int
            Cardinality of the side being estimated
        other : Counter
            Sampled token counts of the opposite side
        other_card : int
            Cardinality of the opposite side
        shared : Counter
            Intersection of the two samples; only its keys are used

        Returns
        -------
        float or int
            The estimate; the integer 0 when both cardinalities are 0

        """
        # Every token in ``shared`` occurs at least once in both samples, so
        # only the count on the opposite side needs to be tested.
        singletons = sum(1 for tok in shared if other[tok] == 1)
        # A zero doubleton count is replaced by 1 to keep the ratio below
        # defined.
        doubletons = sum(1 for tok in shared if other[tok] == 2) or 1

        estimate = 0
        if own_card:
            estimate += sum(own[tok] / own_card for tok in shared)
        if other_card:
            estimate += (
                (other_card - 1)
                / other_card
                * singletons
                / (2 * doubletons)
                * sum(
                    own[tok] / own_card * (other[tok] == 1) for tok in shared
                )
            )
        return estimate
238
239
240
if __name__ == '__main__':
    # When the file is executed directly, run the doctests embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()
244