Passed
Push — master ( 416c2f...9ec382 )
by Chris
01:03 queued 13s
created

abydos.distance._soft_cosine.SoftCosine.__init__()   A

Complexity

Conditions 3

Size

Total Lines 56
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 3

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 56
ccs 6
cts 6
cp 1
rs 9.85
c 0
b 0
f 0
cc 3
nop 5
crap 3

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._soft_cosine.
18
19 1
Soft Cosine similarity & distance
20
"""
21
22
from typing import Any, Optional, cast
23
24 1
from ._distance import _Distance
25
from ._levenshtein import Levenshtein
26
from ._token_distance import _TokenDistance
27
from ..tokenizer import _Tokenizer
28
29
__all__ = ['SoftCosine']
30
31 1
32 1
class SoftCosine(_TokenDistance):
    r"""Soft Cosine similarity.

    As described in :cite:`Sidorov:2014`, soft cosine similarity of two
    multi-sets X and Y, drawn from an alphabet S, is

        .. math::

            sim_{soft cosine}(X, Y) =
            \frac{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i Y_j}
            {\sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} X_i X_j}
            \sqrt{\sum_{i \in S}\sum_{j \in S} s_{ij} Y_i Y_j}}

    where :math:`s_{ij}` is the similarity of two tokens, by default a function
    of Levenshtein distance: :math:`\frac{1}{1+Levenshtein\_distance(i, j)}`.

    Notes
    -----
    This class implements soft cosine similarity, as defined by
    :cite:`Sidorov:2014`. An alternative formulation of soft cosine similarity
    using soft (multi-)sets is provided by the :class:`Cosine` class using
    intersection_type=``soft``, based on the soft intersection
    defined in :cite:`Russ:2014`.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        metric: Optional[_Distance] = None,
        sim_method: str = 'a',
        **kwargs: Any
    ) -> None:
        r"""Initialize SoftCosine instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package, defaulting to the QGrams tokenizer with q=4
        metric : _Distance
            A distance instance from the abydos.distance package, defaulting
            to Levenshtein distance
        sim_method : str
            Selects the similarity method from the four given in
            :cite:`Sidorov:2014`:

                - ``a`` : :math:`\frac{1}{1+d}`
                - ``b`` : :math:`1-\frac{d}{m}`
                - ``c`` : :math:`\sqrt{1-\frac{d}{m}}`
                - ``d`` : :math:`\Big(1-\frac{d}{m}\Big)^2`

            Where :math:`d` is the distance (Levenshtein by default) and
            :math:`m` is the maximum length of the two tokens. Option `a` is
            default, as suggested by the paper.
        **kwargs
            Arbitrary keyword arguments, forwarded unchanged to the parent
            class initializer

        Raises
        ------
        ValueError
            sim_method must be one of 'a', 'b', 'c', or 'd'

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        threshold : float
            Documented upstream as the minimum similarity for a pair of
            tokens to contribute to similarity. It is not a named parameter
            of this initializer and is not read by this class's own methods;
            if supplied, it is only forwarded via ``**kwargs``.


        .. versionadded:: 0.4.0

        """
        super().__init__(tokenizer, **kwargs)
        self.params['metric'] = metric if metric is not None else Levenshtein()
        if sim_method not in {'a', 'b', 'c', 'd'}:
            raise ValueError("sim_method must be one of 'a', 'b', 'c', or 'd'")
        self.params['sim_method'] = sim_method

    def _length_normalized_dist(self, src: str, tar: str) -> float:
        """Return the metric's absolute distance over the longer token length.

        Shared by similarity methods ``b``, ``c`` and ``d``. Two empty tokens
        yield 0.0 rather than raising ZeroDivisionError.
        """
        longest = max(len(src), len(tar))
        if not longest:
            # Both tokens are empty, so there is nothing to normalize by;
            # treat them as identical.
            return 0.0
        return cast(float, self.params['metric'].dist_abs(src, tar)) / longest

    def _sim_a(self, src: str, tar: str) -> float:
        """Return token similarity ``a``: 1 / (1 + d)."""
        return 1 / (1 + cast(float, self.params['metric'].dist_abs(src, tar)))

    def _sim_b(self, src: str, tar: str) -> float:
        """Return token similarity ``b``: 1 - d/m."""
        return 1 - self._length_normalized_dist(src, tar)

    def _sim_c(self, src: str, tar: str) -> float:
        """Return token similarity ``c``: sqrt(1 - d/m).

        The radicand is clamped at 0.0: a metric whose absolute distance
        exceeds the longer token length would otherwise make ``** 0.5``
        produce a complex number instead of a float.
        """
        return max(0.0, 1 - self._length_normalized_dist(src, tar)) ** 0.5

    def _sim_d(self, src: str, tar: str) -> float:
        """Return token similarity ``d``: (1 - d/m) squared."""
        return (1 - self._length_normalized_dist(src, tar)) ** 2

    @staticmethod
    def _pairwise_weighted_sum(left: Any, right: Any, sim_func: Any) -> float:
        """Sum count * count * similarity over every token pair.

        ``left`` and ``right`` are token-to-count mappings and ``sim_func``
        is a callable taking two tokens and returning their similarity.
        """
        total = 0.0
        for left_token, left_count in left.items():
            for right_token, right_count in right.items():
                total += (
                    left_count * right_count * sim_func(left_token, right_token)
                )
        return total

    def sim(self, src: str, tar: str) -> float:
        r"""Return the Soft Cosine similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Soft Cosine similarity; 1.0 when ``src == tar`` and 0.0 when
            either input tokenizes to nothing

        Examples
        --------
        >>> cmp = SoftCosine()
        >>> cmp.sim('cat', 'hat')
        0.8750000000000001
        >>> cmp.sim('Niall', 'Neil')
        0.8844691709074513
        >>> cmp.sim('aluminum', 'Catalan')
        0.831348688760277
        >>> cmp.sim('ATCG', 'TAGC')
        0.8571428571428572


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        if not self._src_card() or not self._tar_card():
            return 0.0

        # Resolve the token-similarity function once, not per token pair.
        sim_func = {
            'a': self._sim_a,
            'b': self._sim_b,
            'c': self._sim_c,
            'd': self._sim_d,
        }[self.params['sim_method']]

        src_tokens = self._src_tokens
        tar_tokens = self._tar_tokens

        # Numerator: cross term between source and target token counts.
        nom = self._pairwise_weighted_sum(src_tokens, tar_tokens, sim_func)
        # Denominator: each side's soft norm (self cross term).
        denom_left = self._pairwise_weighted_sum(
            src_tokens, src_tokens, sim_func
        )
        denom_right = self._pairwise_weighted_sum(
            tar_tokens, tar_tokens, sim_func
        )

        return nom / (denom_left ** 0.5 * denom_right ** 0.5)
219
220
221
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
225