Completed
Pull Request — master (#257)
by Chris
07:41
created

abydos.distance._flexmetric.FlexMetric.__init__()   B

Complexity

Conditions 3

Size

Total Lines 71
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 3

Importance

Changes 0
Metric Value
eloc 37
dl 0
loc 71
ccs 10
cts 10
cp 1
rs 8.9919
c 0
b 0
f 0
cc 3
nop 5
crap 3

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._flexmetric.

FlexMetric distance
"""
21
22
from typing import Any, Callable, Collection, List, Optional, Tuple, cast
23
24 1
from numpy import float_ as np_float
25
from numpy import zeros as np_zeros
26
27
from ._distance import _Distance
28
29
# Names exported when this module is star-imported.
__all__ = ['FlexMetric']
30
31 1
32 1
class FlexMetric(_Distance):
    r"""FlexMetric distance.

    FlexMetric distance :cite:`Kempken:2005`

    A weighted edit distance in which the cost of an insertion, deletion or
    substitution depends on the letters involved. Costs are looked up in
    ordered tables of ``(letters, cost)`` pairs; the first matching entry
    wins, and any edit not covered by a table costs 1.0.

    .. versionadded:: 0.4.0
    """

    # Default insertion/deletion costs, ordered by ascending cost.
    _DEFAULT_INDEL_COSTS: Tuple[Tuple[Collection[str], float], ...] = (
        (frozenset('dtch'), 0.4),
        (frozenset('e'), 0.5),
        (frozenset('u'), 0.9),
        (frozenset('rpn'), 0.95),
    )

    # Default substitution costs per letter class, ordered by ascending cost.
    _DEFAULT_SUBST_COSTS: Tuple[Tuple[Collection[str], float], ...] = (
        (frozenset('szß'), 0.1),
        (frozenset('dt'), 0.1),
        (frozenset('iy'), 0.1),
        (frozenset('ckq'), 0.1),
        (frozenset('eä'), 0.1),
        (frozenset('uüv'), 0.1),
        (frozenset('iü'), 0.1),
        (frozenset('fv'), 0.1),
        (frozenset('zc'), 0.1),
        (frozenset('ij'), 0.1),
        (frozenset('bp'), 0.1),
        (frozenset('eoö'), 0.2),
        (frozenset('aä'), 0.2),
        (frozenset('mbp'), 0.4),
        (frozenset('uw'), 0.4),
        (frozenset('uo'), 0.8),
        (frozenset('aeiouy'), 0.9),
    )

    def __init__(
        self,
        normalizer: Callable[[List[float]], float] = max,
        indel_costs: Optional[List[Tuple[Collection[str], float]]] = None,
        subst_costs: Optional[List[Tuple[Collection[str], float]]] = None,
        **kwargs: Any
    ) -> None:
        """Initialize FlexMetric instance.

        Parameters
        ----------
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        indel_costs : list of tuples
            A list of insertion and deletion costs. Each list element should
            be a tuple consisting of an iterable (sets are best) and a float
            value. The iterable consists of those letters whose insertion
            or deletion has a cost equal to the float value.
        subst_costs : list of tuples
            A list of substitution costs. Each list element should
            be a tuple consisting of an iterable (sets are best) and a float
            value. The iterable consists of the letters in each letter class,
            which may be substituted for each other at cost equal to the float
            value.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super().__init__(**kwargs)
        self._normalizer = normalizer

        def _get_second(s: Tuple[Collection[str], float]) -> float:
            return s[1]

        # Caller-supplied tables are sorted by cost so that, when a letter
        # appears in several entries, the cheapest one is found first. The
        # built-in defaults are already in that order and are only copied.
        self._indel_costs: List[Tuple[Collection[str], float]] = (
            list(self._DEFAULT_INDEL_COSTS)
            if indel_costs is None
            else sorted(indel_costs, key=_get_second)
        )
        self._subst_costs: List[Tuple[Collection[str], float]] = (
            list(self._DEFAULT_SUBST_COSTS)
            if subst_costs is None
            else sorted(subst_costs, key=_get_second)
        )

    def _indel_cost(self, word: str, pos: int) -> float:
        """Return the cost of inserting or deleting ``word[pos]``."""
        # A letter that merely repeats its predecessor is free to add/drop.
        if pos > 0 and word[pos - 1] == word[pos]:
            return 0.0
        letter = word[pos]
        for letters, cost in self._indel_costs:
            if letter in letters:
                return cost
        return 1.0

    def _cost(self, src: str, s_pos: int, tar: str, t_pos: int) -> float:
        """Return the cost of a single edit operation.

        Parameters
        ----------
        src : str
            Source string
        s_pos : int
            Index into ``src``, or -1 to signal an insertion of ``tar[t_pos]``
        tar : str
            Target string
        t_pos : int
            Index into ``tar``, or -1 to signal a deletion of ``src[s_pos]``

        Returns
        -------
        float
            The insertion, deletion or substitution cost; 1.0 when no table
            entry applies

        """
        if s_pos == -1:
            return self._indel_cost(tar, t_pos)
        if t_pos == -1:
            return self._indel_cost(src, s_pos)

        src_letter = src[s_pos]
        tar_letter = tar[t_pos]
        for letters, cost in self._subst_costs:
            if src_letter in letters and tar_letter in letters:
                return cost
        return 1.0

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the FlexMetric distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            FlexMetric distance

        Examples
        --------
        >>> cmp = FlexMetric()
        >>> cmp.dist_abs('cat', 'hat')
        0.8
        >>> cmp.dist_abs('Niall', 'Neil')
        1.5
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.1999999999999997


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0

        src_len = len(src)
        tar_len = len(tar)

        if not src:
            return float(
                sum(self._cost('', -1, tar, j) for j in range(tar_len))
            )
        if not tar:
            return float(
                sum(self._cost(src, i, '', -1) for i in range(src_len))
            )

        # The builtin float dtype is NumPy's double-precision float.
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=float)

        # NOTE(review): the border row/column is costed on the strings as
        # given, whereas the interior cells below use lowercased copies.
        # Kept as-is so that existing results do not change.
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1)

        src_lc = src.lower()
        tar_lc = tar.lower()

        # Insertion cost depends only on the target position and deletion
        # cost only on the source position, so neither needs recomputing
        # for every cell.
        ins_costs = [self._cost('', -1, tar_lc, j) for j in range(tar_len)]

        for i in range(src_len):
            del_cost = self._cost(src_lc, i, '', -1)
            for j in range(tar_len):
                # Equality is tested on the original characters; only a
                # mismatch is priced, using the lowercased letters.
                if src[i] == tar[j]:
                    sub_cost = 0.0
                else:
                    sub_cost = self._cost(src_lc, i, tar_lc, j)
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + ins_costs[j],  # ins
                    d_mat[i, j + 1] + del_cost,  # del
                    d_mat[i, j] + sub_cost,  # sub/==
                )

        # Convert the NumPy scalar to a builtin float so the value matches
        # the annotated return type and prints as a plain number.
        return float(d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized FlexMetric distance of two strings.

        The absolute distance is divided by the value the configured
        normalizer returns for the two string lengths.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized FlexMetric distance

        Examples
        --------
        >>> cmp = FlexMetric()
        >>> cmp.dist('cat', 'hat')
        0.26666666666666666
        >>> cmp.dist('Niall', 'Neil')
        0.3
        >>> cmp.dist('aluminum', 'Catalan')
        0.8375
        >>> cmp.dist('ATCG', 'TAGC')
        0.5499999999999999


        .. versionadded:: 0.4.0

        """
        score = self.dist_abs(src, tar)
        # A zero score short-circuits so the normalizer is never asked to
        # divide by the lengths of two empty strings.
        if score:
            return score / self._normalizer([len(src), len(tar)])
        return 0.0
235 1
236 1
237 1
if __name__ == '__main__':
    # Executed as a script: run the examples embedded in the docstrings.
    from doctest import testmod

    testmod()
241