Passed
Push — master ( 416c2f...9ec382 )
by Chris
01:03 queued 13s
created

abydos.distance._flexmetric.FlexMetric.dist_abs()   C

Complexity

Conditions 9

Size

Total Lines 64
Code Lines 26

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 9

Importance

Changes 0
Metric Value
eloc 26
dl 0
loc 64
ccs 20
cts 20
cp 1
rs 6.6666
c 0
b 0
f 0
cc 9
nop 3
crap 9

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._flexmetric.
18
19 1
FlexMetric distance
20
"""
21
22
from typing import (
23
    Any,
24 1
    Callable,
25
    FrozenSet,
26
    List,
27
    Optional,
28
    Sequence,
29
    Set,
30
    Tuple,
31 1
    Union,
32 1
    cast,
33
)
34 1
35
from numpy import float_ as np_float
36 1
from numpy import zeros as np_zeros
37
38
from ._distance import _Distance
39 1
40
__all__ = ['FlexMetric']
41
42
43
class FlexMetric(_Distance):
44
    r"""FlexMetric distance.
45
46
    FlexMetric distance :cite:`Kempken:2005`
47 1
48
    .. versionadded:: 0.4.0
49
    """
50
51
    def __init__(
52
        self,
53
        normalizer: Callable[[List[float]], float] = max,
54
        indel_costs: Optional[
55
            List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]
56
        ] = None,
57
        subst_costs: Optional[
58
            List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]
59
        ] = None,
60
        **kwargs: Any
61
    ) -> None:
62
        """Initialize FlexMetric instance.
63
64
        Parameters
65
        ----------
66
        normalizer : function
67
            A function that takes an list and computes a normalization term
68
            by which the edit distance is divided (max by default). Another
69
            good option is the sum function.
70
        indel_costs : list of tuples
71
            A list of insertion and deletion costs. Each list element should
72
            be a tuple consisting of an iterable (sets are best) and a float
73
            value. The iterable consists of those letters whose insertion
74
            or deletion has a cost equal to the float value.
75
        subst_costs : list of tuples
76 1
            A list of substitution costs. Each list element should
77 1
            be a tuple consisting of an iterable (sets are best) and a float
78
            value. The iterable consists of the letters in each letter class,
79 1
            which may be substituted for each other at cost equal to the float
80 1
            value.
81
        **kwargs
82
            Arbitrary keyword arguments
83
84
85
        .. versionadded:: 0.4.0
86
87 1
        """
88
        super(FlexMetric, self).__init__(**kwargs)
89 1
        self._normalizer = normalizer
90 1
91
        def _get_second(
92 1
            s: Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]
93 1
        ) -> float:
94
            return s[1]
95
96
        if indel_costs is None:
97
            self._indel_costs = [
98
                (frozenset('dtch'), 0.4),
99
                (frozenset('e'), 0.5),
100
                (frozenset('u'), 0.9),
101
                (frozenset('rpn'), 0.95),
102
            ]  # type: List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]  # noqa: E501
103
        else:
104
            self._indel_costs = sorted(indel_costs, key=_get_second)
105
106
        if subst_costs is None:
107
            self._subst_costs = [
108
                (frozenset('szß'), 0.1),
109
                (frozenset('dt'), 0.1),
110
                (frozenset('iy'), 0.1),
111
                (frozenset('ckq'), 0.1),
112
                (frozenset('eä'), 0.1),
113 1
                (frozenset('uüv'), 0.1),
114
                (frozenset('iü'), 0.1),
115 1
                (frozenset('fv'), 0.1),
116 1
                (frozenset('zc'), 0.1),
117 1
                (frozenset('ij'), 0.1),
118 1
                (frozenset('bp'), 0.1),
119 1
                (frozenset('eoö'), 0.2),
120 1
                (frozenset('aä'), 0.2),
121 1
                (frozenset('mbp'), 0.4),
122
                (frozenset('uw'), 0.4),
123 1
                (frozenset('uo'), 0.8),
124 1
                (frozenset('aeiouy'), 0.9),
125 1
            ]  # type: List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]  # noqa: E501
126 1
        else:
127 1
            self._subst_costs = sorted(subst_costs, key=_get_second)
128 1
129 1
    def _cost(self, src: str, s_pos: int, tar: str, t_pos: int) -> float:
130
        if s_pos == -1:
131 1
            if t_pos > 0 and tar[t_pos - 1] == tar[t_pos]:
132 1
                return 0.0
133 1
            for letter_set in self._indel_costs:
134 1
                if tar[t_pos] in letter_set[0]:
135
                    return letter_set[1]
136 1
            else:
137
                return 1.0
138 1
        elif t_pos == -1:
139
            if s_pos > 0 and src[s_pos - 1] == src[s_pos]:
140
                return 0.0
141
            for letter_set in self._indel_costs:
142
                if src[s_pos] in letter_set[0]:
143
                    return letter_set[1]
144
            else:
145
                return 1.0
146
        for letter_set in self._subst_costs:
147
            if src[s_pos] in letter_set[0] and tar[t_pos] in letter_set[0]:
148
                return letter_set[1]
149
        else:
150
            return 1.0
151
152
    def dist_abs(self, src: str, tar: str) -> float:
153
        """Return the FlexMetric distance of two strings.
154
155
        Parameters
156
        ----------
157
        src : str
158
            Source string for comparison
159
        tar : str
160
            Target string for comparison
161
162
        Returns
163
        -------
164
        float
165
            FlexMetric distance
166
167
        Examples
168
        --------
169 1
        >>> cmp = FlexMetric()
170 1
        >>> cmp.dist_abs('cat', 'hat')
171
        0.8
172 1
        >>> cmp.dist_abs('Niall', 'Neil')
173 1
        1.5
174 1
        >>> cmp.dist_abs('aluminum', 'Catalan')
175 1
        6.7
176 1
        >>> cmp.dist_abs('ATCG', 'TAGC')
177 1
        2.1999999999999997
178
179 1
180 1
        .. versionadded:: 0.4.0
181 1
182 1
        """
183 1
        src_len = len(src)
184
        tar_len = len(tar)
185 1
186 1
        if src == tar:
187
            return 0
188 1
        if not src:
189 1
            return sum(self._cost('', -1, tar, j) for j in range(len(tar)))
190 1
        if not tar:
191
            return sum(self._cost(src, i, '', -1) for i in range(len(src)))
192
193
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
194
        for i in range(1, src_len + 1):
195
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1)
196
        for j in range(1, tar_len + 1):
197
            d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1)
198
199
        src_lc = src.lower()
200
        tar_lc = tar.lower()
201 1
202
        for i in range(src_len):
203 1
            for j in range(tar_len):
204
                d_mat[i + 1, j + 1] = min(
205
                    d_mat[i + 1, j] + self._cost('', -1, tar_lc, j),  # ins
206
                    d_mat[i, j + 1] + self._cost(src_lc, i, '', -1),  # del
207
                    d_mat[i, j]
208
                    + (
209
                        self._cost(src_lc, i, tar_lc, j)
210
                        if src[i] != tar[j]
211
                        else 0
212
                    ),  # sub/==
213
                )
214
215
        return cast(float, d_mat[src_len, tar_len])
216
217
    def dist(self, src: str, tar: str) -> float:
218
        """Return the normalized FlexMetric distance of two strings.
219
220
        Parameters
221
        ----------
222
        src : str
223
            Source string for comparison
224
        tar : str
225
            Target string for comparison
226
227
        Returns
228
        -------
229
        float
230
            Normalized FlexMetric distance
231
232
        Examples
233
        --------
234 1
        >>> cmp = FlexMetric()
235 1
        >>> cmp.dist('cat', 'hat')
236 1
        0.26666666666666666
237 1
        >>> cmp.dist('Niall', 'Neil')
238
        0.3
239
        >>> cmp.dist('aluminum', 'Catalan')
240
        0.8375
241
        >>> cmp.dist('ATCG', 'TAGC')
242
        0.5499999999999999
243
244
245
        .. versionadded:: 0.4.0
246
247
        """
248
        score = self.dist_abs(src, tar)
249
        if score:
250
            return score / self._normalizer([len(src), len(tar)])
251
        return 0.0
252
253
254
if __name__ == '__main__':
255
    import doctest
256
257
    doctest.testmod()
258