DiscountedLevenshtein.dist()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 63
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 3

Importance

Changes 0
Metric Value
eloc 17
dl 0
loc 63
ccs 11
cts 11
cp 1
rs 9.55
c 0
b 0
f 0
cc 3
nop 3
crap 3

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# Copyright 2019-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._discounted_levenshtein.
18
19 1
Discounted Levenshtein edit distance
20
"""
21
22
from math import log
23
from typing import Any, Callable, List, Tuple, Union, cast
24 1
25
import numpy as np
26
27
from ._levenshtein import Levenshtein
28
29
__all__ = ['DiscountedLevenshtein']
30
31 1
32
class DiscountedLevenshtein(Levenshtein):
33 1
    """Discounted Levenshtein distance.
34
35 1
    This is a variant of Levenshtein distance for which edits later in a string
36
    have discounted cost, on the theory that earlier edits are less likely
37 1
    than later ones.
38
39 1
    .. versionadded:: 0.4.1
40
    """
41
42 1
    def __init__(
43
        self,
44
        mode: str = 'lev',
45
        normalizer: Callable[[List[float]], float] = max,
46
        discount_from: Union[int, str] = 1,
47
        discount_func: Union[str, Callable[[float], float]] = 'log',
48
        vowels: str = 'aeiou',
49
        **kwargs: Any
50
    ) -> None:
51
        """Initialize DiscountedLevenshtein instance.
52 1
53
        Parameters
54
        ----------
55
        mode : str
56
            Specifies a mode for computing the discounted Levenshtein distance:
57
58
                - ``lev`` (default) computes the ordinary Levenshtein distance,
59
                  in which edits may include inserts, deletes, and
60
                  substitutions
61
                - ``osa`` computes the Optimal String Alignment distance, in
62
                  which edits may include inserts, deletes, substitutions, and
63
                  transpositions but substrings may only be edited once
64
65
        normalizer : function
66
            A function that takes an list and computes a normalization term
67
            by which the edit distance is divided (max by default). Another
68
            good option is the sum function.
69
        discount_from : int or str
70
            If an int is supplied, this is the first character whose edit cost
71
            will be discounted. If the str ``coda`` is supplied, discounting
72
            will start with the first non-vowel after the first vowel (the
73
            first syllable coda).
74
        discount_func : str or function
75
            The two supported str arguments are ``log``, for a logarithmic
76
            discount function, and ``exp`` for a exponential discount function.
77
            See notes below for information on how to supply your own
78
            discount function.
79
        vowels : str
80
            These are the letters to consider as vowels when discount_from is
81
            set to ``coda``. It defaults to the English vowels 'aeiou', but
82
            it would be reasonable to localize this to other languages or to
83
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
84
        **kwargs
85
            Arbitrary keyword arguments
86
87
        Notes
88
        -----
89
        This class is highly experimental and will need additional tuning.
90
91
        The discount function can be passed as a callable function. It should
92
        expect an integer as its only argument and return a float, ideally
93
        less than or equal to 1.0. The argument represents the degree of
94
        discounting to apply.
95
96
97
        .. versionadded:: 0.4.1
98
99
        """
100
        super(DiscountedLevenshtein, self).__init__(**kwargs)
101
        self._mode = mode
102
        self._normalizer = normalizer
103
        self._discount_from = discount_from
104
        self._vowels = set(vowels.lower())
105
        if callable(discount_func):
106
            self._discount_func = discount_func
107
        elif discount_func == 'exp':
108
            self._discount_func = self._exp_discount
109
        else:
110 1
            self._discount_func = self._log_discount
111 1
112 1
    @staticmethod
113 1
    def _log_discount(discounts: float) -> float:
114 1
        return 1 / (log(1 + discounts / 5) + 1)
115 1
116 1
    @staticmethod
117 1
    def _exp_discount(discounts: float) -> float:
118 1
        return 1 / (discounts + 1) ** 0.2
119
120 1
    def _alignment_matrix(
121
        self, src: str, tar: str, backtrace: bool = True
122 1
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
123
        """Return the Levenshtein alignment matrix.
124 1
125
        Parameters
126 1
        ----------
127
        src : str
128 1
            Source string for comparison
129
        tar : str
130 1
            Target string for comparison
131
        backtrace : bool
132
            Return the backtrace matrix as well
133
134
        Returns
135
        -------
136
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
137
            The alignment matrix and (optionally) the backtrace matrix
138
139
140
        .. versionadded:: 0.4.1
141
142
        """
143
        src_len = len(src)
144
        tar_len = len(tar)
145
146
        if self._discount_from == 'coda':
147
            discount_from = [0, 0]
148
149
            src_voc = src.lower()
150
            for i in range(len(src_voc)):
151 1
                if src_voc[i] in self._vowels:
152 1
                    discount_from[0] = i
153
                    break
154 1
            for i in range(discount_from[0], len(src_voc)):
155 1
                if src_voc[i] not in self._vowels:
156
                    discount_from[0] = i
157 1
                    break
158 1
            else:
159 1
                discount_from[0] += 1
160 1
161 1
            tar_voc = tar.lower()
162 1
            for i in range(len(tar_voc)):
163 1
                if tar_voc[i] in self._vowels:
164 1
                    discount_from[1] = i
165 1
                    break
166
            for i in range(discount_from[1], len(tar_voc)):
167 1
                if tar_voc[i] not in self._vowels:
168
                    discount_from[1] = i
169 1
                    break
170 1
            else:
171 1
                discount_from[1] += 1
172 1
173 1
        elif isinstance(self._discount_from, int):
174 1
            discount_from = [self._discount_from, self._discount_from]
175 1
        else:
176 1
            discount_from = [1, 1]
177 1
178
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
179 1
        if backtrace:
180
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
181 1
        for i in range(1, src_len + 1):
182 1
            d_mat[i, 0] = d_mat[i - 1, 0] + self._discount_func(
183
                max(0, i - discount_from[0])
184 1
            )
185
            if backtrace:
186 1
                trace_mat[i, 0] = 1
187 1
        for j in range(1, tar_len + 1):
188 1
            d_mat[0, j] = d_mat[0, j - 1] + self._discount_func(
189 1
                max(0, j - discount_from[1])
190 1
            )
191
            if backtrace:
192
                trace_mat[0, j] = 0
193 1
        for i in range(src_len):
194 1
            i_extend = self._discount_func(max(0, i - discount_from[0]))
195 1
            for j in range(tar_len):
196 1
                traces = ((i + 1, j), (i, j + 1), (i, j))
197
                cost = min(
198
                    i_extend, self._discount_func(max(0, j - discount_from[1]))
199 1
                )
200 1
                opts = (
201 1
                    d_mat[traces[0]] + cost,  # ins
202 1
                    d_mat[traces[1]] + cost,  # del
203 1
                    d_mat[traces[2]]
204 1
                    + (cost if src[i] != tar[j] else 0),  # sub/==
205 1
                )
206 1
                d_mat[i + 1, j + 1] = min(opts)
207
                if backtrace:
208
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
209
210
                if self._mode == 'osa':
211
                    if (
212 1
                        i + 1 > 1
213 1
                        and j + 1 > 1
214 1
                        and src[i] == tar[j - 1]
215
                        and src[i - 1] == tar[j]
216 1
                    ):
217 1
                        # transposition
218
                        d_mat[i + 1, j + 1] = min(
219
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
220
                        )
221
                        if backtrace:
222
                            trace_mat[i + 1, j + 1] = 2
223
        if backtrace:
224 1
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 179 is False. Are you sure this can never be the case?
Loading history...
225
        return d_mat
226
227 1
    def dist_abs(self, src: str, tar: str) -> float:
228 1
        """Return the Levenshtein distance between two strings.
229 1
230 1
        Parameters
231 1
        ----------
232
        src : str
233 1
            Source string for comparison
234
        tar : str
235
            Target string for comparison
236
237
        Returns
238
        -------
239
        float (may return a float if cost has float values)
240
            The Levenshtein distance between src & tar
241
242
        Examples
243
        --------
244
        >>> cmp = DiscountedLevenshtein()
245
        >>> cmp.dist_abs('cat', 'hat')
246
        1
247
        >>> cmp.dist_abs('Niall', 'Neil')
248
        2.526064024369237
249
        >>> cmp.dist_abs('aluminum', 'Catalan')
250
        5.053867269967515
251
        >>> cmp.dist_abs('ATCG', 'TAGC')
252
        2.594032108779918
253
254
        >>> cmp = DiscountedLevenshtein(mode='osa')
255
        >>> cmp.dist_abs('ATCG', 'TAGC')
256
        1.7482385137517997
257
        >>> cmp.dist_abs('ACTG', 'TAGC')
258
        3.342270622531718
259
260
261
        .. versionadded:: 0.4.1
262
263
        """
264
        src_len = len(src)
265
        tar_len = len(tar)
266
267
        if src == tar:
268
            return 0.0
269
270 1
        if isinstance(self._discount_from, int):
271 1
            discount_from = self._discount_from
272
        else:
273 1
            discount_from = 1
274 1
275
        if not src:
276 1
            return sum(
277 1
                self._discount_func(max(0, pos - discount_from))
278
                for pos in range(tar_len)
279 1
            )
280
        if not tar:
281 1
            return sum(
282 1
                self._discount_func(max(0, pos - discount_from))
283
                for pos in range(src_len)
284
            )
285
286 1
        d_mat = cast(
287 1
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
288
        )
289
290
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
291
            return int(d_mat[src_len, tar_len])
292 1
        else:
293
            return cast(float, d_mat[src_len, tar_len])
294 1
295 1
    def dist(self, src: str, tar: str) -> float:
296
        """Return the normalized Levenshtein distance between two strings.
297 1
298
        The Levenshtein distance is normalized by dividing the Levenshtein
299 1
        distance (calculated by any of the three supported methods) by the
300
        greater of the number of characters in src times the cost of a delete
301
        and the number of characters in tar times the cost of an insert.
302
        For the case in which all operations have :math:`cost = 1`, this is
303
        equivalent to the greater of the length of the two strings src & tar.
304
305
        Parameters
306
        ----------
307
        src : str
308
            Source string for comparison
309
        tar : str
310
            Target string for comparison
311
312
        Returns
313
        -------
314
        float
315
            The normalized Levenshtein distance between src & tar
316
317
        Examples
318
        --------
319
        >>> cmp = DiscountedLevenshtein()
320
        >>> cmp.dist('cat', 'hat')
321
        0.3513958291799864
322
        >>> cmp.dist('Niall', 'Neil')
323
        0.5909885886270658
324
        >>> cmp.dist('aluminum', 'Catalan')
325
        0.8348163322045603
326
        >>> cmp.dist('ATCG', 'TAGC')
327
        0.7217609721523955
328
329
330
        .. versionadded:: 0.4.1
331
332
        """
333
        if src == tar:
334
            return 0
335
336
        if isinstance(self._discount_from, int):
337 1
            discount_from = self._discount_from
338 1
        else:
339
            discount_from = 1
340 1
341 1
        src_len = len(src)
342
        tar_len = len(tar)
343 1
344
        normalize_term = self._normalizer(
345 1
            [
346 1
                sum(
347
                    self._discount_func(max(0, pos - discount_from))
348 1
                    for pos in range(src_len)
349
                ),
350
                sum(
351
                    self._discount_func(max(0, pos - discount_from))
352
                    for pos in range(tar_len)
353
                ),
354
            ]
355
        )
356
357
        return self.dist_abs(src, tar) / normalize_term
358
359
360
if __name__ == '__main__':
361 1
    import doctest
362
363
    doctest.testmod()
364