Levenshtein.dist_abs()   A
last analyzed

Complexity

Conditions 5

Size

Total Lines 64
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 64
ccs 20
cts 20
cp 1
rs 9.0333
c 0
b 0
f 0
cc 5
nop 3
crap 5

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._levenshtein.
18
19 1
The distance._Levenshtein module implements string edit distance functions
20
based on Levenshtein distance, including:
21
22
    - Levenshtein distance
23
    - Optimal String Alignment distance
24
"""
25
26
from sys import float_info
27
from typing import Any, Callable, List, Tuple, Union, cast
28 1
29
import numpy as np
30
31
from ._distance import _Distance
32
33
__all__ = ['Levenshtein']
34
35 1
36
class Levenshtein(_Distance):
37 1
    """Levenshtein distance.
38
39 1
    This is the standard edit distance measure. Cf.
40
    :cite:`Levenshtein:1965,Levenshtein:1966`.
41 1
42
    Optimal string alignment (aka restricted
43 1
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
44 1
45
    The ordinary Levenshtein & Optimal String Alignment distance both
46 1
    employ the Wagner-Fischer dynamic programming algorithm
47
    :cite:`Wagner:1974`.
48
49 1
    Levenshtein edit distance ordinarily has unit insertion, deletion, and
50
    substitution costs.
51
52
    .. versionadded:: 0.3.6
53
    .. versionchanged:: 0.4.0
54
        Added taper option
55
    """
56
57
    def __init__(
58
        self,
59
        mode: str = 'lev',
60
        cost: Tuple[float, float, float, float] = (1, 1, 1, 1),
61
        normalizer: Callable[[List[float]], float] = max,
62
        taper: bool = False,
63
        **kwargs: Any
64
    ) -> None:
65
        """Initialize Levenshtein instance.
66
67
        Parameters
68
        ----------
69
        mode : str
70 1
            Specifies a mode for computing the Levenshtein distance:
71
72
                - ``lev`` (default) computes the ordinary Levenshtein distance,
73
                  in which edits may include inserts, deletes, and
74
                  substitutions
75
                - ``osa`` computes the Optimal String Alignment distance, in
76
                  which edits may include inserts, deletes, substitutions, and
77
                  transpositions but substrings may only be edited once
78
79
        cost : tuple
80
            A 4-tuple representing the cost of the four possible edits:
81
            inserts, deletes, substitutions, and transpositions, respectively
82
            (by default: (1, 1, 1, 1))
83
        normalizer : function
84
            A function that takes an list and computes a normalization term
85
            by which the edit distance is divided (max by default). Another
86
            good option is the sum function.
87
        taper : bool
88
            Enables cost tapering. Following :cite:`Zobel:1996`, it causes
89
            edits at the start of the string to "just [exceed] twice the
90
            minimum penalty for replacement or deletion at the end of the
91
            string".
92
        **kwargs
93
            Arbitrary keyword arguments
94
95
96
        .. versionadded:: 0.4.0
97
98
        """
99
        super(Levenshtein, self).__init__(**kwargs)
100
        self._mode = mode
101
        self._cost = cost
102
        self._normalizer = normalizer
103
        self._taper_enabled = taper
104
105
    def _taper(self, pos: int, length: int) -> float:
106
        return (
107
            round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
108
            if self._taper_enabled
109
            else 1
110
        )
111
112 1
    def _alignment_matrix(
113 1
        self, src: str, tar: str, backtrace: bool = True
114 1
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
115 1
        """Return the Levenshtein alignment matrix.
116 1
117
        Parameters
118 1
        ----------
119 1
        src : str
120
            Source string for comparison
121
        tar : str
122
            Target string for comparison
123
        backtrace : bool
124
            Return the backtrace matrix as well
125 1
126
        Returns
127
        -------
128
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
129
            The alignment matrix and (optionally) the backtrace matrix
130
131
132
        .. versionadded:: 0.4.1
133
134
        """
135
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
136
137
        src_len = len(src)
138
        tar_len = len(tar)
139
        max_len = max(src_len, tar_len)
140
141
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
142
        if backtrace:
143
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
144
        for i in range(src_len + 1):
145
            d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
146 1
            if backtrace:
147
                trace_mat[i, 0] = 1
148 1
        for j in range(tar_len + 1):
149 1
            d_mat[0, j] = j * self._taper(j, max_len) * ins_cost
150 1
            if backtrace:
151
                trace_mat[0, j] = 0
152 1
153 1
        for i in range(src_len):
154 1
            for j in range(tar_len):
155 1
                opts = (
156 1
                    d_mat[i + 1, j]
157 1
                    + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
158 1
                    d_mat[i, j + 1]
159 1
                    + del_cost * self._taper(1 + max(i, j), max_len),  # del
160 1
                    d_mat[i, j]
161 1
                    + (
162 1
                        sub_cost * self._taper(1 + max(i, j), max_len)
163
                        if src[i] != tar[j]
164 1
                        else 0
165 1
                    ),  # sub/==
166 1
                )
167
                d_mat[i + 1, j + 1] = min(opts)
168
                if backtrace:
169
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
170
171
                if self._mode == 'osa':
172
                    if (
173
                        i + 1 > 1
174
                        and j + 1 > 1
175
                        and src[i] == tar[j - 1]
176
                        and src[i - 1] == tar[j]
177
                    ):
178 1
                        # transposition
179 1
                        d_mat[i + 1, j + 1] = min(
180 1
                            d_mat[i + 1, j + 1],
181
                            d_mat[i - 1, j - 1]
182 1
                            + trans_cost * self._taper(1 + max(i, j), max_len),
183 1
                        )
184
                        if backtrace:
185
                            trace_mat[i + 1, j + 1] = 2
186
187
        if backtrace:
188
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 142 is False. Are you sure this can never be the case?
Loading history...
189
        return d_mat
190 1
191
    def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
192
        """Return the Levenshtein alignment of two strings.
193
194
        Parameters
195 1
        ----------
196 1
        src : str
197
            Source string for comparison
198 1
        tar : str
199 1
            Target string for comparison
200 1
201
        Returns
202 1
        -------
203
        tuple
204
            A tuple containing the Levenshtein distance and the two strings,
205
            aligned.
206
207
        Examples
208
        --------
209
        >>> cmp = Levenshtein()
210
        >>> cmp.alignment('cat', 'hat')
211
        (1.0, 'cat', 'hat')
212
        >>> cmp.alignment('Niall', 'Neil')
213
        (3.0, 'N-iall', 'Nei-l-')
214
        >>> cmp.alignment('aluminum', 'Catalan')
215
        (7.0, '-aluminum', 'Catalan--')
216
        >>> cmp.alignment('ATCG', 'TAGC')
217
        (3.0, 'ATCG-', '-TAGC')
218
219
        >>> cmp = Levenshtein(mode='osa')
220
        >>> cmp.alignment('ATCG', 'TAGC')
221
        (2.0, 'ATCG', 'TAGC')
222
        >>> cmp.alignment('ACTG', 'TAGC')
223
        (4.0, 'ACT-G-', '--TAGC')
224
225
226
        .. versionadded:: 0.4.1
227
228
        """
229
        d_mat, trace_mat = self._alignment_matrix(src, tar, backtrace=True)
230
231
        src_aligned = []
232
        tar_aligned = []
233
234
        src_pos = len(src)
235
        tar_pos = len(tar)
236
237
        distance = d_mat[src_pos, tar_pos]
238
239
        while src_pos and tar_pos:
240 1
241
            src_trace, tar_trace = (
242 1
                (src_pos, tar_pos - 1),
243 1
                (src_pos - 1, tar_pos),
244
                (src_pos - 1, tar_pos - 1),
245 1
            )[trace_mat[src_pos, tar_pos]]
246 1
247
            if src_pos != src_trace and tar_pos != tar_trace:
248 1
                src_aligned.append(src[src_trace])
249
                tar_aligned.append(tar[tar_trace])
250 1
            elif src_pos != src_trace:
251
                src_aligned.append(src[src_trace])
252 1
                tar_aligned.append('-')
253
            else:
254
                src_aligned.append('-')
255
                tar_aligned.append(tar[tar_trace])
256
            src_pos, tar_pos = src_trace, tar_trace
257
        while tar_pos:
258 1
            tar_pos -= 1
259 1
            src_aligned.append('-')
260 1
            tar_aligned.append(tar[tar_pos])
261 1
        while src_pos:
262 1
            src_pos -= 1
263 1
            src_aligned.append(src[src_pos])
264
            tar_aligned.append('-')
265 1
266 1
        return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])
267 1
268 1
    def dist_abs(self, src: str, tar: str) -> float:
269 1
        """Return the Levenshtein distance between two strings.
270 1
271 1
        Parameters
272 1
        ----------
273 1
        src : str
274 1
            Source string for comparison
275 1
        tar : str
276
            Target string for comparison
277 1
278
        Returns
279 1
        -------
280
        int (may return a float if cost has float values)
281
            The Levenshtein distance between src & tar
282
283
        Examples
284
        --------
285
        >>> cmp = Levenshtein()
286
        >>> cmp.dist_abs('cat', 'hat')
287
        1
288
        >>> cmp.dist_abs('Niall', 'Neil')
289
        3
290
        >>> cmp.dist_abs('aluminum', 'Catalan')
291
        7
292
        >>> cmp.dist_abs('ATCG', 'TAGC')
293
        3
294
295
        >>> cmp = Levenshtein(mode='osa')
296
        >>> cmp.dist_abs('ATCG', 'TAGC')
297
        2
298
        >>> cmp.dist_abs('ACTG', 'TAGC')
299
        4
300
301
302
        .. versionadded:: 0.1.0
303
        .. versionchanged:: 0.3.6
304
            Encapsulated in class
305
306
        """
307
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
308
309
        src_len = len(src)
310
        tar_len = len(tar)
311
        max_len = max(src_len, tar_len)
312
313
        if src == tar:
314
            return 0
315
        if not src:
316
            return sum(
317
                ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
318 1
            )
319
        if not tar:
320 1
            return sum(
321 1
                del_cost * self._taper(pos, max_len) for pos in range(src_len)
322 1
            )
323
324 1
        d_mat = cast(
325 1
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
326 1
        )
327 1
328
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
329
            return int(d_mat[src_len, tar_len])
330 1
        else:
331 1
            return cast(float, d_mat[src_len, tar_len])
332
333
    def dist(self, src: str, tar: str) -> float:
334
        """Return the normalized Levenshtein distance between two strings.
335 1
336
        The Levenshtein distance is normalized by dividing the Levenshtein
337 1
        distance (calculated by either of the two supported methods) by the
338 1
        greater of the number of characters in src times the cost of a delete
339
        and the number of characters in tar times the cost of an insert.
340 1
        For the case in which all operations have :math:`cost = 1`, this is
341
        equivalent to the greater of the length of the two strings src & tar.
342 1
343
        Parameters
344
        ----------
345
        src : str
346
            Source string for comparison
347
        tar : str
348
            Target string for comparison
349
350
        Returns
351
        -------
352
        float
353
            The normalized Levenshtein distance between src & tar
354
355
        Examples
356
        --------
357
        >>> cmp = Levenshtein()
358
        >>> round(cmp.dist('cat', 'hat'), 12)
359
        0.333333333333
360
        >>> round(cmp.dist('Niall', 'Neil'), 12)
361
        0.6
362
        >>> cmp.dist('aluminum', 'Catalan')
363
        0.875
364
        >>> cmp.dist('ATCG', 'TAGC')
365
        0.75
366
367
368
        .. versionadded:: 0.1.0
369
        .. versionchanged:: 0.3.6
370
            Encapsulated in class
371
372
        """
373
        if src == tar:
374
            return 0.0
375
        ins_cost, del_cost = self._cost[:2]
376
377
        src_len = len(src)
378
        tar_len = len(tar)
379
380
        if self._taper_enabled:
381
            normalize_term = self._normalizer(
382 1
                [
383 1
                    sum(
384 1
                        self._taper(pos, src_len) * del_cost
385
                        for pos in range(src_len)
386 1
                    ),
387 1
                    sum(
388
                        self._taper(pos, tar_len) * ins_cost
389 1
                        for pos in range(tar_len)
390 1
                    ),
391
                ]
392
            )
393
        else:
394
            normalize_term = self._normalizer(
395
                [src_len * del_cost, tar_len * ins_cost]
396
            )
397
398
        return self.dist_abs(src, tar) / normalize_term
399
400
401
if __name__ == '__main__':
402
    import doctest
403 1
404
    doctest.testmod()
405