Completed
Pull Request — master (#225)
by Chris
09:15
created

abydos.distance._levenshtein.levenshtein()   A

Complexity

Conditions 1

Size

Total Lines 56
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 56
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 4
crap 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._levenshtein.
20
21
The distance._Levenshtein module implements string edit distance functions
22
based on Levenshtein distance, including:
23
24
    - Levenshtein distance
25
    - Optimal String Alignment distance
26
"""
27
28 1
from __future__ import (
29
    absolute_import,
30
    division,
31
    print_function,
32
    unicode_literals,
33
)
34
35 1
from sys import float_info
36
37 1
from deprecation import deprecated
38
39 1
from numpy import float as np_float
40 1
from numpy import zeros as np_zeros
41
42 1
from six.moves import range
43
44 1
from ._distance import _Distance
45 1
from .. import __version__
46
47 1
__all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein']
48
49
50 1
class Levenshtein(_Distance):
51
    """Levenshtein distance.
52
53
    This is the standard edit distance measure. Cf.
54
    :cite:`Levenshtein:1965,Levenshtein:1966`.
55
56
    Optimal string alignment (aka restricted
57
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
58
59
    The ordinary Levenshtein & Optimal String Alignment distance both
60
    employ the Wagner-Fischer dynamic programming algorithm
61
    :cite:`Wagner:1974`.
62
63
    Levenshtein edit distance ordinarily has unit insertion, deletion, and
64
    substitution costs.
65
66
    .. versionadded:: 0.3.6
67
    .. versionchanged:: 0.4.0
68
        Added taper option
69
    """
70
71 1
    def __init__(
72
        self,
73
        mode='lev',
74
        cost=(1, 1, 1, 1),
75
        normalizer=max,
76
        taper=False,
77
        **kwargs
78
    ):
79
        """Initialize Levenshtein instance.
80
81
        Parameters
82
        ----------
83
        mode : str
84
            Specifies a mode for computing the Levenshtein distance:
85
86
                - ``lev`` (default) computes the ordinary Levenshtein distance,
87
                  in which edits may include inserts, deletes, and
88
                  substitutions
89
                - ``osa`` computes the Optimal String Alignment distance, in
90
                  which edits may include inserts, deletes, substitutions, and
91
                  transpositions but substrings may only be edited once
92
93
        cost : tuple
94
            A 4-tuple representing the cost of the four possible edits:
95
            inserts, deletes, substitutions, and transpositions, respectively
96
            (by default: (1, 1, 1, 1))
97
        normalizer : function
98
            A function that takes an list and computes a normalization term
99
            by which the edit distance is divided (max by default). Another
100
            good option is the sum function.
101
        taper : bool
102
            Enables cost tapering. Following :cite:`Zobel:1996`, it causes
103
            edits at the start of the string to "just [exceed] twice the
104
            minimum penalty for replacement or deletion at the end of the
105
            string".
106
        **kwargs
107
            Arbitrary keyword arguments
108
109
110
        .. versionadded:: 0.4.0
111
112
        """
113 1
        super(Levenshtein, self).__init__(**kwargs)
114 1
        self._mode = mode
115 1
        self._cost = cost
116 1
        self._normalizer = normalizer
117 1
        self._taper_enabled = taper
118
119 1
    def _taper(self, pos, length):
120 1
        return (
121
            round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
122
            if self._taper_enabled
123
            else 1
124
        )
125
126 1
    def _alignment_matrix(self, src, tar):
127
        """Return the Levenshtein alignment matrix.
128
129
        Parameters
130
        ----------
131
        src : str
132
            Source string for comparison
133
        tar : str
134
            Target string for comparison
135
136
        Returns
137
        -------
138
        numpy.ndarray
139
            The alignment matrix
140
141
142
        .. versionadded:: 0.4.1
143
144
        """
145 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
146
147 1
        src_len = len(src)
148 1
        tar_len = len(tar)
149 1
        max_len = max(src_len, tar_len)
150
151 1
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
152 1
        for i in range(src_len + 1):
153 1
            d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
154 1
        for j in range(tar_len + 1):
155 1
            d_mat[0, j] = j * self._taper(j, max_len) * ins_cost
156
157 1
        for i in range(src_len):
158 1
            for j in range(tar_len):
159 1
                d_mat[i + 1, j + 1] = min(
160
                    d_mat[i + 1, j]
161
                    + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
162
                    d_mat[i, j + 1]
163
                    + del_cost * self._taper(1 + max(i, j), max_len),  # del
164
                    d_mat[i, j]
165
                    + (
166
                        sub_cost * self._taper(1 + max(i, j), max_len)
167
                        if src[i] != tar[j]
168
                        else 0
169
                    ),  # sub/==
170
                )
171
172 1
                if self._mode == 'osa':
173 1
                    if (
174
                        i + 1 > 1
175
                        and j + 1 > 1
176
                        and src[i] == tar[j - 1]
177
                        and src[i - 1] == tar[j]
178
                    ):
179
                        # transposition
180 1
                        d_mat[i + 1, j + 1] = min(
181
                            d_mat[i + 1, j + 1],
182
                            d_mat[i - 1, j - 1]
183
                            + trans_cost * self._taper(1 + max(i, j), max_len),
184
                        )
185
186 1
        return d_mat
187
188 1 View Code Duplication
    def alignment(self, src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
189
        """Return the Levenshtein alignment of two strings.
190
191
        Parameters
192
        ----------
193
        src : str
194
            Source string for comparison
195
        tar : str
196
            Target string for comparison
197
198
        Returns
199
        -------
200
        tuple
201
            A tuple containing the Levenshtein distance and the two strings,
202
            aligned.
203
204
        Examples
205
        --------
206
        >>> cmp = Levenshtein()
207
        >>> cmp.alignment('cat', 'hat')
208
        (1.0, 'cat', 'hat')
209
        >>> cmp.alignment('Niall', 'Neil')
210
        (3.0, 'Niall', 'Neil-')
211
        >>> cmp.alignment('aluminum', 'Catalan')
212
        (7.0, '-aluminum', 'Catalan--')
213
        >>> cmp.alignment('ATCG', 'TAGC')
214
        (3.0, 'ATCG-', '-TAGC')
215
216
        >>> cmp = Levenshtein(mode='osa')
217
        >>> cmp.alignment('ATCG', 'TAGC')
218
        (2.0, 'ATCG', 'TAGC')
219
        >>> cmp.alignment('ACTG', 'TAGC')
220
        (4.0, 'ACTG', 'TAGC')
221
222
223
        .. versionadded:: 0.4.1
224
225
        """
226 1
        d_mat = self._alignment_matrix(src, tar)
227
228 1
        src_aligned = []
229 1
        tar_aligned = []
230
231 1
        src_pos = len(src)
232 1
        tar_pos = len(tar)
233
234 1
        distance = d_mat[src_pos, tar_pos]
235
236 1
        while src_pos and tar_pos:
237 1
            up = d_mat[src_pos, tar_pos - 1]
238 1
            left = d_mat[src_pos - 1, tar_pos]
239 1
            diag = d_mat[src_pos - 1, tar_pos - 1]
240
241 1
            if diag <= min(up, left):
242 1
                src_pos -= 1
243 1
                tar_pos -= 1
244 1
                src_aligned.append(src[src_pos])
245 1
                tar_aligned.append(tar[tar_pos])
246 1
            elif up <= left:
247 1
                tar_pos -= 1
248 1
                src_aligned.append('-')
249 1
                tar_aligned.append(tar[tar_pos])
250
            else:
251 1
                src_pos -= 1
252 1
                src_aligned.append(src[src_pos])
253 1
                tar_aligned.append('-')
254 1
        while tar_pos:
255 1
            tar_pos -= 1
256 1
            tar_aligned.append(tar[tar_pos])
257 1
            src_aligned.append('-')
258 1
        while src_pos:
259 1
            src_pos -= 1
260 1
            src_aligned.append(src[src_pos])
261 1
            tar_aligned.append('-')
262
263 1
        return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])
264
265 1
    def dist_abs(self, src, tar):
266
        """Return the Levenshtein distance between two strings.
267
268
        Parameters
269
        ----------
270
        src : str
271
            Source string for comparison
272
        tar : str
273
            Target string for comparison
274
275
        Returns
276
        -------
277
        int (may return a float if cost has float values)
278
            The Levenshtein distance between src & tar
279
280
        Examples
281
        --------
282
        >>> cmp = Levenshtein()
283
        >>> cmp.dist_abs('cat', 'hat')
284
        1
285
        >>> cmp.dist_abs('Niall', 'Neil')
286
        3
287
        >>> cmp.dist_abs('aluminum', 'Catalan')
288
        7
289
        >>> cmp.dist_abs('ATCG', 'TAGC')
290
        3
291
292
        >>> cmp = Levenshtein(mode='osa')
293
        >>> cmp.dist_abs('ATCG', 'TAGC')
294
        2
295
        >>> cmp.dist_abs('ACTG', 'TAGC')
296
        4
297
298
299
        .. versionadded:: 0.1.0
300
        .. versionchanged:: 0.3.6
301
            Encapsulated in class
302
303
        """
304 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
305
306 1
        src_len = len(src)
307 1
        tar_len = len(tar)
308 1
        max_len = max(src_len, tar_len)
309
310 1
        if src == tar:
311 1
            return 0
312 1
        if not src:
313 1
            return sum(
314
                ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
315
            )
316 1
        if not tar:
317 1
            return sum(
318
                del_cost * self._taper(pos, max_len) for pos in range(src_len)
319
            )
320
321 1
        d_mat = self._alignment_matrix(src, tar)
322
323 1
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
324 1
            return int(d_mat[src_len, tar_len])
325
        else:
326 1
            return d_mat[src_len, tar_len]
327
328 1
    def dist(self, src, tar):
329
        """Return the normalized Levenshtein distance between two strings.
330
331
        The Levenshtein distance is normalized by dividing the Levenshtein
332
        distance (calculated by any of the three supported methods) by the
333
        greater of the number of characters in src times the cost of a delete
334
        and the number of characters in tar times the cost of an insert.
335
        For the case in which all operations have :math:`cost = 1`, this is
336
        equivalent to the greater of the length of the two strings src & tar.
337
338
        Parameters
339
        ----------
340
        src : str
341
            Source string for comparison
342
        tar : str
343
            Target string for comparison
344
345
        Returns
346
        -------
347
        float
348
            The normalized Levenshtein distance between src & tar
349
350
        Examples
351
        --------
352
        >>> cmp = Levenshtein()
353
        >>> round(cmp.dist('cat', 'hat'), 12)
354
        0.333333333333
355
        >>> round(cmp.dist('Niall', 'Neil'), 12)
356
        0.6
357
        >>> cmp.dist('aluminum', 'Catalan')
358
        0.875
359
        >>> cmp.dist('ATCG', 'TAGC')
360
        0.75
361
362
363
        .. versionadded:: 0.1.0
364
        .. versionchanged:: 0.3.6
365
            Encapsulated in class
366
367
        """
368 1
        if src == tar:
369 1
            return 0.0
370 1
        ins_cost, del_cost = self._cost[:2]
371
372 1
        src_len = len(src)
373 1
        tar_len = len(tar)
374
375 1
        if self._taper_enabled:
376 1
            normalize_term = self._normalizer(
377
                [
378
                    sum(
379
                        self._taper(pos, src_len) * del_cost
380
                        for pos in range(src_len)
381
                    ),
382
                    sum(
383
                        self._taper(pos, tar_len) * ins_cost
384
                        for pos in range(tar_len)
385
                    ),
386
                ]
387
            )
388
        else:
389 1
            normalize_term = self._normalizer(
390
                [src_len * del_cost, tar_len * ins_cost]
391
            )
392
393 1
        return self.dist_abs(src, tar) / normalize_term
394
395
396 1
@deprecated(
397
    deprecated_in='0.4.0',
398
    removed_in='0.6.0',
399
    current_version=__version__,
400
    details='Use the Levenshtein.dist_abs method instead.',
401
)
402 1
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
403
    """Return the Levenshtein distance between two strings.
404
405
    This is a wrapper of :py:meth:`Levenshtein.dist_abs`.
406
407
    Parameters
408
    ----------
409
    src : str
410
        Source string for comparison
411
    tar : str
412
        Target string for comparison
413
    mode : str
414
        Specifies a mode for computing the Levenshtein distance:
415
416
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
417
              which edits may include inserts, deletes, and substitutions
418
            - ``osa`` computes the Optimal String Alignment distance, in which
419
              edits may include inserts, deletes, substitutions, and
420
              transpositions but substrings may only be edited once
421
422
    cost : tuple
423
        A 4-tuple representing the cost of the four possible edits: inserts,
424
        deletes, substitutions, and transpositions, respectively (by default:
425
        (1, 1, 1, 1))
426
427
    Returns
428
    -------
429
    int (may return a float if cost has float values)
430
        The Levenshtein distance between src & tar
431
432
    Examples
433
    --------
434
    >>> levenshtein('cat', 'hat')
435
    1
436
    >>> levenshtein('Niall', 'Neil')
437
    3
438
    >>> levenshtein('aluminum', 'Catalan')
439
    7
440
    >>> levenshtein('ATCG', 'TAGC')
441
    3
442
443
    >>> levenshtein('ATCG', 'TAGC', mode='osa')
444
    2
445
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
446
    4
447
448
    .. versionadded:: 0.1.0
449
450
    """
451 1
    return Levenshtein(mode=mode, cost=cost).dist_abs(src, tar)
452
453
454 1
@deprecated(
455
    deprecated_in='0.4.0',
456
    removed_in='0.6.0',
457
    current_version=__version__,
458
    details='Use the Levenshtein.dist method instead.',
459
)
460 1
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
461
    """Return the normalized Levenshtein distance between two strings.
462
463
    This is a wrapper of :py:meth:`Levenshtein.dist`.
464
465
    Parameters
466
    ----------
467
    src : str
468
        Source string for comparison
469
    tar : str
470
        Target string for comparison
471
    mode : str
472
        Specifies a mode for computing the Levenshtein distance:
473
474
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
475
              which edits may include inserts, deletes, and substitutions
476
            - ``osa`` computes the Optimal String Alignment distance, in which
477
              edits may include inserts, deletes, substitutions, and
478
              transpositions but substrings may only be edited once
479
480
    cost : tuple
481
        A 4-tuple representing the cost of the four possible edits: inserts,
482
        deletes, substitutions, and transpositions, respectively (by default:
483
        (1, 1, 1, 1))
484
485
    Returns
486
    -------
487
    float
488
        The Levenshtein distance between src & tar
489
490
    Examples
491
    --------
492
    >>> round(dist_levenshtein('cat', 'hat'), 12)
493
    0.333333333333
494
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
495
    0.6
496
    >>> dist_levenshtein('aluminum', 'Catalan')
497
    0.875
498
    >>> dist_levenshtein('ATCG', 'TAGC')
499
    0.75
500
501
    .. versionadded:: 0.1.0
502
503
    """
504 1
    return Levenshtein(mode=mode, cost=cost).dist(src, tar)
505
506
507 1
@deprecated(
508
    deprecated_in='0.4.0',
509
    removed_in='0.6.0',
510
    current_version=__version__,
511
    details='Use the Levenshtein.sim method instead.',
512
)
513 1
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
514
    """Return the Levenshtein similarity of two strings.
515
516
    This is a wrapper of :py:meth:`Levenshtein.sim`.
517
518
    Parameters
519
    ----------
520
    src : str
521
        Source string for comparison
522
    tar : str
523
        Target string for comparison
524
    mode : str
525
        Specifies a mode for computing the Levenshtein distance:
526
527
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
528
              which edits may include inserts, deletes, and substitutions
529
            - ``osa`` computes the Optimal String Alignment distance, in which
530
              edits may include inserts, deletes, substitutions, and
531
              transpositions but substrings may only be edited once
532
533
    cost : tuple
534
        A 4-tuple representing the cost of the four possible edits: inserts,
535
        deletes, substitutions, and transpositions, respectively (by default:
536
        (1, 1, 1, 1))
537
538
    Returns
539
    -------
540
    float
541
        The Levenshtein similarity between src & tar
542
543
    Examples
544
    --------
545
    >>> round(sim_levenshtein('cat', 'hat'), 12)
546
    0.666666666667
547
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
548
    0.4
549
    >>> sim_levenshtein('aluminum', 'Catalan')
550
    0.125
551
    >>> sim_levenshtein('ATCG', 'TAGC')
552
    0.25
553
554
    .. versionadded:: 0.1.0
555
556
    """
557 1
    return Levenshtein(mode=mode, cost=cost).sim(src, tar)
558
559
560
if __name__ == '__main__':
561
    import doctest
562
563
    doctest.testmod()
564