Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

abydos.distance._levenshtein.Levenshtein._taper()   A

Complexity

Conditions 2

Size

Total Lines 5
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 5
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 2
nop 3
crap 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._levenshtein.
20
21
The distance._Levenshtein module implements string edit distance functions
22
based on Levenshtein distance, including:
23
24
    - Levenshtein distance
25
    - Optimal String Alignment distance
26
"""
27
28 1
from __future__ import (
29
    absolute_import,
30
    division,
31
    print_function,
32
    unicode_literals,
33
)
34
35 1
from sys import float_info
36
37 1
from deprecation import deprecated
38
39 1
import numpy as np
40
41 1
from six.moves import range
42
43 1
from ._distance import _Distance
44 1
from .. import __version__
45
46 1
__all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein']
47
48
49 1
class Levenshtein(_Distance):
50
    """Levenshtein distance.
51
52
    This is the standard edit distance measure. Cf.
53
    :cite:`Levenshtein:1965,Levenshtein:1966`.
54
55
    Optimal string alignment (aka restricted
56
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
57
58
    The ordinary Levenshtein & Optimal String Alignment distance both
59
    employ the Wagner-Fischer dynamic programming algorithm
60
    :cite:`Wagner:1974`.
61
62
    Levenshtein edit distance ordinarily has unit insertion, deletion, and
63
    substitution costs.
64
65
    .. versionadded:: 0.3.6
66
    .. versionchanged:: 0.4.0
67
        Added taper option
68
    """
69
70 1
    def __init__(
71
        self,
72
        mode='lev',
73
        cost=(1, 1, 1, 1),
74
        normalizer=max,
75
        taper=False,
76
        **kwargs
77
    ):
78
        """Initialize Levenshtein instance.
79
80
        Parameters
81
        ----------
82
        mode : str
83
            Specifies a mode for computing the Levenshtein distance:
84
85
                - ``lev`` (default) computes the ordinary Levenshtein distance,
86
                  in which edits may include inserts, deletes, and
87
                  substitutions
88
                - ``osa`` computes the Optimal String Alignment distance, in
89
                  which edits may include inserts, deletes, substitutions, and
90
                  transpositions but substrings may only be edited once
91
92
        cost : tuple
93
            A 4-tuple representing the cost of the four possible edits:
94
            inserts, deletes, substitutions, and transpositions, respectively
95
            (by default: (1, 1, 1, 1))
96
        normalizer : function
97
            A function that takes an list and computes a normalization term
98
            by which the edit distance is divided (max by default). Another
99
            good option is the sum function.
100
        taper : bool
101
            Enables cost tapering. Following :cite:`Zobel:1996`, it causes
102
            edits at the start of the string to "just [exceed] twice the
103
            minimum penalty for replacement or deletion at the end of the
104
            string".
105
        **kwargs
106
            Arbitrary keyword arguments
107
108
109
        .. versionadded:: 0.4.0
110
111
        """
112 1
        super(Levenshtein, self).__init__(**kwargs)
113 1
        self._mode = mode
114 1
        self._cost = cost
115 1
        self._normalizer = normalizer
116 1
        self._taper_enabled = taper
117
118 1
    def _taper(self, pos, length):
119 1
        return (
120
            round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
121
            if self._taper_enabled
122
            else 1
123
        )
124
125 1
    def _alignment_matrix(self, src, tar, backtrace=True):
126
        """Return the Levenshtein alignment matrix.
127
128
        Parameters
129
        ----------
130
        src : str
131
            Source string for comparison
132
        tar : str
133
            Target string for comparison
134
        backtrace : bool
135
            Return the backtrace matrix as well
136
137
        Returns
138
        -------
139
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
140
            The alignment matrix and (optionally) the backtrace matrix
141
142
143
        .. versionadded:: 0.4.1
144
145
        """
146 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
147
148 1
        src_len = len(src)
149 1
        tar_len = len(tar)
150 1
        max_len = max(src_len, tar_len)
151
152 1
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
153 1
        if backtrace:
154 1
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
155 1
        for i in range(src_len + 1):
156 1
            d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
157 1
            if backtrace:
158 1
                trace_mat[i, 0] = 1
159 1
        for j in range(tar_len + 1):
160 1
            d_mat[0, j] = j * self._taper(j, max_len) * ins_cost
161 1
            if backtrace:
162 1
                trace_mat[0, j] = 0
163
164 1
        for i in range(src_len):
165 1
            for j in range(tar_len):
166 1
                opts = (
167
                    d_mat[i + 1, j]
168
                    + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
169
                    d_mat[i, j + 1]
170
                    + del_cost * self._taper(1 + max(i, j), max_len),  # del
171
                    d_mat[i, j]
172
                    + (
173
                        sub_cost * self._taper(1 + max(i, j), max_len)
174
                        if src[i] != tar[j]
175
                        else 0
176
                    ),  # sub/==
177
                )
178 1
                d_mat[i + 1, j + 1] = min(opts)
179 1
                if backtrace:
180 1
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
181
182 1
                if self._mode == 'osa':
183 1
                    if (
184
                        i + 1 > 1
185
                        and j + 1 > 1
186
                        and src[i] == tar[j - 1]
187
                        and src[i - 1] == tar[j]
188
                    ):
189
                        # transposition
190 1
                        d_mat[i + 1, j + 1] = min(
191
                            d_mat[i + 1, j + 1],
192
                            d_mat[i - 1, j - 1]
193
                            + trans_cost * self._taper(1 + max(i, j), max_len),
194
                        )
195 1
                        if backtrace:
196 1
                            trace_mat[i + 1, j + 1] = 2
197
198 1
        if backtrace:
199 1
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 153 is False. Are you sure this can never be the case?
Loading history...
200 1
        return d_mat
201
202 1
    def alignment(self, src, tar):
203
        """Return the Levenshtein alignment of two strings.
204
205
        Parameters
206
        ----------
207
        src : str
208
            Source string for comparison
209
        tar : str
210
            Target string for comparison
211
212
        Returns
213
        -------
214
        tuple
215
            A tuple containing the Levenshtein distance and the two strings,
216
            aligned.
217
218
        Examples
219
        --------
220
        >>> cmp = Levenshtein()
221
        >>> cmp.alignment('cat', 'hat')
222
        (1.0, 'cat', 'hat')
223
        >>> cmp.alignment('Niall', 'Neil')
224
        (3.0, 'N-iall', 'Nei-l-')
225
        >>> cmp.alignment('aluminum', 'Catalan')
226
        (7.0, '-aluminum', 'Catalan--')
227
        >>> cmp.alignment('ATCG', 'TAGC')
228
        (3.0, 'ATCG-', '-TAGC')
229
230
        >>> cmp = Levenshtein(mode='osa')
231
        >>> cmp.alignment('ATCG', 'TAGC')
232
        (2.0, 'ATCG', 'TAGC')
233
        >>> cmp.alignment('ACTG', 'TAGC')
234
        (4.0, 'ACT-G-', '--TAGC')
235
236
237
        .. versionadded:: 0.4.1
238
239
        """
240 1
        d_mat, trace_mat = self._alignment_matrix(src, tar, backtrace=True)
241
242 1
        src_aligned = []
243 1
        tar_aligned = []
244
245 1
        src_pos = len(src)
246 1
        tar_pos = len(tar)
247
248 1
        distance = d_mat[src_pos, tar_pos]
249
250 1
        while src_pos and tar_pos:
251
252 1
            src_trace, tar_trace = (
253
                (src_pos, tar_pos - 1),
254
                (src_pos - 1, tar_pos),
255
                (src_pos - 1, tar_pos - 1),
256
            )[trace_mat[src_pos, tar_pos]]
257
258 1
            if src_pos != src_trace and tar_pos != tar_trace:
259 1
                src_aligned.append(src[src_trace])
260 1
                tar_aligned.append(tar[tar_trace])
261 1
            elif src_pos != src_trace:
262 1
                src_aligned.append(src[src_trace])
263 1
                tar_aligned.append('-')
264
            else:
265 1
                src_aligned.append('-')
266 1
                tar_aligned.append(tar[tar_trace])
267 1
            src_pos, tar_pos = src_trace, tar_trace
268 1
        while tar_pos:
269 1
            tar_pos -= 1
270 1
            src_aligned.append('-')
271 1
            tar_aligned.append(tar[tar_pos])
272 1
        while src_pos:
273 1
            src_pos -= 1
274 1
            src_aligned.append(src[src_pos])
275 1
            tar_aligned.append('-')
276
277 1
        return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])
278
279 1
    def dist_abs(self, src, tar):
280
        """Return the Levenshtein distance between two strings.
281
282
        Parameters
283
        ----------
284
        src : str
285
            Source string for comparison
286
        tar : str
287
            Target string for comparison
288
289
        Returns
290
        -------
291
        int (may return a float if cost has float values)
292
            The Levenshtein distance between src & tar
293
294
        Examples
295
        --------
296
        >>> cmp = Levenshtein()
297
        >>> cmp.dist_abs('cat', 'hat')
298
        1
299
        >>> cmp.dist_abs('Niall', 'Neil')
300
        3
301
        >>> cmp.dist_abs('aluminum', 'Catalan')
302
        7
303
        >>> cmp.dist_abs('ATCG', 'TAGC')
304
        3
305
306
        >>> cmp = Levenshtein(mode='osa')
307
        >>> cmp.dist_abs('ATCG', 'TAGC')
308
        2
309
        >>> cmp.dist_abs('ACTG', 'TAGC')
310
        4
311
312
313
        .. versionadded:: 0.1.0
314
        .. versionchanged:: 0.3.6
315
            Encapsulated in class
316
317
        """
318 1
        ins_cost, del_cost, sub_cost, trans_cost = self._cost
319
320 1
        src_len = len(src)
321 1
        tar_len = len(tar)
322 1
        max_len = max(src_len, tar_len)
323
324 1
        if src == tar:
325 1
            return 0
326 1
        if not src:
327 1
            return sum(
328
                ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
329
            )
330 1
        if not tar:
331 1
            return sum(
332
                del_cost * self._taper(pos, max_len) for pos in range(src_len)
333
            )
334
335 1
        d_mat = self._alignment_matrix(src, tar, backtrace=False)
336
337 1
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
338 1
            return int(d_mat[src_len, tar_len])
339
        else:
340 1
            return d_mat[src_len, tar_len]
341
342 1
    def dist(self, src, tar):
343
        """Return the normalized Levenshtein distance between two strings.
344
345
        The Levenshtein distance is normalized by dividing the Levenshtein
346
        distance (calculated by either of the two supported methods) by the
347
        greater of the number of characters in src times the cost of a delete
348
        and the number of characters in tar times the cost of an insert.
349
        For the case in which all operations have :math:`cost = 1`, this is
350
        equivalent to the greater of the length of the two strings src & tar.
351
352
        Parameters
353
        ----------
354
        src : str
355
            Source string for comparison
356
        tar : str
357
            Target string for comparison
358
359
        Returns
360
        -------
361
        float
362
            The normalized Levenshtein distance between src & tar
363
364
        Examples
365
        --------
366
        >>> cmp = Levenshtein()
367
        >>> round(cmp.dist('cat', 'hat'), 12)
368
        0.333333333333
369
        >>> round(cmp.dist('Niall', 'Neil'), 12)
370
        0.6
371
        >>> cmp.dist('aluminum', 'Catalan')
372
        0.875
373
        >>> cmp.dist('ATCG', 'TAGC')
374
        0.75
375
376
377
        .. versionadded:: 0.1.0
378
        .. versionchanged:: 0.3.6
379
            Encapsulated in class
380
381
        """
382 1
        if src == tar:
383 1
            return 0.0
384 1
        ins_cost, del_cost = self._cost[:2]
385
386 1
        src_len = len(src)
387 1
        tar_len = len(tar)
388
389 1
        if self._taper_enabled:
390 1
            normalize_term = self._normalizer(
391
                [
392
                    sum(
393
                        self._taper(pos, src_len) * del_cost
394
                        for pos in range(src_len)
395
                    ),
396
                    sum(
397
                        self._taper(pos, tar_len) * ins_cost
398
                        for pos in range(tar_len)
399
                    ),
400
                ]
401
            )
402
        else:
403 1
            normalize_term = self._normalizer(
404
                [src_len * del_cost, tar_len * ins_cost]
405
            )
406
407 1
        return self.dist_abs(src, tar) / normalize_term
408
409
410 1
@deprecated(
411
    deprecated_in='0.4.0',
412
    removed_in='0.6.0',
413
    current_version=__version__,
414
    details='Use the Levenshtein.dist_abs method instead.',
415
)
416 1
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
417
    """Return the Levenshtein distance between two strings.
418
419
    This is a wrapper of :py:meth:`Levenshtein.dist_abs`.
420
421
    Parameters
422
    ----------
423
    src : str
424
        Source string for comparison
425
    tar : str
426
        Target string for comparison
427
    mode : str
428
        Specifies a mode for computing the Levenshtein distance:
429
430
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
431
              which edits may include inserts, deletes, and substitutions
432
            - ``osa`` computes the Optimal String Alignment distance, in which
433
              edits may include inserts, deletes, substitutions, and
434
              transpositions but substrings may only be edited once
435
436
    cost : tuple
437
        A 4-tuple representing the cost of the four possible edits: inserts,
438
        deletes, substitutions, and transpositions, respectively (by default:
439
        (1, 1, 1, 1))
440
441
    Returns
442
    -------
443
    int (may return a float if cost has float values)
444
        The Levenshtein distance between src & tar
445
446
    Examples
447
    --------
448
    >>> levenshtein('cat', 'hat')
449
    1
450
    >>> levenshtein('Niall', 'Neil')
451
    3
452
    >>> levenshtein('aluminum', 'Catalan')
453
    7
454
    >>> levenshtein('ATCG', 'TAGC')
455
    3
456
457
    >>> levenshtein('ATCG', 'TAGC', mode='osa')
458
    2
459
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
460
    4
461
462
    .. versionadded:: 0.1.0
463
464
    """
465 1
    return Levenshtein(mode=mode, cost=cost).dist_abs(src, tar)
466
467
468 1
@deprecated(
469
    deprecated_in='0.4.0',
470
    removed_in='0.6.0',
471
    current_version=__version__,
472
    details='Use the Levenshtein.dist method instead.',
473
)
474 1
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
475
    """Return the normalized Levenshtein distance between two strings.
476
477
    This is a wrapper of :py:meth:`Levenshtein.dist`.
478
479
    Parameters
480
    ----------
481
    src : str
482
        Source string for comparison
483
    tar : str
484
        Target string for comparison
485
    mode : str
486
        Specifies a mode for computing the Levenshtein distance:
487
488
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
489
              which edits may include inserts, deletes, and substitutions
490
            - ``osa`` computes the Optimal String Alignment distance, in which
491
              edits may include inserts, deletes, substitutions, and
492
              transpositions but substrings may only be edited once
493
494
    cost : tuple
495
        A 4-tuple representing the cost of the four possible edits: inserts,
496
        deletes, substitutions, and transpositions, respectively (by default:
497
        (1, 1, 1, 1))
498
499
    Returns
500
    -------
501
    float
502
        The Levenshtein distance between src & tar
503
504
    Examples
505
    --------
506
    >>> round(dist_levenshtein('cat', 'hat'), 12)
507
    0.333333333333
508
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
509
    0.6
510
    >>> dist_levenshtein('aluminum', 'Catalan')
511
    0.875
512
    >>> dist_levenshtein('ATCG', 'TAGC')
513
    0.75
514
515
    .. versionadded:: 0.1.0
516
517
    """
518 1
    return Levenshtein(mode=mode, cost=cost).dist(src, tar)
519
520
521 1
@deprecated(
522
    deprecated_in='0.4.0',
523
    removed_in='0.6.0',
524
    current_version=__version__,
525
    details='Use the Levenshtein.sim method instead.',
526
)
527 1
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
528
    """Return the Levenshtein similarity of two strings.
529
530
    This is a wrapper of :py:meth:`Levenshtein.sim`.
531
532
    Parameters
533
    ----------
534
    src : str
535
        Source string for comparison
536
    tar : str
537
        Target string for comparison
538
    mode : str
539
        Specifies a mode for computing the Levenshtein distance:
540
541
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
542
              which edits may include inserts, deletes, and substitutions
543
            - ``osa`` computes the Optimal String Alignment distance, in which
544
              edits may include inserts, deletes, substitutions, and
545
              transpositions but substrings may only be edited once
546
547
    cost : tuple
548
        A 4-tuple representing the cost of the four possible edits: inserts,
549
        deletes, substitutions, and transpositions, respectively (by default:
550
        (1, 1, 1, 1))
551
552
    Returns
553
    -------
554
    float
555
        The Levenshtein similarity between src & tar
556
557
    Examples
558
    --------
559
    >>> round(sim_levenshtein('cat', 'hat'), 12)
560
    0.666666666667
561
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
562
    0.4
563
    >>> sim_levenshtein('aluminum', 'Catalan')
564
    0.125
565
    >>> sim_levenshtein('ATCG', 'TAGC')
566
    0.25
567
568
    .. versionadded:: 0.1.0
569
570
    """
571 1
    return Levenshtein(mode=mode, cost=cost).sim(src, tar)
572
573
574
if __name__ == '__main__':
575
    import doctest
576
577
    doctest.testmod()
578