Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._levenshtein.levenshtein()   A

Complexity

Conditions 1

Size

Total Lines 48
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 4
dl 0
loc 48
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._levenshtein.
20
21
The distance._Levenshtein module implements string edit distance functions
22
based on Levenshtein distance, including:
23
24
    - Levenshtein distance
25
    - Optimal String Alignment distance
26
"""
27
28 1
from __future__ import (
29
    absolute_import,
30
    division,
31
    print_function,
32
    unicode_literals,
33
)
34
35
36 1
from numpy import int as np_int
37 1
from numpy import zeros as np_zeros
38
39 1
from six.moves import range
40
41 1
from ._distance import _Distance
42
43 1
__all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein']
44
45
46 1
class Levenshtein(_Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
47
    """Levenshtein distance.
48
49
    This is the standard edit distance measure. Cf.
50
    :cite:`Levenshtein:1965,Levenshtein:1966`.
51
52
    Optimal string alignment (aka restricted
53
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
54
55
    The ordinary Levenshtein & Optimal String Alignment distance both
56
    employ the Wagner-Fischer dynamic programming algorithm
57
    :cite:`Wagner:1974`.
58
59
    Levenshtein edit distance ordinarily has unit insertion, deletion, and
60
    substitution costs.
61
    """
62
63 1
    def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
64
        """Return the Levenshtein distance between two strings.
65
66
        Parameters
67
        ----------
68
        src : str
69
            Source string for comparison
70
        tar : str
71
            Target string for comparison
72
        mode : str
73
            Specifies a mode for computing the Levenshtein distance:
74
75
                - ``lev`` (default) computes the ordinary Levenshtein distance,
76
                  in which edits may include inserts, deletes, and
77
                  substitutions
78
                - ``osa`` computes the Optimal String Alignment distance, in
79
                  which edits may include inserts, deletes, substitutions, and
80
                  transpositions but substrings may only be edited once
81
82
        cost : tuple
83
            A 4-tuple representing the cost of the four possible edits:
84
            inserts, deletes, substitutions, and transpositions, respectively
85
            (by default: (1, 1, 1, 1))
86
87
        Returns
88
        -------
89
        int (may return a float if cost has float values)
90
            The Levenshtein distance between src & tar
91
92
        Examples
93
        --------
94
        >>> cmp = Levenshtein()
95
        >>> cmp.dist_abs('cat', 'hat')
96
        1
97
        >>> cmp.dist_abs('Niall', 'Neil')
98
        3
99
        >>> cmp.dist_abs('aluminum', 'Catalan')
100
        7
101
        >>> cmp.dist_abs('ATCG', 'TAGC')
102
        3
103
104
        >>> cmp.dist_abs('ATCG', 'TAGC', mode='osa')
105
        2
106
        >>> cmp.dist_abs('ACTG', 'TAGC', mode='osa')
107
        4
108
109
        """
110 1
        ins_cost, del_cost, sub_cost, trans_cost = cost
111
112 1
        if src == tar:
113 1
            return 0
114 1
        if not src:
115 1
            return len(tar) * ins_cost
116 1
        if not tar:
117 1
            return len(src) * del_cost
118
119 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
120 1
        for i in range(len(src) + 1):
121 1
            d_mat[i, 0] = i * del_cost
122 1
        for j in range(len(tar) + 1):
123 1
            d_mat[0, j] = j * ins_cost
124
125 1
        for i in range(len(src)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
126 1
            for j in range(len(tar)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
127 1
                d_mat[i + 1, j + 1] = min(
128
                    d_mat[i + 1, j] + ins_cost,  # ins
129
                    d_mat[i, j + 1] + del_cost,  # del
130
                    d_mat[i, j]
131
                    + (sub_cost if src[i] != tar[j] else 0),  # sub/==
132
                )
133
134 1
                if mode == 'osa':
135 1
                    if (
136
                        i + 1 > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
137
                        and j + 1 > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
138
                        and src[i] == tar[j - 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
139
                        and src[i - 1] == tar[j]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
140
                    ):
141
                        # transposition
142 1
                        d_mat[i + 1, j + 1] = min(
143
                            d_mat[i + 1, j + 1],
144
                            d_mat[i - 1, j - 1] + trans_cost,
145
                        )
146
147 1
        return d_mat[len(src), len(tar)]
148
149 1
    def dist(self, src, tar, mode='lev', cost=(1, 1, 1, 1)):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
150
        """Return the normalized Levenshtein distance between two strings.
151
152
        The Levenshtein distance is normalized by dividing the Levenshtein
153
        distance (calculated by any of the three supported methods) by the
154
        greater of the number of characters in src times the cost of a delete
155
        and the number of characters in tar times the cost of an insert.
156
        For the case in which all operations have :math:`cost = 1`, this is
157
        equivalent to the greater of the length of the two strings src & tar.
158
159
        Parameters
160
        ----------
161
        src : str
162
            Source string for comparison
163
        tar : str
164
            Target string for comparison
165
        mode : str
166
            Specifies a mode for computing the Levenshtein distance:
167
168
                - ``lev`` (default) computes the ordinary Levenshtein distance,
169
                  in which edits may include inserts, deletes, and
170
                  substitutions
171
                - ``osa`` computes the Optimal String Alignment distance, in
172
                  which edits may include inserts, deletes, substitutions, and
173
                  transpositions but substrings may only be edited once
174
175
        cost : tuple
176
            A 4-tuple representing the cost of the four possible edits:
177
            inserts, deletes, substitutions, and transpositions, respectively
178
            (by default: (1, 1, 1, 1))
179
180
        Returns
181
        -------
182
        float
183
            The normalized Levenshtein distance between src & tar
184
185
        Examples
186
        --------
187
        >>> cmp = Levenshtein()
188
        >>> round(cmp.dist('cat', 'hat'), 12)
189
        0.333333333333
190
        >>> round(cmp.dist('Niall', 'Neil'), 12)
191
        0.6
192
        >>> cmp.dist('aluminum', 'Catalan')
193
        0.875
194
        >>> cmp.dist('ATCG', 'TAGC')
195
        0.75
196
197
        """
198 1
        if src == tar:
199 1
            return 0
200 1
        ins_cost, del_cost = cost[:2]
201 1
        return levenshtein(src, tar, mode, cost) / (
202
            max(len(src) * del_cost, len(tar) * ins_cost)
203
        )
204
205
206 1
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
207
    """Return the Levenshtein distance between two strings.
208
209
    This is a wrapper of :py:meth:`Levenshtein.dist_abs`.
210
211
    Parameters
212
    ----------
213
    src : str
214
        Source string for comparison
215
    tar : str
216
        Target string for comparison
217
    mode : str
218
        Specifies a mode for computing the Levenshtein distance:
219
220
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
221
              which edits may include inserts, deletes, and substitutions
222
            - ``osa`` computes the Optimal String Alignment distance, in which
223
              edits may include inserts, deletes, substitutions, and
224
              transpositions but substrings may only be edited once
225
226
    cost : tuple
227
        A 4-tuple representing the cost of the four possible edits: inserts,
228
        deletes, substitutions, and transpositions, respectively (by default:
229
        (1, 1, 1, 1))
230
231
    Returns
232
    -------
233
    int (may return a float if cost has float values)
234
        The Levenshtein distance between src & tar
235
236
    Examples
237
    --------
238
    >>> levenshtein('cat', 'hat')
239
    1
240
    >>> levenshtein('Niall', 'Neil')
241
    3
242
    >>> levenshtein('aluminum', 'Catalan')
243
    7
244
    >>> levenshtein('ATCG', 'TAGC')
245
    3
246
247
    >>> levenshtein('ATCG', 'TAGC', mode='osa')
248
    2
249
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
250
    4
251
252
    """
253 1
    return Levenshtein().dist_abs(src, tar, mode, cost)
254
255
256 1
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
257
    """Return the normalized Levenshtein distance between two strings.
258
259
    This is a wrapper of :py:meth:`Levenshtein.dist`.
260
261
    Parameters
262
    ----------
263
    src : str
264
        Source string for comparison
265
    tar : str
266
        Target string for comparison
267
    mode : str
268
        Specifies a mode for computing the Levenshtein distance:
269
270
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
271
              which edits may include inserts, deletes, and substitutions
272
            - ``osa`` computes the Optimal String Alignment distance, in which
273
              edits may include inserts, deletes, substitutions, and
274
              transpositions but substrings may only be edited once
275
276
    cost : tuple
277
        A 4-tuple representing the cost of the four possible edits: inserts,
278
        deletes, substitutions, and transpositions, respectively (by default:
279
        (1, 1, 1, 1))
280
281
    Returns
282
    -------
283
    float
284
        The Levenshtein distance between src & tar
285
286
    Examples
287
    --------
288
    >>> round(dist_levenshtein('cat', 'hat'), 12)
289
    0.333333333333
290
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
291
    0.6
292
    >>> dist_levenshtein('aluminum', 'Catalan')
293
    0.875
294
    >>> dist_levenshtein('ATCG', 'TAGC')
295
    0.75
296
297
    """
298 1
    return Levenshtein().dist(src, tar, mode, cost)
299
300
301 1
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
302
    """Return the Levenshtein similarity of two strings.
303
304
    This is a wrapper of :py:meth:`Levenshtein.sim`.
305
306
    Parameters
307
    ----------
308
    src : str
309
        Source string for comparison
310
    tar : str
311
        Target string for comparison
312
    mode : str
313
        Specifies a mode for computing the Levenshtein distance:
314
315
            - ``lev`` (default) computes the ordinary Levenshtein distance, in
316
              which edits may include inserts, deletes, and substitutions
317
            - ``osa`` computes the Optimal String Alignment distance, in which
318
              edits may include inserts, deletes, substitutions, and
319
              transpositions but substrings may only be edited once
320
321
    cost : tuple
322
        A 4-tuple representing the cost of the four possible edits: inserts,
323
        deletes, substitutions, and transpositions, respectively (by default:
324
        (1, 1, 1, 1))
325
326
    Returns
327
    -------
328
    float
329
        The Levenshtein similarity between src & tar
330
331
    Examples
332
    --------
333
    >>> round(sim_levenshtein('cat', 'hat'), 12)
334
    0.666666666667
335
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
336
    0.4
337
    >>> sim_levenshtein('aluminum', 'Catalan')
338
    0.125
339
    >>> sim_levenshtein('ATCG', 'TAGC')
340
    0.25
341
342
    """
343 1
    return Levenshtein().sim(src, tar, mode, cost)
344
345
346
if __name__ == '__main__':
347
    import doctest
348
349
    doctest.testmod()
350