Completed
Pull Request — master (#135)
by Chris
11:32
created

abydos.distance._levenshtein.sim_indel()   A

Complexity

Conditions 1

Size

Total Lines 21
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 21
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.levenshtein.
20
21
The distance.levenshtein module implements string edit distance functions
22
based on Levenshtein distance, including:
23
24
    - Levenshtein distance
25
    - Optimal String Alignment distance
26
    - Levenshtein-Damerau distance
27
    - Indel distance
28
"""
29
30 1
from __future__ import division, unicode_literals
31
32 1
from sys import maxsize
33
34 1
from numpy import int as np_int
35 1
from numpy import zeros as np_zeros
36
37 1
from six.moves import range
38
39 1
from ._distance import Distance
40
41 1
__all__ = [
42
    'DamerauLevenshtein',
43
    'Indel',
44
    'Levenshtein',
45
    'damerau_levenshtein',
46
    'dist_damerau',
47
    'dist_indel',
48
    'dist_levenshtein',
49
    'levenshtein',
50
    'sim_damerau',
51
    'sim_indel',
52
    'sim_levenshtein',
53
]
54
55
56 1
class Levenshtein(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
57
    """Levenshtein distance.
58
59
    This is the standard edit distance measure. Cf.
60
    :cite:`Levenshtein:1965,Levenshtein:1966`.
61
62
    Optimal string alignment (aka restricted
63
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
64
65
    The ordinary Levenshtein & Optimal String Alignment distance both
66
    employ the Wagner-Fischer dynamic programming algorithm
67
    :cite:`Wagner:1974`.
68
69
    Levenshtein edit distance ordinarily has unit insertion, deletion, and
70
    substitution costs.
71
    """
72
73 1
    def dist_abs(self, src, tar, mode='lev', cost=(1, 1, 1, 1)):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
74
        """Return the Levenshtein distance between two strings.
75
76
        :param str src: source string for comparison
77
        :param str tar: target string for comparison
78
        :param str mode: specifies a mode for computing the Levenshtein
79
            distance:
80
81
            - 'lev' (default) computes the ordinary Levenshtein distance,
82
              in which edits may include inserts, deletes, and substitutions
83
            - 'osa' computes the Optimal String Alignment distance, in which
84
              edits may include inserts, deletes, substitutions, and
85
              transpositions but substrings may only be edited once
86
            - 'dam' computes the Damerau-Levenshtein distance, in which
87
              edits may include inserts, deletes, substitutions, and
88
              transpositions and substrings may undergo repeated edits
89
90
        :param tuple cost: a 4-tuple representing the cost of the four possible
91
            edits: inserts, deletes, substitutions, and transpositions,
92
            respectively (by default: (1, 1, 1, 1))
93
        :returns: the Levenshtein distance between src & tar
94
        :rtype: int (may return a float if cost has float values)
95
96
        >>> cmp = Levenshtein()
97
        >>> cmp.dist_abs('cat', 'hat')
98
        1
99
        >>> cmp.dist_abs('Niall', 'Neil')
100
        3
101
        >>> cmp.dist_abs('aluminum', 'Catalan')
102
        7
103
        >>> cmp.dist_abs('ATCG', 'TAGC')
104
        3
105
106
        >>> cmp.dist_abs('ATCG', 'TAGC', mode='osa')
107
        2
108
        >>> cmp.dist_abs('ACTG', 'TAGC', mode='osa')
109
        4
110
        """
111 1
        ins_cost, del_cost, sub_cost, trans_cost = cost
112
113 1
        if src == tar:
114 1
            return 0
115 1
        if not src:
116 1
            return len(tar) * ins_cost
117 1
        if not tar:
118 1
            return len(src) * del_cost
119
120 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
121 1
        for i in range(len(src) + 1):
122 1
            d_mat[i, 0] = i * del_cost
123 1
        for j in range(len(tar) + 1):
124 1
            d_mat[0, j] = j * ins_cost
125
126 1
        for i in range(len(src)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
127 1
            for j in range(len(tar)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
128 1
                d_mat[i + 1, j + 1] = min(
129
                    d_mat[i + 1, j] + ins_cost,  # ins
130
                    d_mat[i, j + 1] + del_cost,  # del
131
                    d_mat[i, j]
132
                    + (sub_cost if src[i] != tar[j] else 0),  # sub/==
133
                )
134
135 1
                if mode == 'osa':
136 1
                    if (
137
                        i + 1 > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
138
                        and j + 1 > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
139
                        and src[i] == tar[j - 1]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
140
                        and src[i - 1] == tar[j]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
141
                    ):
142
                        # transposition
143 1
                        d_mat[i + 1, j + 1] = min(
144
                            d_mat[i + 1, j + 1],
145
                            d_mat[i - 1, j - 1] + trans_cost,
146
                        )
147
148 1
        return d_mat[len(src), len(tar)]
149
150 1
    def dist(self, src, tar, mode='lev', cost=(1, 1, 1, 1)):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
151
        """Return the normalized Levenshtein distance between two strings.
152
153
        The Levenshtein distance is normalized by dividing the Levenshtein
154
        distance (calculated by any of the three supported methods) by the
155
        greater of the number of characters in src times the cost of a delete
156
        and the number of characters in tar times the cost of an insert.
157
        For the case in which all operations have :math:`cost = 1`, this is
158
        equivalent to the greater of the length of the two strings src & tar.
159
160
        :param str src: source string for comparison
161
        :param str tar: target string for comparison
162
        :param str mode: specifies a mode for computing the Levenshtein
163
            distance:
164
165
            - 'lev' (default) computes the ordinary Levenshtein distance,
166
              in which edits may include inserts, deletes, and substitutions
167
            - 'osa' computes the Optimal String Alignment distance, in which
168
              edits may include inserts, deletes, substitutions, and
169
              transpositions but substrings may only be edited once
170
171
        :param tuple cost: a 4-tuple representing the cost of the four possible
172
            edits: inserts, deletes, substitutions, and transpositions,
173
            respectively (by default: (1, 1, 1, 1))
174
        :returns: normalized Levenshtein distance
175
        :rtype: float
176
177
        >>> cmp = Levenshtein()
178
        >>> round(cmp.dist('cat', 'hat'), 12)
179
        0.333333333333
180
        >>> round(cmp.dist('Niall', 'Neil'), 12)
181
        0.6
182
        >>> cmp.dist('aluminum', 'Catalan')
183
        0.875
184
        >>> cmp.dist('ATCG', 'TAGC')
185
        0.75
186
        """
187 1
        if src == tar:
188 1
            return 0
189 1
        ins_cost, del_cost = cost[:2]
190 1
        return levenshtein(src, tar, mode, cost) / (
191
            max(len(src) * del_cost, len(tar) * ins_cost)
192
        )
193
194
195 1
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
196
    """Return the Levenshtein distance between two strings.
197
198
    :param str src: source string for comparison
199
    :param str tar: target string for comparison
200
    :param str mode: specifies a mode for computing the Levenshtein distance:
201
202
        - 'lev' (default) computes the ordinary Levenshtein distance,
203
          in which edits may include inserts, deletes, and substitutions
204
        - 'osa' computes the Optimal String Alignment distance, in which
205
          edits may include inserts, deletes, substitutions, and
206
          transpositions but substrings may only be edited once
207
208
    :param tuple cost: a 4-tuple representing the cost of the four possible
209
        edits: inserts, deletes, substitutions, and transpositions,
210
        respectively (by default: (1, 1, 1, 1))
211
    :returns: the Levenshtein distance between src & tar
212
    :rtype: int (may return a float if cost has float values)
213
214
    >>> levenshtein('cat', 'hat')
215
    1
216
    >>> levenshtein('Niall', 'Neil')
217
    3
218
    >>> levenshtein('aluminum', 'Catalan')
219
    7
220
    >>> levenshtein('ATCG', 'TAGC')
221
    3
222
223
    >>> levenshtein('ATCG', 'TAGC', mode='osa')
224
    2
225
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
226
    4
227
    """
228 1
    return Levenshtein().dist_abs(src, tar, mode, cost)
229
230
231 1
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
232
    """Return the normalized Levenshtein distance between two strings.
233
234
    The Levenshtein distance is normalized by dividing the Levenshtein distance
235
    (calculated by any of the three supported methods) by the greater of
236
    the number of characters in src times the cost of a delete and
237
    the number of characters in tar times the cost of an insert.
238
    For the case in which all operations have :math:`cost = 1`, this is
239
    equivalent to the greater of the length of the two strings src & tar.
240
241
    :param str src: source string for comparison
242
    :param str tar: target string for comparison
243
    :param str mode: specifies a mode for computing the Levenshtein distance:
244
245
        - 'lev' (default) computes the ordinary Levenshtein distance,
246
          in which edits may include inserts, deletes, and substitutions
247
        - 'osa' computes the Optimal String Alignment distance, in which
248
          edits may include inserts, deletes, substitutions, and
249
          transpositions but substrings may only be edited once
250
251
    :param tuple cost: a 4-tuple representing the cost of the four possible
252
        edits: inserts, deletes, substitutions, and transpositions,
253
        respectively (by default: (1, 1, 1, 1))
254
    :returns: normalized Levenshtein distance
255
    :rtype: float
256
257
    >>> round(dist_levenshtein('cat', 'hat'), 12)
258
    0.333333333333
259
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
260
    0.6
261
    >>> dist_levenshtein('aluminum', 'Catalan')
262
    0.875
263
    >>> dist_levenshtein('ATCG', 'TAGC')
264
    0.75
265
    """
266 1
    return Levenshtein().dist(src, tar, mode, cost)
267
268
269 1
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
270
    """Return the Levenshtein similarity of two strings.
271
272
    Normalized Levenshtein similarity is the complement of normalized
273
    Levenshtein distance:
274
    :math:`sim_{Levenshtein} = 1 - dist_{Levenshtein}`.
275
276
    :param str src: source string for comparison
277
    :param str tar: target string for comparison
278
    :param str mode: specifies a mode for computing the Levenshtein distance:
279
280
            - 'lev' (default) computes the ordinary Levenshtein distance,
281
              in which edits may include inserts, deletes, and substitutions
282
            - 'osa' computes the Optimal String Alignment distance, in which
283
              edits may include inserts, deletes, substitutions, and
284
              transpositions but substrings may only be edited once
285
286
    :param tuple cost: a 4-tuple representing the cost of the four possible
287
        edits:
288
        inserts, deletes, substitutions, and transpositions, respectively
289
        (by default: (1, 1, 1, 1))
290
    :returns: normalized Levenshtein similarity
291
    :rtype: float
292
293
    >>> round(sim_levenshtein('cat', 'hat'), 12)
294
    0.666666666667
295
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
296
    0.4
297
    >>> sim_levenshtein('aluminum', 'Catalan')
298
    0.125
299
    >>> sim_levenshtein('ATCG', 'TAGC')
300
    0.25
301
    """
302 1
    return Levenshtein().sim(src, tar, mode, cost)
303
304
305 1
class DamerauLevenshtein(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
306
    """Damerau-Levenshtein distance.
307
308
    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
309
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
310
    :cite:`Stern:2014`, under the MIT license:
311
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
312
    """
313
314 1
    def dist_abs(self, src, tar, cost=(1, 1, 1, 1)):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (22/15).
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
315
        """Return the Damerau-Levenshtein distance between two strings.
316
317
        :param str src: source string for comparison
318
        :param str tar: target string for comparison
319
        :param tuple cost: a 4-tuple representing the cost of the four possible
320
            edits:
321
            inserts, deletes, substitutions, and transpositions, respectively
322
            (by default: (1, 1, 1, 1))
323
        :returns: the Damerau-Levenshtein distance between src & tar
324
        :rtype: int (may return a float if cost has float values)
325
326
        >>> cmp = DamerauLevenshtein()
327
        >>> cmp.dist_abs('cat', 'hat')
328
        1
329
        >>> cmp.dist_abs('Niall', 'Neil')
330
        3
331
        >>> cmp.dist_abs('aluminum', 'Catalan')
332
        7
333
        >>> cmp.dist_abs('ATCG', 'TAGC')
334
        2
335
        """
336 1
        ins_cost, del_cost, sub_cost, trans_cost = cost
337
338 1
        if src == tar:
339 1
            return 0
340 1
        if not src:
341 1
            return len(tar) * ins_cost
342 1
        if not tar:
343 1
            return len(src) * del_cost
344
345 1
        if 2 * trans_cost < ins_cost + del_cost:
346 1
            raise ValueError(
347
                'Unsupported cost assignment; the cost of two transpositions'
348
                + 'must not be less than the cost of an insert plus a delete.'
349
            )
350
351 1
        d_mat = np_zeros((len(src)) * (len(tar)), dtype=np_int).reshape(
352
            (len(src), len(tar))
353
        )
354
355 1
        if src[0] != tar[0]:
356 1
            d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)
357
358 1
        src_index_by_character = {src[0]: 0}
359 1
        for i in range(1, len(src)):
360 1
            del_distance = d_mat[i - 1, 0] + del_cost
361 1
            ins_distance = (i + 1) * del_cost + ins_cost
362 1
            match_distance = i * del_cost + (
363
                0 if src[i] == tar[0] else sub_cost
364
            )
365 1
            d_mat[i, 0] = min(del_distance, ins_distance, match_distance)
366
367 1
        for j in range(1, len(tar)):
368 1
            del_distance = (j + 1) * ins_cost + del_cost
369 1
            ins_distance = d_mat[0, j - 1] + ins_cost
370 1
            match_distance = j * ins_cost + (
371
                0 if src[0] == tar[j] else sub_cost
372
            )
373 1
            d_mat[0, j] = min(del_distance, ins_distance, match_distance)
374
375 1
        for i in range(1, len(src)):
376 1
            max_src_letter_match_index = 0 if src[i] == tar[0] else -1
377 1
            for j in range(1, len(tar)):
378 1
                candidate_swap_index = (
379
                    -1
380
                    if tar[j] not in src_index_by_character
381
                    else src_index_by_character[tar[j]]
382
                )
383 1
                j_swap = max_src_letter_match_index
384 1
                del_distance = d_mat[i - 1, j] + del_cost
385 1
                ins_distance = d_mat[i, j - 1] + ins_cost
386 1
                match_distance = d_mat[i - 1, j - 1]
387 1
                if src[i] != tar[j]:
388 1
                    match_distance += sub_cost
389
                else:
390 1
                    max_src_letter_match_index = j
391
392 1
                if candidate_swap_index != -1 and j_swap != -1:
393 1
                    i_swap = candidate_swap_index
394
395 1
                    if i_swap == 0 and j_swap == 0:
396 1
                        pre_swap_cost = 0
397
                    else:
398 1
                        pre_swap_cost = d_mat[
399
                            max(0, i_swap - 1), max(0, j_swap - 1)
400
                        ]
401 1
                    swap_distance = (
402
                        pre_swap_cost
403
                        + (i - i_swap - 1) * del_cost
404
                        + (j - j_swap - 1) * ins_cost
405
                        + trans_cost
406
                    )
407
                else:
408 1
                    swap_distance = maxsize
409
410 1
                d_mat[i, j] = min(
411
                    del_distance, ins_distance, match_distance, swap_distance
412
                )
413 1
            src_index_by_character[src[i]] = i
414
415 1
        return d_mat[len(src) - 1, len(tar) - 1]
416
417 1
    def dist(self, src, tar, cost=(1, 1, 1, 1)):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
418
        """Return the Damerau-Levenshtein similarity of two strings.
419
420
        Damerau-Levenshtein distance normalized to the interval [0, 1].
421
422
        The Damerau-Levenshtein distance is normalized by dividing the
423
        Damerau-Levenshtein distance by the greater of
424
        the number of characters in src times the cost of a delete and
425
        the number of characters in tar times the cost of an insert.
426
        For the case in which all operations have :math:`cost = 1`, this is
427
        equivalent to the greater of the length of the two strings src & tar.
428
429
        The arguments are identical to those of the levenshtein() function.
430
431
        :param str src: source string for comparison
432
        :param str tar: target string for comparison
433
        :param tuple cost: a 4-tuple representing the cost of the four possible
434
            edits:
435
            inserts, deletes, substitutions, and transpositions, respectively
436
            (by default: (1, 1, 1, 1))
437
        :returns: normalized Damerau-Levenshtein distance
438
        :rtype: float
439
440
        >>> cmp = DamerauLevenshtein()
441
        >>> round(cmp.dist('cat', 'hat'), 12)
442
        0.333333333333
443
        >>> round(cmp.dist('Niall', 'Neil'), 12)
444
        0.6
445
        >>> cmp.dist('aluminum', 'Catalan')
446
        0.875
447
        >>> cmp.dist('ATCG', 'TAGC')
448
        0.5
449
        """
450 1
        if src == tar:
451 1
            return 0.0
452 1
        ins_cost, del_cost = cost[:2]
453 1
        return self.dist_abs(src, tar, cost) / (
454
            max(len(src) * del_cost, len(tar) * ins_cost)
455
        )
456
457
458 1
def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
459
    """Return the Damerau-Levenshtein distance between two strings.
460
461
    :param str src: source string for comparison
462
    :param str tar: target string for comparison
463
    :param tuple cost: a 4-tuple representing the cost of the four possible
464
        edits:
465
        inserts, deletes, substitutions, and transpositions, respectively
466
        (by default: (1, 1, 1, 1))
467
    :returns: the Damerau-Levenshtein distance between src & tar
468
    :rtype: int (may return a float if cost has float values)
469
470
    >>> damerau_levenshtein('cat', 'hat')
471
    1
472
    >>> damerau_levenshtein('Niall', 'Neil')
473
    3
474
    >>> damerau_levenshtein('aluminum', 'Catalan')
475
    7
476
    >>> damerau_levenshtein('ATCG', 'TAGC')
477
    2
478
    """
479 1
    return DamerauLevenshtein().dist_abs(src, tar, cost)
480
481
482 1
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
483
    """Return the Damerau-Levenshtein similarity of two strings.
484
485
    Damerau-Levenshtein distance normalized to the interval [0, 1].
486
487
    The Damerau-Levenshtein distance is normalized by dividing the
488
    Damerau-Levenshtein distance by the greater of
489
    the number of characters in src times the cost of a delete and
490
    the number of characters in tar times the cost of an insert.
491
    For the case in which all operations have :math:`cost = 1`, this is
492
    equivalent to the greater of the length of the two strings src & tar.
493
494
    The arguments are identical to those of the levenshtein() function.
495
496
    :param str src: source string for comparison
497
    :param str tar: target string for comparison
498
    :param tuple cost: a 4-tuple representing the cost of the four possible
499
        edits:
500
        inserts, deletes, substitutions, and transpositions, respectively
501
        (by default: (1, 1, 1, 1))
502
    :returns: normalized Damerau-Levenshtein distance
503
    :rtype: float
504
505
    >>> round(dist_damerau('cat', 'hat'), 12)
506
    0.333333333333
507
    >>> round(dist_damerau('Niall', 'Neil'), 12)
508
    0.6
509
    >>> dist_damerau('aluminum', 'Catalan')
510
    0.875
511
    >>> dist_damerau('ATCG', 'TAGC')
512
    0.5
513
    """
514 1
    return DamerauLevenshtein().dist(src, tar, cost)
515
516
517 1
def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
518
    """Return the Damerau-Levenshtein similarity of two strings.
519
520
    Normalized Damerau-Levenshtein similarity the complement of normalized
521
    Damerau-Levenshtein distance:
522
    :math:`sim_{Damerau} = 1 - dist_{Damerau}`.
523
524
    The arguments are identical to those of the levenshtein() function.
525
526
    :param str src: source string for comparison
527
    :param str tar: target string for comparison
528
    :param tuple cost: a 4-tuple representing the cost of the four possible
529
        edits:
530
        inserts, deletes, substitutions, and transpositions, respectively
531
        (by default: (1, 1, 1, 1))
532
    :returns: normalized Damerau-Levenshtein similarity
533
    :rtype: float
534
535
    >>> round(sim_damerau('cat', 'hat'), 12)
536
    0.666666666667
537
    >>> round(sim_damerau('Niall', 'Neil'), 12)
538
    0.4
539
    >>> sim_damerau('aluminum', 'Catalan')
540
    0.125
541
    >>> sim_damerau('ATCG', 'TAGC')
542
    0.5
543
    """
544 1
    return DamerauLevenshtein().sim(src, tar, cost)
545
546
547 1
class Indel(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
548
    """Indel distance.
549
550
    This is equivalent to Levenshtein distance, when only inserts and deletes
551
    are possible.
552
    """
553
554 1
    lev = Levenshtein()
555
556 1
    def dist_abs(self, src, tar):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
557
        """Return the indel distance between two strings.
558
559
        :param str src: source string for comparison
560
        :param str tar: target string for comparison
561
        :returns: indel distance
562
        :rtype: int
563
564
        >>> cmp = Indel()
565
        >>> cmp.dist_abs('cat', 'hat')
566
        2
567
        >>> cmp.dist_abs('Niall', 'Neil')
568
        3
569
        >>> cmp.dist_abs('Colin', 'Cuilen')
570
        5
571
        >>> cmp.dist_abs('ATCG', 'TAGC')
572
        4
573
        """
574 1
        return self.lev.dist_abs(src, tar, mode='lev', cost=(1, 1, 9999, 9999))
575
576 1
    def dist(self, src, tar):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist' method
Loading history...
577
        """Return the normalized indel distance between two strings.
578
579
        This is equivalent to normalized Levenshtein distance, when only
580
        inserts and deletes are possible.
581
582
        :param str src: source string for comparison
583
        :param str tar: target string for comparison
584
        :returns: indel distance
585
        :rtype: float
586
587
        >>> cmp = Indel()
588
        >>> round(cmp.dist('cat', 'hat'), 12)
589
        0.333333333333
590
        >>> round(cmp.dist('Niall', 'Neil'), 12)
591
        0.333333333333
592
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
593
        0.454545454545
594
        >>> cmp.dist('ATCG', 'TAGC')
595
        0.5
596
        """
597 1
        if src == tar:
598 1
            return 0.0
599 1
        return self.dist_abs(src, tar) / (len(src) + len(tar))
600
601
602 1
def indel(src, tar):
603
    """Return the indel distance between two strings.
604
605
    :param str src: source string for comparison
606
    :param str tar: target string for comparison
607
    :returns: indel distance
608
    :rtype: int
609
610
    >>> indel('cat', 'hat')
611
    2
612
    >>> indel('Niall', 'Neil')
613
    3
614
    >>> indel('Colin', 'Cuilen')
615
    5
616
    >>> indel('ATCG', 'TAGC')
617
    4
618
    """
619
    return Indel().dist_abs(src, tar)
620
621
622 1
def dist_indel(src, tar):
623
    """Return the normalized indel distance between two strings.
624
625
    This is equivalent to normalized Levenshtein distance, when only inserts
626
    and deletes are possible.
627
628
    :param str src: source string for comparison
629
    :param str tar: target string for comparison
630
    :returns: indel distance
631
    :rtype: float
632
633
    >>> round(dist_indel('cat', 'hat'), 12)
634
    0.333333333333
635
    >>> round(dist_indel('Niall', 'Neil'), 12)
636
    0.333333333333
637
    >>> round(dist_indel('Colin', 'Cuilen'), 12)
638
    0.454545454545
639
    >>> dist_indel('ATCG', 'TAGC')
640
    0.5
641
    """
642 1
    return Indel().dist(src, tar)
643
644
645 1
def sim_indel(src, tar):
646
    """Return the normalized indel similarity of two strings.
647
648
    This is equivalent to normalized Levenshtein similarity, when only inserts
649
    and deletes are possible.
650
651
    :param str src: source string for comparison
652
    :param str tar: target string for comparison
653
    :returns: indel similarity
654
    :rtype: float
655
656
    >>> round(sim_indel('cat', 'hat'), 12)
657
    0.666666666667
658
    >>> round(sim_indel('Niall', 'Neil'), 12)
659
    0.666666666667
660
    >>> round(sim_indel('Colin', 'Cuilen'), 12)
661
    0.545454545455
662
    >>> sim_indel('ATCG', 'TAGC')
663
    0.5
664
    """
665 1
    return Indel().sim(src, tar)
666
667
668
if __name__ == '__main__':
669
    import doctest
670
671
    doctest.testmod()
672