Completed
Pull Request — master (#141)
by Chris
11:04
created

abydos.distance._damerau_levenshtein.sim_damerau()   A

Complexity

Conditions 1

Size

Total Lines 27
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 27
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._damerau_levenshtein.
20
21
Damerau-Levenshtein distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from sys import maxsize
32
33 1
from numpy import int as np_int
34 1
from numpy import zeros as np_zeros
35
36 1
from six.moves import range
37
38 1
from ._distance import _Distance
39
40 1
# Public API: the comparator class followed by its functional wrappers.
__all__ = [
    'DamerauLevenshtein', 'damerau_levenshtein',
    'dist_damerau', 'sim_damerau',
]
46
47
48 1
class DamerauLevenshtein(_Distance):
    """Damerau-Levenshtein distance.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
    """

    def dist_abs(self, src, tar, cost=(1, 1, 1, 1)):
        """Return the Damerau-Levenshtein distance between two strings.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison
            cost (tuple): a 4-tuple representing the cost of the four possible
                edits: inserts, deletes, substitutions, and transpositions,
                respectively (by default: (1, 1, 1, 1))

        Returns:
            int (a float if cost has float values): The
                Damerau-Levenshtein distance between src & tar

        Raises:
            ValueError: Unsupported cost assignment; the cost of two
                transpositions must not be less than the cost of an insert plus
                a delete. (Only checked when src and tar differ and both are
                non-empty.)

        Examples:
            >>> cmp = DamerauLevenshtein()
            >>> cmp.dist_abs('cat', 'hat')
            1
            >>> cmp.dist_abs('Niall', 'Neil')
            3
            >>> cmp.dist_abs('aluminum', 'Catalan')
            7
            >>> cmp.dist_abs('ATCG', 'TAGC')
            2
            >>> cmp.dist_abs('cat', 'hat', cost=(0.5, 0.5, 0.5, 0.5))
            0.5

        """
        ins_cost, del_cost, sub_cost, trans_cost = cost

        # Trivial cases that need no matrix.
        if src == tar:
            return 0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        if 2 * trans_cost < ins_cost + del_cost:
            raise ValueError(
                'Unsupported cost assignment; the cost of two transpositions '
                + 'must not be less than the cost of an insert plus a delete.'
            )

        src_len = len(src)
        tar_len = len(tar)

        # An integer matrix would silently truncate fractional costs on
        # assignment (e.g. 0.5 -> 0), so switch to a float matrix as soon as
        # any cost is a float.
        has_float_cost = any(isinstance(value, float) for value in cost)
        d_mat = np_zeros(
            (src_len, tar_len), dtype=float if has_float_cost else int
        )

        # d_mat[i, j] is the cost of turning src[:i + 1] into tar[:j + 1].
        if src[0] != tar[0]:
            d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

        # Last row index at which each source character has been seen so far
        # (rows strictly above the one currently being filled).
        src_index_by_character = {src[0]: 0}

        # First column: prefixes of src against the first character of tar.
        for i in range(1, src_len):
            del_distance = d_mat[i - 1, 0] + del_cost
            ins_distance = (i + 1) * del_cost + ins_cost
            match_distance = i * del_cost + (
                0 if src[i] == tar[0] else sub_cost
            )
            d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

        # First row: the first character of src against prefixes of tar.
        for j in range(1, tar_len):
            del_distance = (j + 1) * ins_cost + del_cost
            ins_distance = d_mat[0, j - 1] + ins_cost
            match_distance = j * ins_cost + (
                0 if src[0] == tar[j] else sub_cost
            )
            d_mat[0, j] = min(del_distance, ins_distance, match_distance)

        for i in range(1, src_len):
            # Rightmost column (so far in this row) where tar matches src[i];
            # -1 means no match yet.
            max_src_letter_match_index = 0 if src[i] == tar[0] else -1
            for j in range(1, tar_len):
                # Both swap indices must be read *before* the match index is
                # updated for the current cell.
                i_swap = src_index_by_character.get(tar[j], -1)
                j_swap = max_src_letter_match_index

                del_distance = d_mat[i - 1, j] + del_cost
                ins_distance = d_mat[i, j - 1] + ins_cost
                match_distance = d_mat[i - 1, j - 1]
                if src[i] != tar[j]:
                    match_distance += sub_cost
                else:
                    max_src_letter_match_index = j

                if i_swap != -1 and j_swap != -1:
                    if i_swap == 0 and j_swap == 0:
                        pre_swap_cost = 0
                    else:
                        pre_swap_cost = d_mat[
                            max(0, i_swap - 1), max(0, j_swap - 1)
                        ]
                    # Cost up to the swapped pair, plus removing/adding the
                    # characters between the pair, plus the transposition.
                    swap_distance = (
                        pre_swap_cost
                        + (i - i_swap - 1) * del_cost
                        + (j - j_swap - 1) * ins_cost
                        + trans_cost
                    )
                else:
                    # No transposition possible here: use a sentinel that can
                    # never win the min() below.
                    swap_distance = maxsize

                d_mat[i, j] = min(
                    del_distance, ins_distance, match_distance, swap_distance
                )
            src_index_by_character[src[i]] = i

        # .item() converts the NumPy scalar to a built-in int/float, matching
        # the type returned by the early-exit branches above.
        return d_mat[src_len - 1, tar_len - 1].item()

    def dist(self, src, tar, cost=(1, 1, 1, 1)):
        """Return the normalized Damerau-Levenshtein distance of two strings.

        Damerau-Levenshtein distance normalized to the interval [0, 1].

        The Damerau-Levenshtein distance is normalized by dividing the
        Damerau-Levenshtein distance by the greater of
        the number of characters in src times the cost of a delete and
        the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison
            cost (tuple): a 4-tuple representing the cost of the four possible
                edits: inserts, deletes, substitutions, and transpositions,
                respectively (by default: (1, 1, 1, 1))

        Returns:
            float: The normalized Damerau-Levenshtein distance

        Examples:
            >>> cmp = DamerauLevenshtein()
            >>> round(cmp.dist('cat', 'hat'), 12)
            0.333333333333
            >>> round(cmp.dist('Niall', 'Neil'), 12)
            0.6
            >>> cmp.dist('aluminum', 'Catalan')
            0.875
            >>> cmp.dist('ATCG', 'TAGC')
            0.5

        """
        # Identical strings (including two empty strings) short-circuit, which
        # also avoids dividing by a zero-length normalizer.
        if src == tar:
            return 0.0
        ins_cost, del_cost = cost[:2]
        return self.dist_abs(src, tar, cost) / (
            max(len(src) * del_cost, len(tar) * ins_cost)
        )
209
210
211 1
def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Compute the absolute Damerau-Levenshtein distance of two strings.

    Functional shortcut that builds a :py:class:`DamerauLevenshtein`
    comparator and delegates to :py:meth:`DamerauLevenshtein.dist_abs`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        cost (tuple): a 4-tuple representing the cost of the four possible
            edits: inserts, deletes, substitutions, and transpositions,
            respectively (by default: (1, 1, 1, 1))

    Returns:
        int (may return a float if cost has float values): The
            Damerau-Levenshtein distance between src & tar

    Examples:
        >>> damerau_levenshtein('cat', 'hat')
        1
        >>> damerau_levenshtein('Niall', 'Neil')
        3
        >>> damerau_levenshtein('aluminum', 'Catalan')
        7
        >>> damerau_levenshtein('ATCG', 'TAGC')
        2

    """
    comparator = DamerauLevenshtein()
    return comparator.dist_abs(src, tar, cost)
239
240
241 1
def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Compute the normalized Damerau-Levenshtein distance of two strings.

    Functional shortcut that builds a :py:class:`DamerauLevenshtein`
    comparator and delegates to :py:meth:`DamerauLevenshtein.dist`.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        cost (tuple): a 4-tuple representing the cost of the four possible
            edits: inserts, deletes, substitutions, and transpositions,
            respectively (by default: (1, 1, 1, 1))

    Returns:
        float: The normalized Damerau-Levenshtein distance

    Examples:
        >>> round(dist_damerau('cat', 'hat'), 12)
        0.333333333333
        >>> round(dist_damerau('Niall', 'Neil'), 12)
        0.6
        >>> dist_damerau('aluminum', 'Catalan')
        0.875
        >>> dist_damerau('ATCG', 'TAGC')
        0.5

    """
    comparator = DamerauLevenshtein()
    return comparator.dist(src, tar, cost)
268
269
270 1
def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Compute the normalized Damerau-Levenshtein similarity of two strings.

    Functional shortcut that builds a :py:class:`DamerauLevenshtein`
    comparator and delegates to its ``sim`` method. ``sim`` is not defined on
    the class in this module, so it is presumably inherited from
    ``_Distance``.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        cost (tuple): a 4-tuple representing the cost of the four possible
            edits: inserts, deletes, substitutions, and transpositions,
            respectively (by default: (1, 1, 1, 1))

    Returns:
        float: The normalized Damerau-Levenshtein similarity

    Examples:
        >>> round(sim_damerau('cat', 'hat'), 12)
        0.666666666667
        >>> round(sim_damerau('Niall', 'Neil'), 12)
        0.4
        >>> sim_damerau('aluminum', 'Catalan')
        0.125
        >>> sim_damerau('ATCG', 'TAGC')
        0.5

    """
    comparator = DamerauLevenshtein()
    return comparator.sim(src, tar, cost)
297
298
299
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
303