Completed
Pull Request — master (#225)
by Chris
09:15
created

DiscountedLevenshtein.alignment()   B

Complexity

Conditions 7

Size

Total Lines 76
Code Lines 32

Duplication

Lines 76
Ratio 100 %

Code Coverage

Tests 32
CRAP Score 7

Importance

Changes 0
Metric Value
eloc 32
dl 76
loc 76
ccs 32
cts 32
cp 1
rs 7.712
c 0
b 0
f 0
cc 7
nop 3
crap 7

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._discounted_levenshtein.
20
21
Discounted Levenshtein edit distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from math import log
32
33 1
from numpy import float as np_float
34 1
from numpy import zeros as np_zeros
35
36 1
from six.moves import range
37
38 1
from ._distance import _Distance
39
40 1
__all__ = ['DiscountedLevenshtein']
41
42
43 1
class DiscountedLevenshtein(_Distance):
44
    """Discounted Levenshtein distance.
45
46
    This is a variant of Levenshtein distance for which edits later in a string
47
    have discounted cost, on the theory that earlier edits are less likely
48
    than later ones.
49
50
    .. versionadded:: 0.4.1
51
    """
52
53 1
    def __init__(
54
        self,
55
        mode='lev',
56
        normalizer=max,
57
        discount_from=1,
58
        discount_func='log',
59
        vowels='aeiou',
60
        **kwargs
61
    ):
62
        """Initialize DiscountedLevenshtein instance.
63
64
        Parameters
65
        ----------
66
        mode : str
67
            Specifies a mode for computing the discounted Levenshtein distance:
68
69
                - ``lev`` (default) computes the ordinary Levenshtein distance,
70
                  in which edits may include inserts, deletes, and
71
                  substitutions
72
                - ``osa`` computes the Optimal String Alignment distance, in
73
                  which edits may include inserts, deletes, substitutions, and
74
                  transpositions but substrings may only be edited once
75
76
        normalizer : function
77
            A function that takes an list and computes a normalization term
78
            by which the edit distance is divided (max by default). Another
79
            good option is the sum function.
80
        discount_from : int or str
81
            If an int is supplied, this is the first character whose edit cost
82
            will be discounted. If the str ``coda`` is supplied, discounting
83
            will start with the first non-vowel after the first vowel (the
84
            first syllable coda).
85
        discount_func : str or function
86
            The two supported str arguments are ``log``, for a logarithmic
87
            discount function, and ``exp`` for a exponential discount function.
88
            See notes below for information on how to supply your own
89
            discount function.
90
        vowels : str
91
            These are the letters to consider as vowels when discount_from is
92
            set to ``coda``. It defaults to the English vowels 'aeiou', but
93
            it would be reasonable to localize this to other languages or to
94
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
95
        **kwargs
96
            Arbitrary keyword arguments
97
98
        Notes
99
        -----
100
        This class is highly experimental and will need additional tuning.
101
102
        The discount function can be passed as a callable function. It should
103
        expect an integer as its only argument and return a float, ideally
104
        less than or equal to 1.0. The argument represents the degree of
105
        discounting to apply.
106
107
108
        .. versionadded:: 0.4.1
109
110
        """
111 1
        super(DiscountedLevenshtein, self).__init__(**kwargs)
112 1
        self._mode = mode
113 1
        self._normalizer = normalizer
114 1
        self._discount_from = discount_from
115 1
        self._vowels = set(vowels.lower())
116 1
        if callable(discount_func):
117 1
            self._cost = discount_func
118 1
        elif discount_func == 'exp':
119 1
            self._cost = self._exp_discount
120
        else:
121 1
            self._cost = self._log_discount
122
123 1
    @staticmethod
124
    def _log_discount(discounts):
125 1
        return 1 / (log(1 + discounts / 5) + 1)
126
127 1
    @staticmethod
128
    def _exp_discount(discounts):
129 1
        return 1 / (discounts + 1) ** 0.2
130
131 1
    def _alignment_matrix(self, src, tar):
132
        """Return the Levenshtein alignment matrix.
133
134
        Parameters
135
        ----------
136
        src : str
137
            Source string for comparison
138
        tar : str
139
            Target string for comparison
140
141
        Returns
142
        -------
143
        numpy.ndarray
144
            The alignment matrix
145
146
147
        .. versionadded:: 0.4.1
148
149
        """
150 1
        src_len = len(src)
151 1
        tar_len = len(tar)
152
153 1
        if self._discount_from == 'coda':
154 1
            discount_from = [0, 0]
155
156 1
            src_voc = src.lower()
157 1
            for i in range(len(src_voc)):
158 1
                if src_voc[i] in self._vowels:
159 1
                    discount_from[0] = i
160 1
                    break
161 1
            for i in range(discount_from[0], len(src_voc)):
162 1
                if src_voc[i] not in self._vowels:
163 1
                    discount_from[0] = i
164 1
                    break
165
            else:
166 1
                discount_from[0] += 1
167
168 1
            tar_voc = tar.lower()
169 1
            for i in range(len(tar_voc)):
170 1
                if tar_voc[i] in self._vowels:
171 1
                    discount_from[1] = i
172 1
                    break
173 1
            for i in range(discount_from[1], len(tar_voc)):
174 1
                if tar_voc[i] not in self._vowels:
175 1
                    discount_from[1] = i
176 1
                    break
177
            else:
178 1
                discount_from[1] += 1
179
180 1
        elif isinstance(self._discount_from, int):
181 1
            discount_from = [self._discount_from, self._discount_from]
182
        else:
183 1
            discount_from = [1, 1]
184
185 1
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
186 1
        for i in range(1, src_len + 1):
187 1
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(
188
                max(0, i - discount_from[0])
189
            )
190 1
        for j in range(1, tar_len + 1):
191 1
            d_mat[0, j] = d_mat[0, j - 1] + self._cost(
192
                max(0, j - discount_from[1])
193
            )
194
195 1
        for i in range(src_len):
196 1
            i_extend = self._cost(max(0, i - discount_from[0]))
197 1
            for j in range(tar_len):
198 1
                cost = min(i_extend, self._cost(max(0, j - discount_from[1])))
199 1
                d_mat[i + 1, j + 1] = min(
200
                    d_mat[i + 1, j] + cost,  # ins
201
                    d_mat[i, j + 1] + cost,  # del
202
                    d_mat[i, j] + (cost if src[i] != tar[j] else 0),  # sub/==
203
                )
204
205 1
                if self._mode == 'osa':
206 1
                    if (
207
                        i + 1 > 1
208
                        and j + 1 > 1
209
                        and src[i] == tar[j - 1]
210
                        and src[i - 1] == tar[j]
211
                    ):
212
                        # transposition
213 1
                        d_mat[i + 1, j + 1] = min(
214
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
215
                        )
216
217 1
        return d_mat
218
219 1 View Code Duplication
    def alignment(self, src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
220
        """Return the Levenshtein alignment of two strings.
221
222
        Parameters
223
        ----------
224
        src : str
225
            Source string for comparison
226
        tar : str
227
            Target string for comparison
228
229
        Returns
230
        -------
231
        tuple
232
            A tuple containing the Levenshtein distance and the two strings,
233
            aligned.
234
235
        Examples
236
        --------
237
        >>> cmp = DiscountedLevenshtein()
238
        >>> cmp.alignment('cat', 'hat')
239
        (1.0, 'cat', 'hat')
240
        >>> cmp.alignment('Niall', 'Neil')
241
        (2.526064024369237, 'N-iall', 'Neil--')
242
        >>> cmp.alignment('aluminum', 'Catalan')
243
        (5.053867269967515, '-aluminum', 'Catalan--')
244
        >>> cmp.alignment('ATCG', 'TAGC')
245
        (2.594032108779918, 'ATCG-', '-TAGC')
246
247
        >>> cmp = DiscountedLevenshtein(mode='osa')
248
        >>> cmp.alignment('ATCG', 'TAGC')
249
        (1.7482385137517997, 'ATCG', 'TAGC')
250
        >>> cmp.alignment('ACTG', 'TAGC')
251
        (3.342270622531718, '-ACTG', 'TAGC-')
252
253
254
        .. versionadded:: 0.4.1
255
256
        """
257 1
        d_mat = self._alignment_matrix(src, tar)
258
259 1
        src_aligned = []
260 1
        tar_aligned = []
261
262 1
        src_pos = len(src)
263 1
        tar_pos = len(tar)
264
265 1
        distance = d_mat[src_pos, tar_pos]
266
267 1
        while src_pos and tar_pos:
268 1
            up = d_mat[src_pos, tar_pos - 1]
269 1
            left = d_mat[src_pos - 1, tar_pos]
270 1
            diag = d_mat[src_pos - 1, tar_pos - 1]
271
272 1
            if diag <= min(up, left):
273 1
                src_pos -= 1
274 1
                tar_pos -= 1
275 1
                src_aligned.append(src[src_pos])
276 1
                tar_aligned.append(tar[tar_pos])
277 1
            elif up <= left:
278 1
                tar_pos -= 1
279 1
                src_aligned.append('-')
280 1
                tar_aligned.append(tar[tar_pos])
281
            else:
282 1
                src_pos -= 1
283 1
                src_aligned.append(src[src_pos])
284 1
                tar_aligned.append('-')
285 1
        while tar_pos:
286 1
            tar_pos -= 1
287 1
            tar_aligned.append(tar[tar_pos])
288 1
            src_aligned.append('-')
289 1
        while src_pos:
290 1
            src_pos -= 1
291 1
            src_aligned.append(src[src_pos])
292 1
            tar_aligned.append('-')
293
294 1
        return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])
295
296 1
    def dist_abs(self, src, tar):
297
        """Return the Levenshtein distance between two strings.
298
299
        Parameters
300
        ----------
301
        src : str
302
            Source string for comparison
303
        tar : str
304
            Target string for comparison
305
306
        Returns
307
        -------
308
        int (may return a float if cost has float values)
309
            The Levenshtein distance between src & tar
310
311
        Examples
312
        --------
313
        >>> cmp = DiscountedLevenshtein()
314
        >>> cmp.dist_abs('cat', 'hat')
315
        1
316
        >>> cmp.dist_abs('Niall', 'Neil')
317
        2.526064024369237
318
        >>> cmp.dist_abs('aluminum', 'Catalan')
319
        5.053867269967515
320
        >>> cmp.dist_abs('ATCG', 'TAGC')
321
        2.594032108779918
322
323
        >>> cmp = DiscountedLevenshtein(mode='osa')
324
        >>> cmp.dist_abs('ATCG', 'TAGC')
325
        1.7482385137517997
326
        >>> cmp.dist_abs('ACTG', 'TAGC')
327
        3.342270622531718
328
329
330
        .. versionadded:: 0.4.1
331
332
        """
333 1
        src_len = len(src)
334 1
        tar_len = len(tar)
335
336 1
        if src == tar:
337 1
            return 0
338
339 1
        if isinstance(self._discount_from, int):
340 1
            discount_from = self._discount_from
341
        else:
342 1
            discount_from = 1
343
344 1
        if not src:
345 1
            return sum(
346
                self._cost(max(0, pos - discount_from))
347
                for pos in range(tar_len)
348
            )
349 1
        if not tar:
350 1
            return sum(
351
                self._cost(max(0, pos - discount_from))
352
                for pos in range(src_len)
353
            )
354
355 1
        d_mat = self._alignment_matrix(src, tar)
356
357 1
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
358 1
            return int(d_mat[src_len, tar_len])
359
        else:
360 1
            return d_mat[src_len, tar_len]
361
362 1
    def dist(self, src, tar):
363
        """Return the normalized Levenshtein distance between two strings.
364
365
        The Levenshtein distance is normalized by dividing the Levenshtein
366
        distance (calculated by any of the three supported methods) by the
367
        greater of the number of characters in src times the cost of a delete
368
        and the number of characters in tar times the cost of an insert.
369
        For the case in which all operations have :math:`cost = 1`, this is
370
        equivalent to the greater of the length of the two strings src & tar.
371
372
        Parameters
373
        ----------
374
        src : str
375
            Source string for comparison
376
        tar : str
377
            Target string for comparison
378
379
        Returns
380
        -------
381
        float
382
            The normalized Levenshtein distance between src & tar
383
384
        Examples
385
        --------
386
        >>> cmp = DiscountedLevenshtein()
387
        >>> cmp.dist('cat', 'hat')
388
        0.3513958291799864
389
        >>> cmp.dist('Niall', 'Neil')
390
        0.5909885886270658
391
        >>> cmp.dist('aluminum', 'Catalan')
392
        0.8348163322045603
393
        >>> cmp.dist('ATCG', 'TAGC')
394
        0.7217609721523955
395
396
397
        .. versionadded:: 0.4.1
398
399
        """
400 1
        if src == tar:
401 1
            return 0
402
403 1
        if isinstance(self._discount_from, int):
404 1
            discount_from = self._discount_from
405
        else:
406 1
            discount_from = 1
407
408 1
        src_len = len(src)
409 1
        tar_len = len(tar)
410
411 1
        normalize_term = self._normalizer(
412
            [
413
                sum(
414
                    self._cost(max(0, pos - discount_from))
415
                    for pos in range(src_len)
416
                ),
417
                sum(
418
                    self._cost(max(0, pos - discount_from))
419
                    for pos in range(tar_len)
420
                ),
421
            ]
422
        )
423
424 1
        return self.dist_abs(src, tar) / normalize_term
425
426
427
if __name__ == '__main__':
428
    import doctest
429
430
    doctest.testmod()
431