Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

DiscountedLevenshtein.alignment()   B

Complexity

Conditions 7

Size

Total Lines 76
Code Lines 32

Duplication

Lines 76
Ratio 100 %

Code Coverage

Tests 32
CRAP Score 7

Importance

Changes 0
Metric Value
eloc 32
dl 76
loc 76
ccs 32
cts 32
cp 1
rs 7.712
c 0
b 0
f 0
cc 7
nop 3
crap 7

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._discounted_levenshtein.
20
21
Discounted Levenshtein edit distance
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from math import log
32
33 1
import numpy as np
34
35 1
from six.moves import range
36
37 1
from ._levenshtein import Levenshtein
38
39 1
__all__ = ['DiscountedLevenshtein']
40
41
42 1
class DiscountedLevenshtein(Levenshtein):
43
    """Discounted Levenshtein distance.
44
45
    This is a variant of Levenshtein distance for which edits later in a string
46
    have discounted cost, on the theory that earlier edits are less likely
47
    than later ones.
48
49
    .. versionadded:: 0.4.1
50
    """
51
52 1
    def __init__(
53
        self,
54
        mode='lev',
55
        normalizer=max,
56
        discount_from=1,
57
        discount_func='log',
58
        vowels='aeiou',
59
        **kwargs
60
    ):
61
        """Initialize DiscountedLevenshtein instance.
62
63
        Parameters
64
        ----------
65
        mode : str
66
            Specifies a mode for computing the discounted Levenshtein distance:
67
68
                - ``lev`` (default) computes the ordinary Levenshtein distance,
69
                  in which edits may include inserts, deletes, and
70
                  substitutions
71
                - ``osa`` computes the Optimal String Alignment distance, in
72
                  which edits may include inserts, deletes, substitutions, and
73
                  transpositions but substrings may only be edited once
74
75
        normalizer : function
76
            A function that takes an list and computes a normalization term
77
            by which the edit distance is divided (max by default). Another
78
            good option is the sum function.
79
        discount_from : int or str
80
            If an int is supplied, this is the first character whose edit cost
81
            will be discounted. If the str ``coda`` is supplied, discounting
82
            will start with the first non-vowel after the first vowel (the
83
            first syllable coda).
84
        discount_func : str or function
85
            The two supported str arguments are ``log``, for a logarithmic
86
            discount function, and ``exp`` for a exponential discount function.
87
            See notes below for information on how to supply your own
88
            discount function.
89
        vowels : str
90
            These are the letters to consider as vowels when discount_from is
91
            set to ``coda``. It defaults to the English vowels 'aeiou', but
92
            it would be reasonable to localize this to other languages or to
93
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
94
        **kwargs
95
            Arbitrary keyword arguments
96
97
        Notes
98
        -----
99
        This class is highly experimental and will need additional tuning.
100
101
        The discount function can be passed as a callable function. It should
102
        expect an integer as its only argument and return a float, ideally
103
        less than or equal to 1.0. The argument represents the degree of
104
        discounting to apply.
105
106
107
        .. versionadded:: 0.4.1
108
109
        """
110 1
        super(DiscountedLevenshtein, self).__init__(**kwargs)
111 1
        self._mode = mode
112 1
        self._normalizer = normalizer
113 1
        self._discount_from = discount_from
114 1
        self._vowels = set(vowels.lower())
115 1
        if callable(discount_func):
116 1
            self._cost = discount_func
117 1
        elif discount_func == 'exp':
118 1
            self._cost = self._exp_discount
119
        else:
120 1
            self._cost = self._log_discount
121
122 1
    @staticmethod
123
    def _log_discount(discounts):
124 1
        return 1 / (log(1 + discounts / 5) + 1)
125
126 1
    @staticmethod
127
    def _exp_discount(discounts):
128 1
        return 1 / (discounts + 1) ** 0.2
129
130 1
    def _alignment_matrix(self, src, tar, backtrace=True):
131
        """Return the Levenshtein alignment matrix.
132
133
        Parameters
134
        ----------
135
        src : str
136
            Source string for comparison
137
        tar : str
138
            Target string for comparison
139
        backtrace : bool
140
            Return the backtrace matrix as well
141
142
        Returns
143
        -------
144
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
145
            The alignment matrix and (optionally) the backtrace matrix
146
147
148
        .. versionadded:: 0.4.1
149
150
        """
151 1
        src_len = len(src)
152 1
        tar_len = len(tar)
153
154 1
        if self._discount_from == 'coda':
155 1
            discount_from = [0, 0]
156
157 1
            src_voc = src.lower()
158 1
            for i in range(len(src_voc)):
159 1
                if src_voc[i] in self._vowels:
160 1
                    discount_from[0] = i
161 1
                    break
162 1
            for i in range(discount_from[0], len(src_voc)):
163 1
                if src_voc[i] not in self._vowels:
164 1
                    discount_from[0] = i
165 1
                    break
166
            else:
167 1
                discount_from[0] += 1
168
169 1
            tar_voc = tar.lower()
170 1
            for i in range(len(tar_voc)):
171 1
                if tar_voc[i] in self._vowels:
172 1
                    discount_from[1] = i
173 1
                    break
174 1
            for i in range(discount_from[1], len(tar_voc)):
175 1
                if tar_voc[i] not in self._vowels:
176 1
                    discount_from[1] = i
177 1
                    break
178
            else:
179 1
                discount_from[1] += 1
180
181 1
        elif isinstance(self._discount_from, int):
182 1
            discount_from = [self._discount_from, self._discount_from]
183
        else:
184 1
            discount_from = [1, 1]
185
186 1
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
187 1
        if backtrace:
188 1
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
189 1
        for i in range(1, src_len + 1):
190 1
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(
191
                max(0, i - discount_from[0])
192
            )
193 1
            if backtrace:
194 1
                trace_mat[i, 0] = 1
195 1
        for j in range(1, tar_len + 1):
196 1
            d_mat[0, j] = d_mat[0, j - 1] + self._cost(
197
                max(0, j - discount_from[1])
198
            )
199 1
            if backtrace:
200 1
                trace_mat[0, j] = 0
201 1
        for i in range(src_len):
202 1
            i_extend = self._cost(max(0, i - discount_from[0]))
203 1
            for j in range(tar_len):
204 1
                traces = ((i + 1, j), (i, j + 1), (i, j))
205 1
                cost = min(i_extend, self._cost(max(0, j - discount_from[1])))
206 1
                opts = (
207
                    d_mat[traces[0]] + cost,  # ins
208
                    d_mat[traces[1]] + cost,  # del
209
                    d_mat[traces[2]]
210
                    + (cost if src[i] != tar[j] else 0),  # sub/==
211
                )
212 1
                d_mat[i + 1, j + 1] = min(opts)
213 1
                if backtrace:
214 1
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
215
216 1 View Code Duplication
                if self._mode == 'osa':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
217 1
                    if (
218
                        i + 1 > 1
219
                        and j + 1 > 1
220
                        and src[i] == tar[j - 1]
221
                        and src[i - 1] == tar[j]
222
                    ):
223
                        # transposition
224 1
                        d_mat[i + 1, j + 1] = min(
225
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
226
                        )
227 1
                        if backtrace:
228 1
                            trace_mat[i + 1, j + 1] = 2
229 1
        if backtrace:
230 1
            return d_mat, trace_mat
0 ignored issues
show
introduced by
The variable trace_mat does not seem to be defined in case backtrace on line 187 is False. Are you sure this can never be the case?
Loading history...
231 1
        return d_mat
232
233 1
    def dist_abs(self, src, tar):
234
        """Return the Levenshtein distance between two strings.
235
236
        Parameters
237
        ----------
238
        src : str
239
            Source string for comparison
240
        tar : str
241
            Target string for comparison
242
243
        Returns
244
        -------
245
        float (may return a float if cost has float values)
246
            The Levenshtein distance between src & tar
247
248
        Examples
249
        --------
250
        >>> cmp = DiscountedLevenshtein()
251
        >>> cmp.dist_abs('cat', 'hat')
252
        1
253
        >>> cmp.dist_abs('Niall', 'Neil')
254
        2.526064024369237
255
        >>> cmp.dist_abs('aluminum', 'Catalan')
256
        5.053867269967515
257
        >>> cmp.dist_abs('ATCG', 'TAGC')
258
        2.594032108779918
259
260
        >>> cmp = DiscountedLevenshtein(mode='osa')
261
        >>> cmp.dist_abs('ATCG', 'TAGC')
262
        1.7482385137517997
263
        >>> cmp.dist_abs('ACTG', 'TAGC')
264
        3.342270622531718
265
266
267
        .. versionadded:: 0.4.1
268
269
        """
270 1
        src_len = len(src)
271 1
        tar_len = len(tar)
272
273 1
        if src == tar:
274 1
            return 0.0
275
276 1
        if isinstance(self._discount_from, int):
277 1
            discount_from = self._discount_from
278
        else:
279 1
            discount_from = 1
280
281 1
        if not src:
282 1
            return sum(
283
                self._cost(max(0, pos - discount_from))
284
                for pos in range(tar_len)
285
            )
286 1
        if not tar:
287 1
            return sum(
288
                self._cost(max(0, pos - discount_from))
289
                for pos in range(src_len)
290
            )
291
292 1
        d_mat = self._alignment_matrix(src, tar, backtrace=False)
293
294 1
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
295 1
            return int(d_mat[src_len, tar_len])
296
        else:
297 1
            return d_mat[src_len, tar_len]
298
299 1
    def dist(self, src, tar):
300
        """Return the normalized Levenshtein distance between two strings.
301
302
        The Levenshtein distance is normalized by dividing the Levenshtein
303
        distance (calculated by any of the three supported methods) by the
304
        greater of the number of characters in src times the cost of a delete
305
        and the number of characters in tar times the cost of an insert.
306
        For the case in which all operations have :math:`cost = 1`, this is
307
        equivalent to the greater of the length of the two strings src & tar.
308
309
        Parameters
310
        ----------
311
        src : str
312
            Source string for comparison
313
        tar : str
314
            Target string for comparison
315
316
        Returns
317
        -------
318
        float
319
            The normalized Levenshtein distance between src & tar
320
321
        Examples
322
        --------
323
        >>> cmp = DiscountedLevenshtein()
324
        >>> cmp.dist('cat', 'hat')
325
        0.3513958291799864
326
        >>> cmp.dist('Niall', 'Neil')
327
        0.5909885886270658
328
        >>> cmp.dist('aluminum', 'Catalan')
329
        0.8348163322045603
330
        >>> cmp.dist('ATCG', 'TAGC')
331
        0.7217609721523955
332
333
334
        .. versionadded:: 0.4.1
335
336
        """
337 1
        if src == tar:
338 1
            return 0
339
340 1
        if isinstance(self._discount_from, int):
341 1
            discount_from = self._discount_from
342
        else:
343 1
            discount_from = 1
344
345 1
        src_len = len(src)
346 1
        tar_len = len(tar)
347
348 1
        normalize_term = self._normalizer(
349
            [
350
                sum(
351
                    self._cost(max(0, pos - discount_from))
352
                    for pos in range(src_len)
353
                ),
354
                sum(
355
                    self._cost(max(0, pos - discount_from))
356
                    for pos in range(tar_len)
357
                ),
358
            ]
359
        )
360
361 1
        return self.dist_abs(src, tar) / normalize_term
362
363
364
if __name__ == '__main__':
365
    import doctest
366
367
    doctest.testmod()
368