abydos.distance._discounted_levenshtein - Code Metrics - Inspection of "Merge pull request #240 from chrislit/0.4.1" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 643512...2b6b3e )

by Chris

created 2020-01-06 02:41 UTC

abydos.distance._discounted_levenshtein B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	368
Duplicated Lines	3.53 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	43
eloc	144
dl	13
loc	368
ccs	105
cts	105
cp	1
rs	8.96
c	0
b	0
f	0

6 Methods

Rating	Name	Duplication	Size	Complexity
F	DiscountedLevenshtein._alignment_matrix()	13	102	29
A	DiscountedLevenshtein.__init__()	0	69	3
A	DiscountedLevenshtein.dist()	0	63	3
A	DiscountedLevenshtein._log_discount()	0	3	1
A	DiscountedLevenshtein._exp_discount()	0	3	1
B	DiscountedLevenshtein.dist_abs()	0	65	6

How to fix Duplicated Code Complexity

# -*- coding: utf-8 -*-

# Copyright 2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._discounted_levenshtein.

Discounted Levenshtein edit distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from math import log

import numpy as np

from six.moves import range

from ._levenshtein import Levenshtein

__all__ = ['DiscountedLevenshtein']


class DiscountedLevenshtein(Levenshtein):
    """Discounted Levenshtein distance.

    This is a variant of Levenshtein distance for which edits later in a string
    have discounted cost, on the theory that earlier edits are less likely
    than later ones.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self,
        mode='lev',
        normalizer=max,
        discount_from=1,
        discount_func='log',
        vowels='aeiou',
        **kwargs
    ):
        """Initialize DiscountedLevenshtein instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the discounted Levenshtein distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        discount_from : int or str
            If an int is supplied, this is the first character whose edit cost
            will be discounted. If the str ``coda`` is supplied, discounting
            will start with the first non-vowel after the first vowel (the
            first syllable coda).
        discount_func : str or function
            The two supported str arguments are ``log``, for a logarithmic
            discount function, and ``exp`` for a exponential discount function.
            See notes below for information on how to supply your own
            discount function.
        vowels : str
            These are the letters to consider as vowels when discount_from is
            set to ``coda``. It defaults to the English vowels 'aeiou', but
            it would be reasonable to localize this to other languages or to
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
        **kwargs
            Arbitrary keyword arguments

        Notes
        -----
        This class is highly experimental and will need additional tuning.

        The discount function can be passed as a callable function. It should
        expect an integer as its only argument and return a float, ideally
        less than or equal to 1.0. The argument represents the degree of
        discounting to apply.


        .. versionadded:: 0.4.1

        """
        super(DiscountedLevenshtein, self).__init__(**kwargs)
        self._mode = mode
        self._normalizer = normalizer
        self._discount_from = discount_from
        self._vowels = set(vowels.lower())
        if callable(discount_func):
            self._cost = discount_func
        elif discount_func == 'exp':
            self._cost = self._exp_discount
        else:
            self._cost = self._log_discount

    @staticmethod
    def _log_discount(discounts):
        return 1 / (log(1 + discounts / 5) + 1)

    @staticmethod
    def _exp_discount(discounts):
        return 1 / (discounts + 1) ** 0.2

    def _alignment_matrix(self, src, tar, backtrace=True):
        """Return the Levenshtein alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        backtrace : bool
            Return the backtrace matrix as well

        Returns
        -------
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
            The alignment matrix and (optionally) the backtrace matrix


        .. versionadded:: 0.4.1

        """
        src_len = len(src)
        tar_len = len(tar)

        if self._discount_from == 'coda':
            discount_from = [0, 0]

            src_voc = src.lower()
            for i in range(len(src_voc)):
                if src_voc[i] in self._vowels:
                    discount_from[0] = i
                    break
            for i in range(discount_from[0], len(src_voc)):
                if src_voc[i] not in self._vowels:
                    discount_from[0] = i
                    break
            else:
                discount_from[0] += 1

            tar_voc = tar.lower()
            for i in range(len(tar_voc)):
                if tar_voc[i] in self._vowels:
                    discount_from[1] = i
                    break
            for i in range(discount_from[1], len(tar_voc)):
                if tar_voc[i] not in self._vowels:
                    discount_from[1] = i
                    break
            else:
                discount_from[1] += 1

        elif isinstance(self._discount_from, int):
            discount_from = [self._discount_from, self._discount_from]
        else:
            discount_from = [1, 1]

        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
        if backtrace:
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(
                max(0, i - discount_from[0])
            )
            if backtrace:
                trace_mat[i, 0] = 1
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._cost(
                max(0, j - discount_from[1])
            )
            if backtrace:
                trace_mat[0, j] = 0
        for i in range(src_len):
            i_extend = self._cost(max(0, i - discount_from[0]))
            for j in range(tar_len):
                traces = ((i + 1, j), (i, j + 1), (i, j))
                cost = min(i_extend, self._cost(max(0, j - discount_from[1])))
                opts = (
                    d_mat[traces[0]] + cost,  # ins
                    d_mat[traces[1]] + cost,  # del
                    d_mat[traces[2]]
                    + (cost if src[i] != tar[j] else 0),  # sub/==
                )
                d_mat[i + 1, j + 1] = min(opts)
                if backtrace:
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))

                if self._mode == 'osa':

                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
                        )
                        if backtrace:
                            trace_mat[i + 1, j + 1] = 2
        if backtrace:
            return d_mat, trace_mat

        return d_mat

    def dist_abs(self, src, tar):
        """Return the Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float (may return a float if cost has float values)
            The Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        2.526064024369237
        >>> cmp.dist_abs('aluminum', 'Catalan')
        5.053867269967515
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.594032108779918

        >>> cmp = DiscountedLevenshtein(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.7482385137517997
        >>> cmp.dist_abs('ACTG', 'TAGC')
        3.342270622531718


        .. versionadded:: 0.4.1

        """
        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 0.0

        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1

        if not src:
            return sum(
                self._cost(max(0, pos - discount_from))
                for pos in range(tar_len)
            )
        if not tar:
            return sum(
                self._cost(max(0, pos - discount_from))
                for pos in range(src_len)
            )

        d_mat = self._alignment_matrix(src, tar, backtrace=False)

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return d_mat[src_len, tar_len]

    def dist(self, src, tar):
        """Return the normalized Levenshtein distance between two strings.

        The Levenshtein distance is normalized by dividing the Levenshtein
        distance (calculated by any of the three supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist('cat', 'hat')
        0.3513958291799864
        >>> cmp.dist('Niall', 'Neil')
        0.5909885886270658
        >>> cmp.dist('aluminum', 'Catalan')
        0.8348163322045603
        >>> cmp.dist('ATCG', 'TAGC')
        0.7217609721523955


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0

        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1

        src_len = len(src)
        tar_len = len(tar)

        normalize_term = self._normalizer(
            [
                sum(
                    self._cost(max(0, pos - discount_from))
                    for pos in range(src_len)
                ),
                sum(
                    self._cost(max(0, pos - discount_from))
                    for pos in range(tar_len)
                ),
            ]
        )

        return self.dist_abs(src, tar) / normalize_term


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1			# -- coding: utf-8 --
2
3			# Copyright 2019 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.distance._discounted_levenshtein.
20
21			Discounted Levenshtein edit distance
22			"""
23
24	1		from __future__ import (
25			absolute_import,
26			division,
27			print_function,
28			unicode_literals,
29			)
30
31	1		from math import log
32
33	1		import numpy as np
34
35	1		from six.moves import range
36
37	1		from ._levenshtein import Levenshtein
38
39	1		__all__ = ['DiscountedLevenshtein']
40
41
42	1		class DiscountedLevenshtein(Levenshtein):
43			"""Discounted Levenshtein distance.
44
45			This is a variant of Levenshtein distance for which edits later in a string
46			have discounted cost, on the theory that earlier edits are less likely
47			than later ones.
48
49			.. versionadded:: 0.4.1
50			"""
51
52	1		def __init__(
53			self,
54			mode='lev',
55			normalizer=max,
56			discount_from=1,
57			discount_func='log',
58			vowels='aeiou',
59			**kwargs
60			):
61			"""Initialize DiscountedLevenshtein instance.
62
63			Parameters
64			----------
65			mode : str
66			Specifies a mode for computing the discounted Levenshtein distance:
67
68			- ``lev`` (default) computes the ordinary Levenshtein distance,
69			in which edits may include inserts, deletes, and
70			substitutions
71			- ``osa`` computes the Optimal String Alignment distance, in
72			which edits may include inserts, deletes, substitutions, and
73			transpositions but substrings may only be edited once
74
75			normalizer : function
76			A function that takes an list and computes a normalization term
77			by which the edit distance is divided (max by default). Another
78			good option is the sum function.
79			discount_from : int or str
80			If an int is supplied, this is the first character whose edit cost
81			will be discounted. If the str ``coda`` is supplied, discounting
82			will start with the first non-vowel after the first vowel (the
83			first syllable coda).
84			discount_func : str or function
85			The two supported str arguments are ``log``, for a logarithmic
86			discount function, and ``exp`` for a exponential discount function.
87			See notes below for information on how to supply your own
88			discount function.
89			vowels : str
90			These are the letters to consider as vowels when discount_from is
91			set to ``coda``. It defaults to the English vowels 'aeiou', but
92			it would be reasonable to localize this to other languages or to
93			add orthographic semi-vowels like 'y', 'w', and even 'h'.
94			**kwargs
95			Arbitrary keyword arguments
96
97			Notes
98			-----
99			This class is highly experimental and will need additional tuning.
100
101			The discount function can be passed as a callable function. It should
102			expect an integer as its only argument and return a float, ideally
103			less than or equal to 1.0. The argument represents the degree of
104			discounting to apply.
105
106
107			.. versionadded:: 0.4.1
108
109			"""
110	1		super(DiscountedLevenshtein, self).__init__(**kwargs)
111	1		self._mode = mode
112	1		self._normalizer = normalizer
113	1		self._discount_from = discount_from
114	1		self._vowels = set(vowels.lower())
115	1		if callable(discount_func):
116	1		self._cost = discount_func
117	1		elif discount_func == 'exp':
118	1		self._cost = self._exp_discount
119			else:
120	1		self._cost = self._log_discount
121
122	1		@staticmethod
123			def _log_discount(discounts):
124	1		return 1 / (log(1 + discounts / 5) + 1)
125
126	1		@staticmethod
127			def _exp_discount(discounts):
128	1		return 1 / (discounts + 1) ** 0.2
129
130	1		def _alignment_matrix(self, src, tar, backtrace=True):
131			"""Return the Levenshtein alignment matrix.
132
133			Parameters
134			----------
135			src : str
136			Source string for comparison
137			tar : str
138			Target string for comparison
139			backtrace : bool
140			Return the backtrace matrix as well
141
142			Returns
143			-------
144			numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
145			The alignment matrix and (optionally) the backtrace matrix
146
147
148			.. versionadded:: 0.4.1
149
150			"""
151	1		src_len = len(src)
152	1		tar_len = len(tar)
153
154	1		if self._discount_from == 'coda':
155	1		discount_from = [0, 0]
156
157	1		src_voc = src.lower()
158	1		for i in range(len(src_voc)):
159	1		if src_voc[i] in self._vowels:
160	1		discount_from[0] = i
161	1		break
162	1		for i in range(discount_from[0], len(src_voc)):
163	1		if src_voc[i] not in self._vowels:
164	1		discount_from[0] = i
165	1		break
166			else:
167	1		discount_from[0] += 1
168
169	1		tar_voc = tar.lower()
170	1		for i in range(len(tar_voc)):
171	1		if tar_voc[i] in self._vowels:
172	1		discount_from[1] = i
173	1		break
174	1		for i in range(discount_from[1], len(tar_voc)):
175	1		if tar_voc[i] not in self._vowels:
176	1		discount_from[1] = i
177	1		break
178			else:
179	1		discount_from[1] += 1
180
181	1		elif isinstance(self._discount_from, int):
182	1		discount_from = [self._discount_from, self._discount_from]
183			else:
184	1		discount_from = [1, 1]
185
186	1		d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float)
187	1		if backtrace:
188	1		trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
189	1		for i in range(1, src_len + 1):
190	1		d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(
191			max(0, i - discount_from[0])
192			)
193	1		if backtrace:
194	1		trace_mat[i, 0] = 1
195	1		for j in range(1, tar_len + 1):
196	1		d_mat[0, j] = d_mat[0, j - 1] + self._cost(
197			max(0, j - discount_from[1])
198			)
199	1		if backtrace:
200	1		trace_mat[0, j] = 0
201	1		for i in range(src_len):
202	1		i_extend = self._cost(max(0, i - discount_from[0]))
203	1		for j in range(tar_len):
204	1		traces = ((i + 1, j), (i, j + 1), (i, j))
205	1		cost = min(i_extend, self._cost(max(0, j - discount_from[1])))
206	1		opts = (
207			d_mat[traces[0]] + cost, # ins
208			d_mat[traces[1]] + cost, # del
209			d_mat[traces[2]]
210			+ (cost if src[i] != tar[j] else 0), # sub/==
211			)
212	1		d_mat[i + 1, j + 1] = min(opts)
213	1		if backtrace:
214	1		trace_mat[i + 1, j + 1] = int(np.argmin(opts))
215
216	1	View Code Duplication	if self._mode == 'osa':
			0 ignored issues – show Duplication introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
217	1		if (
218			i + 1 > 1
219			and j + 1 > 1
220			and src[i] == tar[j - 1]
221			and src[i - 1] == tar[j]
222			):
223			# transposition
224	1		d_mat[i + 1, j + 1] = min(
225			d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
226			)
227	1		if backtrace:
228	1		trace_mat[i + 1, j + 1] = 2
229	1		if backtrace:
230	1		return d_mat, trace_mat
			0 ignored issues – show introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report The variable `trace_mat` does not seem to be defined in case `backtrace` on line `187` is `False`. Are you sure this can never be the case? Loading history...
231	1		return d_mat
232
233	1		def dist_abs(self, src, tar):
234			"""Return the Levenshtein distance between two strings.
235
236			Parameters
237			----------
238			src : str
239			Source string for comparison
240			tar : str
241			Target string for comparison
242
243			Returns
244			-------
245			float (may return a float if cost has float values)
246			The Levenshtein distance between src & tar
247
248			Examples
249			--------
250			>>> cmp = DiscountedLevenshtein()
251			>>> cmp.dist_abs('cat', 'hat')
252			1
253			>>> cmp.dist_abs('Niall', 'Neil')
254			2.526064024369237
255			>>> cmp.dist_abs('aluminum', 'Catalan')
256			5.053867269967515
257			>>> cmp.dist_abs('ATCG', 'TAGC')
258			2.594032108779918
259
260			>>> cmp = DiscountedLevenshtein(mode='osa')
261			>>> cmp.dist_abs('ATCG', 'TAGC')
262			1.7482385137517997
263			>>> cmp.dist_abs('ACTG', 'TAGC')
264			3.342270622531718
265
266
267			.. versionadded:: 0.4.1
268
269			"""
270	1		src_len = len(src)
271	1		tar_len = len(tar)
272
273	1		if src == tar:
274	1		return 0.0
275
276	1		if isinstance(self._discount_from, int):
277	1		discount_from = self._discount_from
278			else:
279	1		discount_from = 1
280
281	1		if not src:
282	1		return sum(
283			self._cost(max(0, pos - discount_from))
284			for pos in range(tar_len)
285			)
286	1		if not tar:
287	1		return sum(
288			self._cost(max(0, pos - discount_from))
289			for pos in range(src_len)
290			)
291
292	1		d_mat = self._alignment_matrix(src, tar, backtrace=False)
293
294	1		if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
295	1		return int(d_mat[src_len, tar_len])
296			else:
297	1		return d_mat[src_len, tar_len]
298
299	1		def dist(self, src, tar):
300			"""Return the normalized Levenshtein distance between two strings.
301
302			The Levenshtein distance is normalized by dividing the Levenshtein
303			distance (calculated by any of the three supported methods) by the
304			greater of the number of characters in src times the cost of a delete
305			and the number of characters in tar times the cost of an insert.
306			For the case in which all operations have :math:`cost = 1`, this is
307			equivalent to the greater of the length of the two strings src & tar.
308
309			Parameters
310			----------
311			src : str
312			Source string for comparison
313			tar : str
314			Target string for comparison
315
316			Returns
317			-------
318			float
319			The normalized Levenshtein distance between src & tar
320
321			Examples
322			--------
323			>>> cmp = DiscountedLevenshtein()
324			>>> cmp.dist('cat', 'hat')
325			0.3513958291799864
326			>>> cmp.dist('Niall', 'Neil')
327			0.5909885886270658
328			>>> cmp.dist('aluminum', 'Catalan')
329			0.8348163322045603
330			>>> cmp.dist('ATCG', 'TAGC')
331			0.7217609721523955
332
333
334			.. versionadded:: 0.4.1
335
336			"""
337	1		if src == tar:
338	1		return 0
339
340	1		if isinstance(self._discount_from, int):
341	1		discount_from = self._discount_from
342			else:
343	1		discount_from = 1
344
345	1		src_len = len(src)
346	1		tar_len = len(tar)
347
348	1		normalize_term = self._normalizer(
349			[
350			sum(
351			self._cost(max(0, pos - discount_from))
352			for pos in range(src_len)
353			),
354			sum(
355			self._cost(max(0, pos - discount_from))
356			for pos in range(tar_len)
357			),
358			]
359			)
360
361	1		return self.dist_abs(src, tar) / normalize_term
362
363
364			if __name__ == '__main__':
365			import doctest
366
367			doctest.testmod()
368

chrislit / abydos

Push — master ( 643512...2b6b3e )

abydos.distance._discounted_levenshtein B

Complexity

Size/Duplication

Test Coverage

Importance

6 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like