abydos.distance._discounted_levenshtein.DiscountedLevenshtein.dist() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

DiscountedLevenshtein.dist() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.distance._discounted_levenshtein

Complexity

Conditions

Size

Total Lines	63
Code Lines	17

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	11
CRAP Score	3

Importance

Changes

Metric	Value
eloc	17
dl	0
loc	63
ccs	11
cts	11
cp	1
rs	9.55
c	0
b	0
f	0
cc	3
nop	3
crap	3

How to fix Long Method

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._discounted_levenshtein.

Discounted Levenshtein edit distance
"""

from math import log
from typing import Any, Callable, List, Tuple, Union, cast

import numpy as np

from ._levenshtein import Levenshtein

__all__ = ['DiscountedLevenshtein']


class DiscountedLevenshtein(Levenshtein):
    """Discounted Levenshtein distance.

    This is a variant of Levenshtein distance for which edits later in a string
    have discounted cost, on the theory that earlier edits are less likely
    than later ones.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self,
        mode: str = 'lev',
        normalizer: Callable[[List[float]], float] = max,
        discount_from: Union[int, str] = 1,
        discount_func: Union[str, Callable[[float], float]] = 'log',
        vowels: str = 'aeiou',
        **kwargs: Any
    ) -> None:
        """Initialize DiscountedLevenshtein instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the discounted Levenshtein distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        discount_from : int or str
            If an int is supplied, this is the first character whose edit cost
            will be discounted. If the str ``coda`` is supplied, discounting
            will start with the first non-vowel after the first vowel (the
            first syllable coda).
        discount_func : str or function
            The two supported str arguments are ``log``, for a logarithmic
            discount function, and ``exp`` for a exponential discount function.
            See notes below for information on how to supply your own
            discount function.
        vowels : str
            These are the letters to consider as vowels when discount_from is
            set to ``coda``. It defaults to the English vowels 'aeiou', but
            it would be reasonable to localize this to other languages or to
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
        **kwargs
            Arbitrary keyword arguments

        Notes
        -----
        This class is highly experimental and will need additional tuning.

        The discount function can be passed as a callable function. It should
        expect an integer as its only argument and return a float, ideally
        less than or equal to 1.0. The argument represents the degree of
        discounting to apply.


        .. versionadded:: 0.4.1

        """
        super(DiscountedLevenshtein, self).__init__(**kwargs)
        self._mode = mode
        self._normalizer = normalizer
        self._discount_from = discount_from
        self._vowels = set(vowels.lower())
        if callable(discount_func):
            self._discount_func = discount_func
        elif discount_func == 'exp':
            self._discount_func = self._exp_discount
        else:
            self._discount_func = self._log_discount

    @staticmethod
    def _log_discount(discounts: float) -> float:
        return 1 / (log(1 + discounts / 5) + 1)

    @staticmethod
    def _exp_discount(discounts: float) -> float:
        return 1 / (discounts + 1) ** 0.2

    def _alignment_matrix(
        self, src: str, tar: str, backtrace: bool = True
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """Return the Levenshtein alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        backtrace : bool
            Return the backtrace matrix as well

        Returns
        -------
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
            The alignment matrix and (optionally) the backtrace matrix


        .. versionadded:: 0.4.1

        """
        src_len = len(src)
        tar_len = len(tar)

        if self._discount_from == 'coda':
            discount_from = [0, 0]

            src_voc = src.lower()
            for i in range(len(src_voc)):
                if src_voc[i] in self._vowels:
                    discount_from[0] = i
                    break
            for i in range(discount_from[0], len(src_voc)):
                if src_voc[i] not in self._vowels:
                    discount_from[0] = i
                    break
            else:
                discount_from[0] += 1

            tar_voc = tar.lower()
            for i in range(len(tar_voc)):
                if tar_voc[i] in self._vowels:
                    discount_from[1] = i
                    break
            for i in range(discount_from[1], len(tar_voc)):
                if tar_voc[i] not in self._vowels:
                    discount_from[1] = i
                    break
            else:
                discount_from[1] += 1

        elif isinstance(self._discount_from, int):
            discount_from = [self._discount_from, self._discount_from]
        else:
            discount_from = [1, 1]

        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
        if backtrace:
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._discount_func(
                max(0, i - discount_from[0])
            )
            if backtrace:
                trace_mat[i, 0] = 1
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._discount_func(
                max(0, j - discount_from[1])
            )
            if backtrace:
                trace_mat[0, j] = 0
        for i in range(src_len):
            i_extend = self._discount_func(max(0, i - discount_from[0]))
            for j in range(tar_len):
                traces = ((i + 1, j), (i, j + 1), (i, j))
                cost = min(
                    i_extend, self._discount_func(max(0, j - discount_from[1]))
                )
                opts = (
                    d_mat[traces[0]] + cost,  # ins
                    d_mat[traces[1]] + cost,  # del
                    d_mat[traces[2]]
                    + (cost if src[i] != tar[j] else 0),  # sub/==
                )
                d_mat[i + 1, j + 1] = min(opts)
                if backtrace:
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))

                if self._mode == 'osa':
                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
                        )
                        if backtrace:
                            trace_mat[i + 1, j + 1] = 2
        if backtrace:
            return d_mat, trace_mat

        return d_mat

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float (may return a float if cost has float values)
            The Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        2.526064024369237
        >>> cmp.dist_abs('aluminum', 'Catalan')
        5.053867269967515
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.594032108779918

        >>> cmp = DiscountedLevenshtein(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.7482385137517997
        >>> cmp.dist_abs('ACTG', 'TAGC')
        3.342270622531718


        .. versionadded:: 0.4.1

        """
        src_len = len(src)
        tar_len = len(tar)

        if src == tar:
            return 0.0

        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1

        if not src:
            return sum(
                self._discount_func(max(0, pos - discount_from))
                for pos in range(tar_len)
            )
        if not tar:
            return sum(
                self._discount_func(max(0, pos - discount_from))
                for pos in range(src_len)
            )

        d_mat = cast(
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
        )

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return cast(float, d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Levenshtein distance between two strings.

        The Levenshtein distance is normalized by dividing the Levenshtein
        distance (calculated by any of the three supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist('cat', 'hat')
        0.3513958291799864
        >>> cmp.dist('Niall', 'Neil')
        0.5909885886270658
        >>> cmp.dist('aluminum', 'Catalan')
        0.8348163322045603
        >>> cmp.dist('ATCG', 'TAGC')
        0.7217609721523955


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0

        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1

        src_len = len(src)
        tar_len = len(tar)

        normalize_term = self._normalizer(
            [
                sum(
                    self._discount_func(max(0, pos - discount_from))
                    for pos in range(src_len)
                ),
                sum(
                    self._discount_func(max(0, pos - discount_from))
                    for pos in range(tar_len)
                ),
            ]
        )

        return self.dist_abs(src, tar) / normalize_term


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# Copyright 2019-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.distance._discounted_levenshtein.
18
19	1	Discounted Levenshtein edit distance
20		"""
21
22		from math import log
23		from typing import Any, Callable, List, Tuple, Union, cast
24	1
25		import numpy as np
26
27		from ._levenshtein import Levenshtein
28
29		__all__ = ['DiscountedLevenshtein']
30
31	1
32		class DiscountedLevenshtein(Levenshtein):
33	1	"""Discounted Levenshtein distance.
34
35	1	This is a variant of Levenshtein distance for which edits later in a string
36		have discounted cost, on the theory that earlier edits are less likely
37	1	than later ones.
38
39	1	.. versionadded:: 0.4.1
40		"""
41
42	1	def __init__(
43		self,
44		mode: str = 'lev',
45		normalizer: Callable[[List[float]], float] = max,
46		discount_from: Union[int, str] = 1,
47		discount_func: Union[str, Callable[[float], float]] = 'log',
48		vowels: str = 'aeiou',
49		**kwargs: Any
50		) -> None:
51		"""Initialize DiscountedLevenshtein instance.
52	1
53		Parameters
54		----------
55		mode : str
56		Specifies a mode for computing the discounted Levenshtein distance:
57
58		- ``lev`` (default) computes the ordinary Levenshtein distance,
59		in which edits may include inserts, deletes, and
60		substitutions
61		- ``osa`` computes the Optimal String Alignment distance, in
62		which edits may include inserts, deletes, substitutions, and
63		transpositions but substrings may only be edited once
64
65		normalizer : function
66		A function that takes an list and computes a normalization term
67		by which the edit distance is divided (max by default). Another
68		good option is the sum function.
69		discount_from : int or str
70		If an int is supplied, this is the first character whose edit cost
71		will be discounted. If the str ``coda`` is supplied, discounting
72		will start with the first non-vowel after the first vowel (the
73		first syllable coda).
74		discount_func : str or function
75		The two supported str arguments are ``log``, for a logarithmic
76		discount function, and ``exp`` for a exponential discount function.
77		See notes below for information on how to supply your own
78		discount function.
79		vowels : str
80		These are the letters to consider as vowels when discount_from is
81		set to ``coda``. It defaults to the English vowels 'aeiou', but
82		it would be reasonable to localize this to other languages or to
83		add orthographic semi-vowels like 'y', 'w', and even 'h'.
84		**kwargs
85		Arbitrary keyword arguments
86
87		Notes
88		-----
89		This class is highly experimental and will need additional tuning.
90
91		The discount function can be passed as a callable function. It should
92		expect an integer as its only argument and return a float, ideally
93		less than or equal to 1.0. The argument represents the degree of
94		discounting to apply.
95
96
97		.. versionadded:: 0.4.1
98
99		"""
100		super(DiscountedLevenshtein, self).__init__(**kwargs)
101		self._mode = mode
102		self._normalizer = normalizer
103		self._discount_from = discount_from
104		self._vowels = set(vowels.lower())
105		if callable(discount_func):
106		self._discount_func = discount_func
107		elif discount_func == 'exp':
108		self._discount_func = self._exp_discount
109		else:
110	1	self._discount_func = self._log_discount
111	1
112	1	@staticmethod
113	1	def _log_discount(discounts: float) -> float:
114	1	return 1 / (log(1 + discounts / 5) + 1)
115	1
116	1	@staticmethod
117	1	def _exp_discount(discounts: float) -> float:
118	1	return 1 / (discounts + 1) ** 0.2
119
120	1	def _alignment_matrix(
121		self, src: str, tar: str, backtrace: bool = True
122	1	) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
123		"""Return the Levenshtein alignment matrix.
124	1
125		Parameters
126	1	----------
127		src : str
128	1	Source string for comparison
129		tar : str
130	1	Target string for comparison
131		backtrace : bool
132		Return the backtrace matrix as well
133
134		Returns
135		-------
136		numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
137		The alignment matrix and (optionally) the backtrace matrix
138
139
140		.. versionadded:: 0.4.1
141
142		"""
143		src_len = len(src)
144		tar_len = len(tar)
145
146		if self._discount_from == 'coda':
147		discount_from = [0, 0]
148
149		src_voc = src.lower()
150		for i in range(len(src_voc)):
151	1	if src_voc[i] in self._vowels:
152	1	discount_from[0] = i
153		break
154	1	for i in range(discount_from[0], len(src_voc)):
155	1	if src_voc[i] not in self._vowels:
156		discount_from[0] = i
157	1	break
158	1	else:
159	1	discount_from[0] += 1
160	1
161	1	tar_voc = tar.lower()
162	1	for i in range(len(tar_voc)):
163	1	if tar_voc[i] in self._vowels:
164	1	discount_from[1] = i
165	1	break
166		for i in range(discount_from[1], len(tar_voc)):
167	1	if tar_voc[i] not in self._vowels:
168		discount_from[1] = i
169	1	break
170	1	else:
171	1	discount_from[1] += 1
172	1
173	1	elif isinstance(self._discount_from, int):
174	1	discount_from = [self._discount_from, self._discount_from]
175	1	else:
176	1	discount_from = [1, 1]
177	1
178		d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
179	1	if backtrace:
180		trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
181	1	for i in range(1, src_len + 1):
182	1	d_mat[i, 0] = d_mat[i - 1, 0] + self._discount_func(
183		max(0, i - discount_from[0])
184	1	)
185		if backtrace:
186	1	trace_mat[i, 0] = 1
187	1	for j in range(1, tar_len + 1):
188	1	d_mat[0, j] = d_mat[0, j - 1] + self._discount_func(
189	1	max(0, j - discount_from[1])
190	1	)
191		if backtrace:
192		trace_mat[0, j] = 0
193	1	for i in range(src_len):
194	1	i_extend = self._discount_func(max(0, i - discount_from[0]))
195	1	for j in range(tar_len):
196	1	traces = ((i + 1, j), (i, j + 1), (i, j))
197		cost = min(
198		i_extend, self._discount_func(max(0, j - discount_from[1]))
199	1	)
200	1	opts = (
201	1	d_mat[traces[0]] + cost, # ins
202	1	d_mat[traces[1]] + cost, # del
203	1	d_mat[traces[2]]
204	1	+ (cost if src[i] != tar[j] else 0), # sub/==
205	1	)
206	1	d_mat[i + 1, j + 1] = min(opts)
207		if backtrace:
208		trace_mat[i + 1, j + 1] = int(np.argmin(opts))
209
210		if self._mode == 'osa':
211		if (
212	1	i + 1 > 1
213	1	and j + 1 > 1
214	1	and src[i] == tar[j - 1]
215		and src[i - 1] == tar[j]
216	1	):
217	1	# transposition
218		d_mat[i + 1, j + 1] = min(
219		d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
220		)
221		if backtrace:
222		trace_mat[i + 1, j + 1] = 2
223		if backtrace:
224	1	return d_mat, trace_mat
		0 ignored issues – show introduced 2020-01-06 02:51 UTC by Report Bug Copy Issue Report The variable `trace_mat` does not seem to be defined in case `backtrace` on line `179` is `False`. Are you sure this can never be the case? Loading history...
225		return d_mat
226
227	1	def dist_abs(self, src: str, tar: str) -> float:
228	1	"""Return the Levenshtein distance between two strings.
229	1
230	1	Parameters
231	1	----------
232		src : str
233	1	Source string for comparison
234		tar : str
235		Target string for comparison
236
237		Returns
238		-------
239		float (may return a float if cost has float values)
240		The Levenshtein distance between src & tar
241
242		Examples
243		--------
244		>>> cmp = DiscountedLevenshtein()
245		>>> cmp.dist_abs('cat', 'hat')
246		1
247		>>> cmp.dist_abs('Niall', 'Neil')
248		2.526064024369237
249		>>> cmp.dist_abs('aluminum', 'Catalan')
250		5.053867269967515
251		>>> cmp.dist_abs('ATCG', 'TAGC')
252		2.594032108779918
253
254		>>> cmp = DiscountedLevenshtein(mode='osa')
255		>>> cmp.dist_abs('ATCG', 'TAGC')
256		1.7482385137517997
257		>>> cmp.dist_abs('ACTG', 'TAGC')
258		3.342270622531718
259
260
261		.. versionadded:: 0.4.1
262
263		"""
264		src_len = len(src)
265		tar_len = len(tar)
266
267		if src == tar:
268		return 0.0
269
270	1	if isinstance(self._discount_from, int):
271	1	discount_from = self._discount_from
272		else:
273	1	discount_from = 1
274	1
275		if not src:
276	1	return sum(
277	1	self._discount_func(max(0, pos - discount_from))
278		for pos in range(tar_len)
279	1	)
280		if not tar:
281	1	return sum(
282	1	self._discount_func(max(0, pos - discount_from))
283		for pos in range(src_len)
284		)
285
286	1	d_mat = cast(
287	1	np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
288		)
289
290		if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
291		return int(d_mat[src_len, tar_len])
292	1	else:
293		return cast(float, d_mat[src_len, tar_len])
294	1
295	1	def dist(self, src: str, tar: str) -> float:
296		"""Return the normalized Levenshtein distance between two strings.
297	1
298		The Levenshtein distance is normalized by dividing the Levenshtein
299	1	distance (calculated by any of the three supported methods) by the
300		greater of the number of characters in src times the cost of a delete
301		and the number of characters in tar times the cost of an insert.
302		For the case in which all operations have :math:`cost = 1`, this is
303		equivalent to the greater of the length of the two strings src & tar.
304
305		Parameters
306		----------
307		src : str
308		Source string for comparison
309		tar : str
310		Target string for comparison
311
312		Returns
313		-------
314		float
315		The normalized Levenshtein distance between src & tar
316
317		Examples
318		--------
319		>>> cmp = DiscountedLevenshtein()
320		>>> cmp.dist('cat', 'hat')
321		0.3513958291799864
322		>>> cmp.dist('Niall', 'Neil')
323		0.5909885886270658
324		>>> cmp.dist('aluminum', 'Catalan')
325		0.8348163322045603
326		>>> cmp.dist('ATCG', 'TAGC')
327		0.7217609721523955
328
329
330		.. versionadded:: 0.4.1
331
332		"""
333		if src == tar:
334		return 0
335
336		if isinstance(self._discount_from, int):
337	1	discount_from = self._discount_from
338	1	else:
339		discount_from = 1
340	1
341	1	src_len = len(src)
342		tar_len = len(tar)
343	1
344		normalize_term = self._normalizer(
345	1	[
346	1	sum(
347		self._discount_func(max(0, pos - discount_from))
348	1	for pos in range(src_len)
349		),
350		sum(
351		self._discount_func(max(0, pos - discount_from))
352		for pos in range(tar_len)
353		),
354		]
355		)
356
357		return self.dist_abs(src, tar) / normalize_term
358
359
360		if __name__ == '__main__':
361	1	import doctest
362
363		doctest.testmod()
364

chrislit / abydos

DiscountedLevenshtein.dist() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like

DiscountedLevenshtein.dist() A
last analyzed 2020-12-31 20:10 UTC