abydos.distance._damerau_levenshtein.sim_damerau() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.distance._damerau_levenshtein.sim_damerau() A

↳ Parent: abydos.distance._damerau_levenshtein

Complexity

Conditions

Size

Total Lines	34
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	3
dl	0
loc	34
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._damerau_levenshtein.

Damerau-Levenshtein distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from sys import maxsize

from numpy import int as np_int
from numpy import zeros as np_zeros

from six.moves import range

from ._distance import _Distance

# Public names of this module: the distance class plus its three
# function-style convenience wrappers defined below.
__all__ = [
    'DamerauLevenshtein',
    'damerau_levenshtein',
    'dist_damerau',
    'sim_damerau',
]


class DamerauLevenshtein(_Distance):
    """Damerau-Levenshtein distance.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
    """

    def dist_abs(self, src, tar, cost=(1, 1, 1, 1)):
        """Return the Damerau-Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 1))

        Returns
        -------
        int or float
            The Damerau-Levenshtein distance between src & tar. When both
            strings are non-empty and differ, this is a Python int if every
            cost is a Python int and a Python float otherwise.

        Raises
        ------
        ValueError
            Unsupported cost assignment; the cost of two transpositions must
            not be less than the cost of an insert plus a delete. (Only
            checked when both strings are non-empty and differ.)

        Examples
        --------
        >>> cmp = DamerauLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2

        """
        ins_cost, del_cost, sub_cost, trans_cost = cost

        # Trivial cases that need no table.
        if src == tar:
            return 0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        if 2 * trans_cost < ins_cost + del_cost:
            raise ValueError(
                'Unsupported cost assignment; the cost of two transpositions '
                + 'must not be less than the cost of an insert plus a delete.'
            )

        src_len, tar_len = len(src), len(tar)

        # An integer table silently truncates fractional costs on assignment,
        # so only use one when every cost is a plain int.
        cell_type = int if all(isinstance(c, int) for c in cost) else float

        # d_mat[i, j] is the cost of turning src[:i + 1] into tar[:j + 1].
        d_mat = np_zeros((src_len, tar_len), dtype=cell_type)

        if src[0] != tar[0]:
            d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

        # Last row index at which each character of src has been seen so far.
        src_index_by_character = {src[0]: 0}

        # First column: prefixes of src against the first character of tar.
        for i in range(1, src_len):
            del_distance = d_mat[i - 1, 0] + del_cost
            ins_distance = (i + 1) * del_cost + ins_cost
            match_distance = i * del_cost + (
                0 if src[i] == tar[0] else sub_cost
            )
            d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

        # First row: the first character of src against prefixes of tar.
        for j in range(1, tar_len):
            del_distance = (j + 1) * ins_cost + del_cost
            ins_distance = d_mat[0, j - 1] + ins_cost
            match_distance = j * ins_cost + (
                0 if src[0] == tar[j] else sub_cost
            )
            d_mat[0, j] = min(del_distance, ins_distance, match_distance)

        for i in range(1, src_len):
            # Largest column seen so far in this row where tar matched src[i].
            max_src_letter_match_index = 0 if src[i] == tar[0] else -1
            for j in range(1, tar_len):
                # Earlier row whose src character equals tar[j], or -1.
                candidate_swap_index = src_index_by_character.get(tar[j], -1)
                # Read before the update below, so it refers to columns < j.
                j_swap = max_src_letter_match_index

                del_distance = d_mat[i - 1, j] + del_cost
                ins_distance = d_mat[i, j - 1] + ins_cost
                match_distance = d_mat[i - 1, j - 1]
                if src[i] != tar[j]:
                    match_distance += sub_cost
                else:
                    max_src_letter_match_index = j

                if candidate_swap_index != -1 and j_swap != -1:
                    i_swap = candidate_swap_index

                    if i_swap == 0 and j_swap == 0:
                        pre_swap_cost = 0
                    else:
                        pre_swap_cost = d_mat[
                            max(0, i_swap - 1), max(0, j_swap - 1)
                        ]
                    # Cost before the swapped pair, plus removing/adding the
                    # characters between the pair, plus the swap itself.
                    swap_distance = (
                        pre_swap_cost
                        + (i - i_swap - 1) * del_cost
                        + (j - j_swap - 1) * ins_cost
                        + trans_cost
                    )
                else:
                    # No transposition available; never the minimum.
                    swap_distance = maxsize

                d_mat[i, j] = min(
                    del_distance, ins_distance, match_distance, swap_distance
                )
            src_index_by_character[src[i]] = i

        # .item() yields a native Python number instead of a NumPy scalar,
        # matching the type returned by the early exits above.
        return d_mat[src_len - 1, tar_len - 1].item()

    def dist(self, src, tar, cost=(1, 1, 1, 1)):
        """Return the normalized Damerau-Levenshtein distance of two strings.

        Damerau-Levenshtein distance normalized to the interval [0, 1].

        The Damerau-Levenshtein distance is normalized by dividing the
        Damerau-Levenshtein distance by the greater of
        the number of characters in src times the cost of a delete and
        the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 1))

        Returns
        -------
        float
            The normalized Damerau-Levenshtein distance; 0.0 when the strings
            are equal or when the normalizing term is zero.

        Examples
        --------
        >>> cmp = DamerauLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.6
        >>> cmp.dist('aluminum', 'Catalan')
        0.875
        >>> cmp.dist('ATCG', 'TAGC')
        0.5

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = cost[:2]
        # Computed first so that any cost validation in dist_abs still runs.
        distance = self.dist_abs(src, tar, cost)
        normalizer = max(len(src) * del_cost, len(tar) * ins_cost)
        if not normalizer:
            # Deleting all of src and inserting all of tar is free here, so
            # the strings are at distance zero; also avoids dividing by zero.
            return 0.0
        return distance / normalizer


def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
    """Compute the Damerau-Levenshtein distance between two strings.

    Convenience function: builds a :py:class:`DamerauLevenshtein` instance
    and delegates to :py:meth:`DamerauLevenshtein.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    cost : tuple
        A 4-tuple giving the cost of inserts, deletes, substitutions, and
        transpositions, in that order (by default: (1, 1, 1, 1))

    Returns
    -------
    int (may return a float if cost has float values)
        The Damerau-Levenshtein distance between src & tar

    Examples
    --------
    >>> damerau_levenshtein('cat', 'hat')
    1
    >>> damerau_levenshtein('Niall', 'Neil')
    3
    >>> damerau_levenshtein('aluminum', 'Catalan')
    7
    >>> damerau_levenshtein('ATCG', 'TAGC')
    2

    """
    measure = DamerauLevenshtein()
    return measure.dist_abs(src, tar, cost)


def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Compute the normalized Damerau-Levenshtein distance of two strings.

    Convenience function: builds a :py:class:`DamerauLevenshtein` instance
    and delegates to :py:meth:`DamerauLevenshtein.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    cost : tuple
        A 4-tuple giving the cost of inserts, deletes, substitutions, and
        transpositions, in that order (by default: (1, 1, 1, 1))

    Returns
    -------
    float
        The normalized Damerau-Levenshtein distance

    Examples
    --------
    >>> round(dist_damerau('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_damerau('Niall', 'Neil'), 12)
    0.6
    >>> dist_damerau('aluminum', 'Catalan')
    0.875
    >>> dist_damerau('ATCG', 'TAGC')
    0.5

    """
    measure = DamerauLevenshtein()
    return measure.dist(src, tar, cost)


def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
    """Compute the Damerau-Levenshtein similarity of two strings.

    Convenience function: builds a :py:class:`DamerauLevenshtein` instance
    and delegates to its ``sim`` method. That method is not defined in this
    module; it presumably comes from the ``_Distance`` base class.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    cost : tuple
        A 4-tuple giving the cost of inserts, deletes, substitutions, and
        transpositions, in that order (by default: (1, 1, 1, 1))

    Returns
    -------
    float
        The normalized Damerau-Levenshtein similarity

    Examples
    --------
    >>> round(sim_damerau('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_damerau('Niall', 'Neil'), 12)
    0.4
    >>> sim_damerau('aluminum', 'Catalan')
    0.125
    >>> sim_damerau('ATCG', 'TAGC')
    0.5

    """
    measure = DamerauLevenshtein()
    return measure.sim(src, tar, cost)


if __name__ == '__main__':
    # Executed as a script: run the docstring examples as a self-test.
    from doctest import testmod

    testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._damerau_levenshtein.
20
21		Damerau-Levenshtein distance
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from sys import maxsize
32
33	1	from numpy import int as np_int
34	1	from numpy import zeros as np_zeros
35
36	1	from six.moves import range
37
38	1	from ._distance import _Distance
39
40	1	__all__ = [
41		'DamerauLevenshtein',
42		'damerau_levenshtein',
43		'dist_damerau',
44		'sim_damerau',
45		]
46
47
48	1	class DamerauLevenshtein(_Distance):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
49		"""Damerau-Levenshtein distance.
50
51		This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.
52		Damerau-Levenshtein code is based on Java code by Kevin L. Stern
53		:cite:`Stern:2014`, under the MIT license:
54		https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
55		"""
56
57	1	def dist_abs(self, src, tar, cost=(1, 1, 1, 1)):
		0 ignored issues – show Comprehensibility introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (22/15). Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'dist_abs' method Loading history...
58		"""Return the Damerau-Levenshtein distance between two strings.
59
60		Parameters
61		----------
62		src : str
63		Source string for comparison
64		tar : str
65		Target string for comparison
66		cost : tuple
67		A 4-tuple representing the cost of the four possible edits:
68		inserts, deletes, substitutions, and transpositions, respectively
69		(by default: (1, 1, 1, 1))
70
71		Returns
72		-------
73		int (may return a float if cost has float values)
74		The Damerau-Levenshtein distance between src & tar
75
76		Raises
77		------
78		ValueError
79		Unsupported cost assignment; the cost of two transpositions must
80		not be less than the cost of an insert plus a delete.
81
82		Examples
83		--------
84		>>> cmp = DamerauLevenshtein()
85		>>> cmp.dist_abs('cat', 'hat')
86		1
87		>>> cmp.dist_abs('Niall', 'Neil')
88		3
89		>>> cmp.dist_abs('aluminum', 'Catalan')
90		7
91		>>> cmp.dist_abs('ATCG', 'TAGC')
92		2
93
94		"""
95	1	ins_cost, del_cost, sub_cost, trans_cost = cost
96
97	1	if src == tar:
98	1	return 0
99	1	if not src:
100	1	return len(tar) * ins_cost
101	1	if not tar:
102	1	return len(src) * del_cost
103
104	1	if 2 * trans_cost < ins_cost + del_cost:
105	1	raise ValueError(
106		'Unsupported cost assignment; the cost of two transpositions '
107		+ 'must not be less than the cost of an insert plus a delete.'
108		)
109
110	1	d_mat = np_zeros((len(src)) * (len(tar)), dtype=np_int).reshape(
111		(len(src), len(tar))
112		)
113
114	1	if src[0] != tar[0]:
115	1	d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)
116
117	1	src_index_by_character = {src[0]: 0}
118	1	for i in range(1, len(src)):
119	1	del_distance = d_mat[i - 1, 0] + del_cost
120	1	ins_distance = (i + 1) * del_cost + ins_cost
121	1	match_distance = i * del_cost + (
122		0 if src[i] == tar[0] else sub_cost
123		)
124	1	d_mat[i, 0] = min(del_distance, ins_distance, match_distance)
125
126	1	for j in range(1, len(tar)):
127	1	del_distance = (j + 1) * ins_cost + del_cost
128	1	ins_distance = d_mat[0, j - 1] + ins_cost
129	1	match_distance = j * ins_cost + (
130		0 if src[0] == tar[j] else sub_cost
131		)
132	1	d_mat[0, j] = min(del_distance, ins_distance, match_distance)
133
134	1	for i in range(1, len(src)):
135	1	max_src_letter_match_index = 0 if src[i] == tar[0] else -1
136	1	for j in range(1, len(tar)):
137	1	candidate_swap_index = (
138		-1
139		if tar[j] not in src_index_by_character
140		else src_index_by_character[tar[j]]
141		)
142	1	j_swap = max_src_letter_match_index
143	1	del_distance = d_mat[i - 1, j] + del_cost
144	1	ins_distance = d_mat[i, j - 1] + ins_cost
145	1	match_distance = d_mat[i - 1, j - 1]
146	1	if src[i] != tar[j]:
147	1	match_distance += sub_cost
148		else:
149	1	max_src_letter_match_index = j
150
151	1	if candidate_swap_index != -1 and j_swap != -1:
152	1	i_swap = candidate_swap_index
153
154	1	if i_swap == 0 and j_swap == 0:
155	1	pre_swap_cost = 0
156		else:
157	1	pre_swap_cost = d_mat[
158		max(0, i_swap - 1), max(0, j_swap - 1)
159		]
160	1	swap_distance = (
161		pre_swap_cost
162		+ (i - i_swap - 1) * del_cost
163		+ (j - j_swap - 1) * ins_cost
164		+ trans_cost
165		)
166		else:
167	1	swap_distance = maxsize
168
169	1	d_mat[i, j] = min(
170		del_distance, ins_distance, match_distance, swap_distance
171		)
172	1	src_index_by_character[src[i]] = i
173
174	1	return d_mat[len(src) - 1, len(tar) - 1]
175
176	1	def dist(self, src, tar, cost=(1, 1, 1, 1)):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'dist' method Loading history...
177		"""Return the Damerau-Levenshtein similarity of two strings.
178
179		Damerau-Levenshtein distance normalized to the interval [0, 1].
180
181		The Damerau-Levenshtein distance is normalized by dividing the
182		Damerau-Levenshtein distance by the greater of
183		the number of characters in src times the cost of a delete and
184		the number of characters in tar times the cost of an insert.
185		For the case in which all operations have :math:`cost = 1`, this is
186		equivalent to the greater of the length of the two strings src & tar.
187
188		Parameters
189		----------
190		src : str
191		Source string for comparison
192		tar : str
193		Target string for comparison
194		cost : tuple
195		A 4-tuple representing the cost of the four possible edits:
196		inserts, deletes, substitutions, and transpositions, respectively
197		(by default: (1, 1, 1, 1))
198
199		Returns
200		-------
201		float
202		The normalized Damerau-Levenshtein distance
203
204		Examples
205		--------
206		>>> cmp = DamerauLevenshtein()
207		>>> round(cmp.dist('cat', 'hat'), 12)
208		0.333333333333
209		>>> round(cmp.dist('Niall', 'Neil'), 12)
210		0.6
211		>>> cmp.dist('aluminum', 'Catalan')
212		0.875
213		>>> cmp.dist('ATCG', 'TAGC')
214		0.5
215
216		"""
217	1	if src == tar:
218	1	return 0.0
219	1	ins_cost, del_cost = cost[:2]
220	1	return self.dist_abs(src, tar, cost) / (
221		max(len(src) * del_cost, len(tar) * ins_cost)
222		)
223
224
225	1	def damerau_levenshtein(src, tar, cost=(1, 1, 1, 1)):
226		"""Return the Damerau-Levenshtein distance between two strings.
227
228		This is a wrapper of :py:meth:`DamerauLevenshtein.dist_abs`.
229
230		Parameters
231		----------
232		src : str
233		Source string for comparison
234		tar : str
235		Target string for comparison
236		cost : tuple
237		A 4-tuple representing the cost of the four possible edits: inserts,
238		deletes, substitutions, and transpositions, respectively (by default:
239		(1, 1, 1, 1))
240
241		Returns
242		-------
243		int (may return a float if cost has float values)
244		The Damerau-Levenshtein distance between src & tar
245
246		Examples
247		--------
248		>>> damerau_levenshtein('cat', 'hat')
249		1
250		>>> damerau_levenshtein('Niall', 'Neil')
251		3
252		>>> damerau_levenshtein('aluminum', 'Catalan')
253		7
254		>>> damerau_levenshtein('ATCG', 'TAGC')
255		2
256
257		"""
258	1	return DamerauLevenshtein().dist_abs(src, tar, cost)
259
260
261	1	def dist_damerau(src, tar, cost=(1, 1, 1, 1)):
262		"""Return the Damerau-Levenshtein similarity of two strings.
263
264		This is a wrapper of :py:meth:`DamerauLevenshtein.dist`.
265
266		Parameters
267		----------
268		src : str
269		Source string for comparison
270		tar : str
271		Target string for comparison
272		cost : tuple
273		A 4-tuple representing the cost of the four possible edits: inserts,
274		deletes, substitutions, and transpositions, respectively (by default:
275		(1, 1, 1, 1))
276
277		Returns
278		-------
279		float
280		The normalized Damerau-Levenshtein distance
281
282		Examples
283		--------
284		>>> round(dist_damerau('cat', 'hat'), 12)
285		0.333333333333
286		>>> round(dist_damerau('Niall', 'Neil'), 12)
287		0.6
288		>>> dist_damerau('aluminum', 'Catalan')
289		0.875
290		>>> dist_damerau('ATCG', 'TAGC')
291		0.5
292
293		"""
294	1	return DamerauLevenshtein().dist(src, tar, cost)
295
296
297	1	def sim_damerau(src, tar, cost=(1, 1, 1, 1)):
298		"""Return the Damerau-Levenshtein similarity of two strings.
299
300		This is a wrapper of :py:meth:`DamerauLevenshtein.sim`.
301
302		Parameters
303		----------
304		src : str
305		Source string for comparison
306		tar : str
307		Target string for comparison
308		cost : tuple
309		A 4-tuple representing the cost of the four possible edits: inserts,
310		deletes, substitutions, and transpositions, respectively (by default:
311		(1, 1, 1, 1))
312
313		Returns
314		-------
315		float
316		The normalized Damerau-Levenshtein similarity
317
318		Examples
319		--------
320		>>> round(sim_damerau('cat', 'hat'), 12)
321		0.666666666667
322		>>> round(sim_damerau('Niall', 'Neil'), 12)
323		0.4
324		>>> sim_damerau('aluminum', 'Catalan')
325		0.125
326		>>> sim_damerau('ATCG', 'TAGC')
327		0.5
328
329		"""
330	1	return DamerauLevenshtein().sim(src, tar, cost)
331
332
333		if __name__ == '__main__':
334		import doctest
335
336		doctest.testmod()
337

chrislit / abydos

Push — master ( f43547...71985b )

abydos.distance._damerau_levenshtein.sim_damerau() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like