abydos.distance._levenshtein.levenshtein() - Code Metrics - Inspection of "0.4.1" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#225)

by Chris

created 2019-07-12 00:08 UTC

abydos.distance._levenshtein.levenshtein() A

↳ Parent: abydos.distance._levenshtein

Complexity

Conditions

Size

Total Lines	56
Code Lines	7

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
eloc	7
dl	0
loc	56
ccs	3
cts	3
cp	1
rs	10
c	0
b	0
f	0
cc	1
nop	4
crap	1

How to fix Long Method

# -*- coding: utf-8 -*-

# Copyright 2014-2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._levenshtein.

The distance._Levenshtein module implements string edit distance functions
based on Levenshtein distance, including:

    - Levenshtein distance
    - Optimal String Alignment distance
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from sys import float_info

from deprecation import deprecated

from numpy import float as np_float
from numpy import zeros as np_zeros

from six.moves import range

from ._distance import _Distance
from .. import __version__

__all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein']


class Levenshtein(_Distance):
    """Levenshtein distance.

    This is the standard edit distance measure. Cf.
    :cite:`Levenshtein:1965,Levenshtein:1966`.

    Optimal string alignment (aka restricted
    Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.

    The ordinary Levenshtein & Optimal String Alignment distance both
    employ the Wagner-Fischer dynamic programming algorithm
    :cite:`Wagner:1974`.

    Levenshtein edit distance ordinarily has unit insertion, deletion, and
    substitution costs.

    .. versionadded:: 0.3.6
    .. versionchanged:: 0.4.0
        Added taper option
    """

    def __init__(
        self,
        mode='lev',
        cost=(1, 1, 1, 1),
        normalizer=max,
        taper=False,
        **kwargs
    ):
        """Initialize Levenshtein instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the Levenshtein distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 1))
        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        taper : bool
            Enables cost tapering. Following :cite:`Zobel:1996`, it causes
            edits at the start of the string to "just [exceed] twice the
            minimum penalty for replacement or deletion at the end of the
            string".
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Levenshtein, self).__init__(**kwargs)
        self._mode = mode
        self._cost = cost
        self._normalizer = normalizer
        self._taper_enabled = taper

    def _taper(self, pos, length):
        return (
            round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
            if self._taper_enabled
            else 1
        )

    def _alignment_matrix(self, src, tar):
        """Return the Levenshtein alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        numpy.ndarray
            The alignment matrix


        .. versionadded:: 0.4.1

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        for i in range(src_len + 1):
            d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
        for j in range(tar_len + 1):
            d_mat[0, j] = j * self._taper(j, max_len) * ins_cost

        for i in range(src_len):
            for j in range(tar_len):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j]
                    + ins_cost * self._taper(1 + max(i, j), max_len),  # ins
                    d_mat[i, j + 1]
                    + del_cost * self._taper(1 + max(i, j), max_len),  # del
                    d_mat[i, j]
                    + (
                        sub_cost * self._taper(1 + max(i, j), max_len)
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )

                if self._mode == 'osa':
                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1],
                            d_mat[i - 1, j - 1]
                            + trans_cost * self._taper(1 + max(i, j), max_len),
                        )

        return d_mat

    def alignment(self, src, tar):

        """Return the Levenshtein alignment of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple
            A tuple containing the Levenshtein distance and the two strings,
            aligned.

        Examples
        --------
        >>> cmp = Levenshtein()
        >>> cmp.alignment('cat', 'hat')
        (1.0, 'cat', 'hat')
        >>> cmp.alignment('Niall', 'Neil')
        (3.0, 'Niall', 'Neil-')
        >>> cmp.alignment('aluminum', 'Catalan')
        (7.0, '-aluminum', 'Catalan--')
        >>> cmp.alignment('ATCG', 'TAGC')
        (3.0, 'ATCG-', '-TAGC')

        >>> cmp = Levenshtein(mode='osa')
        >>> cmp.alignment('ATCG', 'TAGC')
        (2.0, 'ATCG', 'TAGC')
        >>> cmp.alignment('ACTG', 'TAGC')
        (4.0, 'ACTG', 'TAGC')


        .. versionadded:: 0.4.1

        """
        d_mat = self._alignment_matrix(src, tar)

        src_aligned = []
        tar_aligned = []

        src_pos = len(src)
        tar_pos = len(tar)

        distance = d_mat[src_pos, tar_pos]

        while src_pos and tar_pos:
            up = d_mat[src_pos, tar_pos - 1]
            left = d_mat[src_pos - 1, tar_pos]
            diag = d_mat[src_pos - 1, tar_pos - 1]

            if diag <= min(up, left):
                src_pos -= 1
                tar_pos -= 1
                src_aligned.append(src[src_pos])
                tar_aligned.append(tar[tar_pos])
            elif up <= left:
                tar_pos -= 1
                src_aligned.append('-')
                tar_aligned.append(tar[tar_pos])
            else:
                src_pos -= 1
                src_aligned.append(src[src_pos])
                tar_aligned.append('-')
        while tar_pos:
            tar_pos -= 1
            tar_aligned.append(tar[tar_pos])
            src_aligned.append('-')
        while src_pos:
            src_pos -= 1
            src_aligned.append(src[src_pos])
            tar_aligned.append('-')

        return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])

    def dist_abs(self, src, tar):
        """Return the Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = Levenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3

        >>> cmp = Levenshtein(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2
        >>> cmp.dist_abs('ACTG', 'TAGC')
        4


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        if src == tar:
            return 0
        if not src:
            return sum(
                ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
            )
        if not tar:
            return sum(
                del_cost * self._taper(pos, max_len) for pos in range(src_len)
            )

        d_mat = self._alignment_matrix(src, tar)

        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return d_mat[src_len, tar_len]

    def dist(self, src, tar):
        """Return the normalized Levenshtein distance between two strings.

        The Levenshtein distance is normalized by dividing the Levenshtein
        distance (calculated by any of the three supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = Levenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.6
        >>> cmp.dist('aluminum', 'Catalan')
        0.875
        >>> cmp.dist('ATCG', 'TAGC')
        0.75


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]

        src_len = len(src)
        tar_len = len(tar)

        if self._taper_enabled:
            normalize_term = self._normalizer(
                [
                    sum(
                        self._taper(pos, src_len) * del_cost
                        for pos in range(src_len)
                    ),
                    sum(
                        self._taper(pos, tar_len) * ins_cost
                        for pos in range(tar_len)
                    ),
                ]
            )
        else:
            normalize_term = self._normalizer(
                [src_len * del_cost, tar_len * ins_cost]
            )

        return self.dist_abs(src, tar) / normalize_term


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Levenshtein.dist_abs method instead.',
)
def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein distance between two strings.

    This is a wrapper of :py:meth:`Levenshtein.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    mode : str
        Specifies a mode for computing the Levenshtein distance:

            - ``lev`` (default) computes the ordinary Levenshtein distance, in
              which edits may include inserts, deletes, and substitutions
            - ``osa`` computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once

    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and transpositions, respectively (by default:
        (1, 1, 1, 1))

    Returns
    -------
    int (may return a float if cost has float values)
        The Levenshtein distance between src & tar

    Examples
    --------
    >>> levenshtein('cat', 'hat')
    1
    >>> levenshtein('Niall', 'Neil')
    3
    >>> levenshtein('aluminum', 'Catalan')
    7
    >>> levenshtein('ATCG', 'TAGC')
    3

    >>> levenshtein('ATCG', 'TAGC', mode='osa')
    2
    >>> levenshtein('ACTG', 'TAGC', mode='osa')
    4

    .. versionadded:: 0.1.0

    """
    return Levenshtein(mode=mode, cost=cost).dist_abs(src, tar)


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Levenshtein.dist method instead.',
)
def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the normalized Levenshtein distance between two strings.

    This is a wrapper of :py:meth:`Levenshtein.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    mode : str
        Specifies a mode for computing the Levenshtein distance:

            - ``lev`` (default) computes the ordinary Levenshtein distance, in
              which edits may include inserts, deletes, and substitutions
            - ``osa`` computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once

    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and transpositions, respectively (by default:
        (1, 1, 1, 1))

    Returns
    -------
    float
        The Levenshtein distance between src & tar

    Examples
    --------
    >>> round(dist_levenshtein('cat', 'hat'), 12)
    0.333333333333
    >>> round(dist_levenshtein('Niall', 'Neil'), 12)
    0.6
    >>> dist_levenshtein('aluminum', 'Catalan')
    0.875
    >>> dist_levenshtein('ATCG', 'TAGC')
    0.75

    .. versionadded:: 0.1.0

    """
    return Levenshtein(mode=mode, cost=cost).dist(src, tar)


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the Levenshtein.sim method instead.',
)
def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
    """Return the Levenshtein similarity of two strings.

    This is a wrapper of :py:meth:`Levenshtein.sim`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    mode : str
        Specifies a mode for computing the Levenshtein distance:

            - ``lev`` (default) computes the ordinary Levenshtein distance, in
              which edits may include inserts, deletes, and substitutions
            - ``osa`` computes the Optimal String Alignment distance, in which
              edits may include inserts, deletes, substitutions, and
              transpositions but substrings may only be edited once

    cost : tuple
        A 4-tuple representing the cost of the four possible edits: inserts,
        deletes, substitutions, and transpositions, respectively (by default:
        (1, 1, 1, 1))

    Returns
    -------
    float
        The Levenshtein similarity between src & tar

    Examples
    --------
    >>> round(sim_levenshtein('cat', 'hat'), 12)
    0.666666666667
    >>> round(sim_levenshtein('Niall', 'Neil'), 12)
    0.4
    >>> sim_levenshtein('aluminum', 'Catalan')
    0.125
    >>> sim_levenshtein('ATCG', 'TAGC')
    0.25

    .. versionadded:: 0.1.0

    """
    return Levenshtein(mode=mode, cost=cost).sim(src, tar)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1			# -- coding: utf-8 --
2
3			# Copyright 2014-2019 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.distance._levenshtein.
20
21			The distance._Levenshtein module implements string edit distance functions
22			based on Levenshtein distance, including:
23
24			- Levenshtein distance
25			- Optimal String Alignment distance
26			"""
27
28	1		from __future__ import (
29			absolute_import,
30			division,
31			print_function,
32			unicode_literals,
33			)
34
35	1		from sys import float_info
36
37	1		from deprecation import deprecated
38
39	1		from numpy import float as np_float
40	1		from numpy import zeros as np_zeros
41
42	1		from six.moves import range
43
44	1		from ._distance import _Distance
45	1		from .. import __version__
46
47	1		__all__ = ['Levenshtein', 'dist_levenshtein', 'levenshtein', 'sim_levenshtein']
48
49
50	1		class Levenshtein(_Distance):
51			"""Levenshtein distance.
52
53			This is the standard edit distance measure. Cf.
54			:cite:`Levenshtein:1965,Levenshtein:1966`.
55
56			Optimal string alignment (aka restricted
57			Damerau-Levenshtein distance) :cite:`Boytsov:2011` is also supported.
58
59			The ordinary Levenshtein & Optimal String Alignment distance both
60			employ the Wagner-Fischer dynamic programming algorithm
61			:cite:`Wagner:1974`.
62
63			Levenshtein edit distance ordinarily has unit insertion, deletion, and
64			substitution costs.
65
66			.. versionadded:: 0.3.6
67			.. versionchanged:: 0.4.0
68			Added taper option
69			"""
70
71	1		def __init__(
72			self,
73			mode='lev',
74			cost=(1, 1, 1, 1),
75			normalizer=max,
76			taper=False,
77			**kwargs
78			):
79			"""Initialize Levenshtein instance.
80
81			Parameters
82			----------
83			mode : str
84			Specifies a mode for computing the Levenshtein distance:
85
86			- ``lev`` (default) computes the ordinary Levenshtein distance,
87			in which edits may include inserts, deletes, and
88			substitutions
89			- ``osa`` computes the Optimal String Alignment distance, in
90			which edits may include inserts, deletes, substitutions, and
91			transpositions but substrings may only be edited once
92
93			cost : tuple
94			A 4-tuple representing the cost of the four possible edits:
95			inserts, deletes, substitutions, and transpositions, respectively
96			(by default: (1, 1, 1, 1))
97			normalizer : function
98			A function that takes an list and computes a normalization term
99			by which the edit distance is divided (max by default). Another
100			good option is the sum function.
101			taper : bool
102			Enables cost tapering. Following :cite:`Zobel:1996`, it causes
103			edits at the start of the string to "just [exceed] twice the
104			minimum penalty for replacement or deletion at the end of the
105			string".
106			**kwargs
107			Arbitrary keyword arguments
108
109
110			.. versionadded:: 0.4.0
111
112			"""
113	1		super(Levenshtein, self).__init__(**kwargs)
114	1		self._mode = mode
115	1		self._cost = cost
116	1		self._normalizer = normalizer
117	1		self._taper_enabled = taper
118
119	1		def _taper(self, pos, length):
120	1		return (
121			round(1 + ((length - pos) / length) * (1 + float_info.epsilon), 15)
122			if self._taper_enabled
123			else 1
124			)
125
126	1		def _alignment_matrix(self, src, tar):
127			"""Return the Levenshtein alignment matrix.
128
129			Parameters
130			----------
131			src : str
132			Source string for comparison
133			tar : str
134			Target string for comparison
135
136			Returns
137			-------
138			numpy.ndarray
139			The alignment matrix
140
141
142			.. versionadded:: 0.4.1
143
144			"""
145	1		ins_cost, del_cost, sub_cost, trans_cost = self._cost
146
147	1		src_len = len(src)
148	1		tar_len = len(tar)
149	1		max_len = max(src_len, tar_len)
150
151	1		d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
152	1		for i in range(src_len + 1):
153	1		d_mat[i, 0] = i * self._taper(i, max_len) * del_cost
154	1		for j in range(tar_len + 1):
155	1		d_mat[0, j] = j * self._taper(j, max_len) * ins_cost
156
157	1		for i in range(src_len):
158	1		for j in range(tar_len):
159	1		d_mat[i + 1, j + 1] = min(
160			d_mat[i + 1, j]
161			+ ins_cost * self._taper(1 + max(i, j), max_len), # ins
162			d_mat[i, j + 1]
163			+ del_cost * self._taper(1 + max(i, j), max_len), # del
164			d_mat[i, j]
165			+ (
166			sub_cost * self._taper(1 + max(i, j), max_len)
167			if src[i] != tar[j]
168			else 0
169			), # sub/==
170			)
171
172	1		if self._mode == 'osa':
173	1		if (
174			i + 1 > 1
175			and j + 1 > 1
176			and src[i] == tar[j - 1]
177			and src[i - 1] == tar[j]
178			):
179			# transposition
180	1		d_mat[i + 1, j + 1] = min(
181			d_mat[i + 1, j + 1],
182			d_mat[i - 1, j - 1]
183			+ trans_cost * self._taper(1 + max(i, j), max_len),
184			)
185
186	1		return d_mat
187
188	1	View Code Duplication	def alignment(self, src, tar):
			0 ignored issues – show Duplication introduced 2019-07-12 00:17 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
189			"""Return the Levenshtein alignment of two strings.
190
191			Parameters
192			----------
193			src : str
194			Source string for comparison
195			tar : str
196			Target string for comparison
197
198			Returns
199			-------
200			tuple
201			A tuple containing the Levenshtein distance and the two strings,
202			aligned.
203
204			Examples
205			--------
206			>>> cmp = Levenshtein()
207			>>> cmp.alignment('cat', 'hat')
208			(1.0, 'cat', 'hat')
209			>>> cmp.alignment('Niall', 'Neil')
210			(3.0, 'Niall', 'Neil-')
211			>>> cmp.alignment('aluminum', 'Catalan')
212			(7.0, '-aluminum', 'Catalan--')
213			>>> cmp.alignment('ATCG', 'TAGC')
214			(3.0, 'ATCG-', '-TAGC')
215
216			>>> cmp = Levenshtein(mode='osa')
217			>>> cmp.alignment('ATCG', 'TAGC')
218			(2.0, 'ATCG', 'TAGC')
219			>>> cmp.alignment('ACTG', 'TAGC')
220			(4.0, 'ACTG', 'TAGC')
221
222
223			.. versionadded:: 0.4.1
224
225			"""
226	1		d_mat = self._alignment_matrix(src, tar)
227
228	1		src_aligned = []
229	1		tar_aligned = []
230
231	1		src_pos = len(src)
232	1		tar_pos = len(tar)
233
234	1		distance = d_mat[src_pos, tar_pos]
235
236	1		while src_pos and tar_pos:
237	1		up = d_mat[src_pos, tar_pos - 1]
238	1		left = d_mat[src_pos - 1, tar_pos]
239	1		diag = d_mat[src_pos - 1, tar_pos - 1]
240
241	1		if diag <= min(up, left):
242	1		src_pos -= 1
243	1		tar_pos -= 1
244	1		src_aligned.append(src[src_pos])
245	1		tar_aligned.append(tar[tar_pos])
246	1		elif up <= left:
247	1		tar_pos -= 1
248	1		src_aligned.append('-')
249	1		tar_aligned.append(tar[tar_pos])
250			else:
251	1		src_pos -= 1
252	1		src_aligned.append(src[src_pos])
253	1		tar_aligned.append('-')
254	1		while tar_pos:
255	1		tar_pos -= 1
256	1		tar_aligned.append(tar[tar_pos])
257	1		src_aligned.append('-')
258	1		while src_pos:
259	1		src_pos -= 1
260	1		src_aligned.append(src[src_pos])
261	1		tar_aligned.append('-')
262
263	1		return distance, ''.join(src_aligned[::-1]), ''.join(tar_aligned[::-1])
264
265	1		def dist_abs(self, src, tar):
266			"""Return the Levenshtein distance between two strings.
267
268			Parameters
269			----------
270			src : str
271			Source string for comparison
272			tar : str
273			Target string for comparison
274
275			Returns
276			-------
277			int (may return a float if cost has float values)
278			The Levenshtein distance between src & tar
279
280			Examples
281			--------
282			>>> cmp = Levenshtein()
283			>>> cmp.dist_abs('cat', 'hat')
284			1
285			>>> cmp.dist_abs('Niall', 'Neil')
286			3
287			>>> cmp.dist_abs('aluminum', 'Catalan')
288			7
289			>>> cmp.dist_abs('ATCG', 'TAGC')
290			3
291
292			>>> cmp = Levenshtein(mode='osa')
293			>>> cmp.dist_abs('ATCG', 'TAGC')
294			2
295			>>> cmp.dist_abs('ACTG', 'TAGC')
296			4
297
298
299			.. versionadded:: 0.1.0
300			.. versionchanged:: 0.3.6
301			Encapsulated in class
302
303			"""
304	1		ins_cost, del_cost, sub_cost, trans_cost = self._cost
305
306	1		src_len = len(src)
307	1		tar_len = len(tar)
308	1		max_len = max(src_len, tar_len)
309
310	1		if src == tar:
311	1		return 0
312	1		if not src:
313	1		return sum(
314			ins_cost * self._taper(pos, max_len) for pos in range(tar_len)
315			)
316	1		if not tar:
317	1		return sum(
318			del_cost * self._taper(pos, max_len) for pos in range(src_len)
319			)
320
321	1		d_mat = self._alignment_matrix(src, tar)
322
323	1		if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
324	1		return int(d_mat[src_len, tar_len])
325			else:
326	1		return d_mat[src_len, tar_len]
327
328	1		def dist(self, src, tar):
329			"""Return the normalized Levenshtein distance between two strings.
330
331			The Levenshtein distance is normalized by dividing the Levenshtein
332			distance (calculated by any of the three supported methods) by the
333			greater of the number of characters in src times the cost of a delete
334			and the number of characters in tar times the cost of an insert.
335			For the case in which all operations have :math:`cost = 1`, this is
336			equivalent to the greater of the length of the two strings src & tar.
337
338			Parameters
339			----------
340			src : str
341			Source string for comparison
342			tar : str
343			Target string for comparison
344
345			Returns
346			-------
347			float
348			The normalized Levenshtein distance between src & tar
349
350			Examples
351			--------
352			>>> cmp = Levenshtein()
353			>>> round(cmp.dist('cat', 'hat'), 12)
354			0.333333333333
355			>>> round(cmp.dist('Niall', 'Neil'), 12)
356			0.6
357			>>> cmp.dist('aluminum', 'Catalan')
358			0.875
359			>>> cmp.dist('ATCG', 'TAGC')
360			0.75
361
362
363			.. versionadded:: 0.1.0
364			.. versionchanged:: 0.3.6
365			Encapsulated in class
366
367			"""
368	1		if src == tar:
369	1		return 0.0
370	1		ins_cost, del_cost = self._cost[:2]
371
372	1		src_len = len(src)
373	1		tar_len = len(tar)
374
375	1		if self._taper_enabled:
376	1		normalize_term = self._normalizer(
377			[
378			sum(
379			self._taper(pos, src_len) * del_cost
380			for pos in range(src_len)
381			),
382			sum(
383			self._taper(pos, tar_len) * ins_cost
384			for pos in range(tar_len)
385			),
386			]
387			)
388			else:
389	1		normalize_term = self._normalizer(
390			[src_len * del_cost, tar_len * ins_cost]
391			)
392
393	1		return self.dist_abs(src, tar) / normalize_term
394
395
396	1		@deprecated(
397			deprecated_in='0.4.0',
398			removed_in='0.6.0',
399			current_version=__version__,
400			details='Use the Levenshtein.dist_abs method instead.',
401			)
402	1		def levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
403			"""Return the Levenshtein distance between two strings.
404
405			This is a wrapper of :py:meth:`Levenshtein.dist_abs`.
406
407			Parameters
408			----------
409			src : str
410			Source string for comparison
411			tar : str
412			Target string for comparison
413			mode : str
414			Specifies a mode for computing the Levenshtein distance:
415
416			- ``lev`` (default) computes the ordinary Levenshtein distance, in
417			which edits may include inserts, deletes, and substitutions
418			- ``osa`` computes the Optimal String Alignment distance, in which
419			edits may include inserts, deletes, substitutions, and
420			transpositions but substrings may only be edited once
421
422			cost : tuple
423			A 4-tuple representing the cost of the four possible edits: inserts,
424			deletes, substitutions, and transpositions, respectively (by default:
425			(1, 1, 1, 1))
426
427			Returns
428			-------
429			int (may return a float if cost has float values)
430			The Levenshtein distance between src & tar
431
432			Examples
433			--------
434			>>> levenshtein('cat', 'hat')
435			1
436			>>> levenshtein('Niall', 'Neil')
437			3
438			>>> levenshtein('aluminum', 'Catalan')
439			7
440			>>> levenshtein('ATCG', 'TAGC')
441			3
442
443			>>> levenshtein('ATCG', 'TAGC', mode='osa')
444			2
445			>>> levenshtein('ACTG', 'TAGC', mode='osa')
446			4
447
448			.. versionadded:: 0.1.0
449
450			"""
451	1		return Levenshtein(mode=mode, cost=cost).dist_abs(src, tar)
452
453
454	1		@deprecated(
455			deprecated_in='0.4.0',
456			removed_in='0.6.0',
457			current_version=__version__,
458			details='Use the Levenshtein.dist method instead.',
459			)
460	1		def dist_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
461			"""Return the normalized Levenshtein distance between two strings.
462
463			This is a wrapper of :py:meth:`Levenshtein.dist`.
464
465			Parameters
466			----------
467			src : str
468			Source string for comparison
469			tar : str
470			Target string for comparison
471			mode : str
472			Specifies a mode for computing the Levenshtein distance:
473
474			- ``lev`` (default) computes the ordinary Levenshtein distance, in
475			which edits may include inserts, deletes, and substitutions
476			- ``osa`` computes the Optimal String Alignment distance, in which
477			edits may include inserts, deletes, substitutions, and
478			transpositions but substrings may only be edited once
479
480			cost : tuple
481			A 4-tuple representing the cost of the four possible edits: inserts,
482			deletes, substitutions, and transpositions, respectively (by default:
483			(1, 1, 1, 1))
484
485			Returns
486			-------
487			float
488			The Levenshtein distance between src & tar
489
490			Examples
491			--------
492			>>> round(dist_levenshtein('cat', 'hat'), 12)
493			0.333333333333
494			>>> round(dist_levenshtein('Niall', 'Neil'), 12)
495			0.6
496			>>> dist_levenshtein('aluminum', 'Catalan')
497			0.875
498			>>> dist_levenshtein('ATCG', 'TAGC')
499			0.75
500
501			.. versionadded:: 0.1.0
502
503			"""
504	1		return Levenshtein(mode=mode, cost=cost).dist(src, tar)
505
506
507	1		@deprecated(
508			deprecated_in='0.4.0',
509			removed_in='0.6.0',
510			current_version=__version__,
511			details='Use the Levenshtein.sim method instead.',
512			)
513	1		def sim_levenshtein(src, tar, mode='lev', cost=(1, 1, 1, 1)):
514			"""Return the Levenshtein similarity of two strings.
515
516			This is a wrapper of :py:meth:`Levenshtein.sim`.
517
518			Parameters
519			----------
520			src : str
521			Source string for comparison
522			tar : str
523			Target string for comparison
524			mode : str
525			Specifies a mode for computing the Levenshtein distance:
526
527			- ``lev`` (default) computes the ordinary Levenshtein distance, in
528			which edits may include inserts, deletes, and substitutions
529			- ``osa`` computes the Optimal String Alignment distance, in which
530			edits may include inserts, deletes, substitutions, and
531			transpositions but substrings may only be edited once
532
533			cost : tuple
534			A 4-tuple representing the cost of the four possible edits: inserts,
535			deletes, substitutions, and transpositions, respectively (by default:
536			(1, 1, 1, 1))
537
538			Returns
539			-------
540			float
541			The Levenshtein similarity between src & tar
542
543			Examples
544			--------
545			>>> round(sim_levenshtein('cat', 'hat'), 12)
546			0.666666666667
547			>>> round(sim_levenshtein('Niall', 'Neil'), 12)
548			0.4
549			>>> sim_levenshtein('aluminum', 'Catalan')
550			0.125
551			>>> sim_levenshtein('ATCG', 'TAGC')
552			0.25
553
554			.. versionadded:: 0.1.0
555
556			"""
557	1		return Levenshtein(mode=mode, cost=cost).sim(src, tar)
558
559
560			if __name__ == '__main__':
561			import doctest
562
563			doctest.testmod()
564

chrislit / abydos

Pull Request — master (#225)

abydos.distance._levenshtein.levenshtein() A

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like