abydos.distance._needleman_wunsch.NeedlemanWunsch.dist_abs() - Code Metrics - Inspection of "0.4.1" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#225)

by Chris

created 2019-07-12 00:08 UTC

NeedlemanWunsch.dist_abs() A

↳ Parent: abydos.distance._needleman_wunsch

Complexity

Conditions

Size

Total Lines	48
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	13
CRAP Score	5

Importance

Changes

Metric	Value
eloc	14
dl	0
loc	48
ccs	13
cts	13
cp	1
rs	9.2333
c	0
b	0
f	0
cc	5
nop	3
crap	5

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._needleman_wunsch.

Needleman-Wunsch score
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from deprecation import deprecated

from numpy import float32 as np_float32
from numpy import zeros as np_zeros

from six.moves import range

from ._distance import _Distance
from ._ident import sim_ident
from .. import __version__

__all__ = ['NeedlemanWunsch', 'needleman_wunsch']


class NeedlemanWunsch(_Distance):
    """Needleman-Wunsch score.

    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
    distance measure.


    .. versionadded:: 0.3.6
    """

    @staticmethod
    def sim_matrix(
        src,
        tar,
        mat=None,
        mismatch_cost=0,
        match_cost=1,
        symmetric=True,
        alphabet=None,
    ):
        """Return the matrix similarity of two strings.

        With the default parameters, this is identical to sim_ident.
        It is possible for sim_matrix to return values outside of the range
        :math:`[0, 1]`, if values outside that range are present in mat,
        mismatch_cost, or match_cost.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        mat : dict
            A dict mapping tuples to costs; the tuples are (src, tar) pairs of
            symbols from the alphabet parameter
        mismatch_cost : float
            The value returned if (src, tar) is absent from mat when src does
            not equal tar
        match_cost : float
            The value returned if (src, tar) is absent from mat when src equals
            tar
        symmetric : bool
            True if the cost of src not matching tar is identical to the cost
            of tar not matching src; in this case, the values in mat need only
            contain (src, tar) or (tar, src), not both
        alphabet : str
            A collection of tokens from which src and tar are drawn; if this is
            defined a ValueError is raised if either tar or src is not found in
            alphabet

        Returns
        -------
        float
            Matrix similarity

        Raises
        ------
        ValueError
            src value not in alphabet
        ValueError
            tar value not in alphabet

        Examples
        --------
        >>> NeedlemanWunsch.sim_matrix('cat', 'hat')
        0
        >>> NeedlemanWunsch.sim_matrix('hat', 'hat')
        1


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if alphabet:
            alphabet = tuple(alphabet)
            for i in src:
                if i not in alphabet:
                    raise ValueError('src value not in alphabet')
            for i in tar:
                if i not in alphabet:
                    raise ValueError('tar value not in alphabet')

        if src == tar:
            if mat and (src, src) in mat:
                return mat[(src, src)]
            return match_cost
        if mat and (src, tar) in mat:
            return mat[(src, tar)]
        elif symmetric and mat and (tar, src) in mat:
            return mat[(tar, src)]
        return mismatch_cost

    def __init__(self, gap_cost=1, sim_func=None, **kwargs):
        """Initialize NeedlemanWunsch instance.

        Parameters
        ----------
        gap_cost : float
            The cost of an alignment gap (1 by default)
        sim_func : function
            A function that returns the similarity of two characters (identity
            similarity by default)
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(NeedlemanWunsch, self).__init__(**kwargs)
        self._gap_cost = gap_cost
        self._sim_func = sim_func
        if self._sim_func is None:
            self._sim_func = NeedlemanWunsch.sim_matrix

    def sim_score(self, src, tar):
        """Return the Needleman-Wunsch score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Needleman-Wunsch score

        Examples
        --------
        >>> cmp = NeedlemanWunsch()
        >>> cmp.sim_score('cat', 'hat')
        2.0
        >>> cmp.sim_score('Niall', 'Neil')
        1.0
        >>> cmp.sim_score('aluminum', 'Catalan')
        -1.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)

        for i in range(len(src) + 1):
            d_mat[i, 0] = -(i * self._gap_cost)
        for j in range(len(tar) + 1):
            d_mat[0, j] = -(j * self._gap_cost)
        for i in range(1, len(src) + 1):
            for j in range(1, len(tar) + 1):
                match = d_mat[i - 1, j - 1] + self._sim_func(
                    src[i - 1], tar[j - 1]
                )
                delete = d_mat[i - 1, j] - self._gap_cost
                insert = d_mat[i, j - 1] - self._gap_cost
                d_mat[i, j] = max(match, delete, insert)
        return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]

    def sim(self, src, tar):
        """Return the normalized Needleman-Wunsch score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Needleman-Wunsch score

        Examples
        --------
        >>> cmp = NeedlemanWunsch()
        >>> cmp.sim('cat', 'hat')
        0.6666666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.22360679774997896
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.0
        >>> cmp.sim('cat', 'hat')
        0.6666666666666667


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        return max(0.0, self.sim_score(src, tar)) / (
            self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5
        )


@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the NeedlemanWunsch.dist_abs method instead.',
)
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
    """Return the Needleman-Wunsch score of two strings.

    This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    gap_cost : float
        The cost of an alignment gap (1 by default)
    sim_func : function
        A function that returns the similarity of two characters (identity
        similarity by default)

    Returns
    -------
    float
        Needleman-Wunsch score

    Examples
    --------
    >>> needleman_wunsch('cat', 'hat')
    2.0
    >>> needleman_wunsch('Niall', 'Neil')
    1.0
    >>> needleman_wunsch('aluminum', 'Catalan')
    -1.0
    >>> needleman_wunsch('ATCG', 'TAGC')
    0.0


    .. versionadded:: 0.1.0

    """
    return NeedlemanWunsch(gap_cost, sim_func).sim_score(src, tar)


if __name__ == '__main__':
    import doctest

    doctest.testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.distance._needleman_wunsch.
20
21		Needleman-Wunsch score
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from deprecation import deprecated
32
33	1	from numpy import float32 as np_float32
34	1	from numpy import zeros as np_zeros
35
36	1	from six.moves import range
37
38	1	from ._distance import _Distance
39	1	from ._ident import sim_ident
40	1	from .. import __version__
41
42	1	__all__ = ['NeedlemanWunsch', 'needleman_wunsch']
43
44
45	1	class NeedlemanWunsch(_Distance):
46		"""Needleman-Wunsch score.
47
48		The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
49		distance measure.
50
51
52		.. versionadded:: 0.3.6
53		"""
54
55	1	@staticmethod
56	1	def sim_matrix(
57		src,
58		tar,
59		mat=None,
60		mismatch_cost=0,
61		match_cost=1,
62		symmetric=True,
63		alphabet=None,
64		):
65		"""Return the matrix similarity of two strings.
66
67		With the default parameters, this is identical to sim_ident.
68		It is possible for sim_matrix to return values outside of the range
69		:math:`[0, 1]`, if values outside that range are present in mat,
70		mismatch_cost, or match_cost.
71
72		Parameters
73		----------
74		src : str
75		Source string for comparison
76		tar : str
77		Target string for comparison
78		mat : dict
79		A dict mapping tuples to costs; the tuples are (src, tar) pairs of
80		symbols from the alphabet parameter
81		mismatch_cost : float
82		The value returned if (src, tar) is absent from mat when src does
83		not equal tar
84		match_cost : float
85		The value returned if (src, tar) is absent from mat when src equals
86		tar
87		symmetric : bool
88		True if the cost of src not matching tar is identical to the cost
89		of tar not matching src; in this case, the values in mat need only
90		contain (src, tar) or (tar, src), not both
91		alphabet : str
92		A collection of tokens from which src and tar are drawn; if this is
93		defined a ValueError is raised if either tar or src is not found in
94		alphabet
95
96		Returns
97		-------
98		float
99		Matrix similarity
100
101		Raises
102		------
103		ValueError
104		src value not in alphabet
105		ValueError
106		tar value not in alphabet
107
108		Examples
109		--------
110		>>> NeedlemanWunsch.sim_matrix('cat', 'hat')
111		0
112		>>> NeedlemanWunsch.sim_matrix('hat', 'hat')
113		1
114
115
116		.. versionadded:: 0.1.0
117		.. versionchanged:: 0.3.6
118		Encapsulated in class
119
120		"""
121	1	if alphabet:
122	1	alphabet = tuple(alphabet)
123	1	for i in src:
124	1	if i not in alphabet:
125	1	raise ValueError('src value not in alphabet')
126	1	for i in tar:
127	1	if i not in alphabet:
128	1	raise ValueError('tar value not in alphabet')
129
130	1	if src == tar:
131	1	if mat and (src, src) in mat:
132	1	return mat[(src, src)]
133	1	return match_cost
134	1	if mat and (src, tar) in mat:
135	1	return mat[(src, tar)]
136	1	elif symmetric and mat and (tar, src) in mat:
137	1	return mat[(tar, src)]
138	1	return mismatch_cost
139
140	1	def __init__(self, gap_cost=1, sim_func=None, **kwargs):
141		"""Initialize NeedlemanWunsch instance.
142
143		Parameters
144		----------
145		gap_cost : float
146		The cost of an alignment gap (1 by default)
147		sim_func : function
148		A function that returns the similarity of two characters (identity
149		similarity by default)
150		**kwargs
151		Arbitrary keyword arguments
152
153
154		.. versionadded:: 0.4.0
155
156		"""
157	1	super(NeedlemanWunsch, self).__init__(**kwargs)
158	1	self._gap_cost = gap_cost
159	1	self._sim_func = sim_func
160	1	if self._sim_func is None:
161	1	self._sim_func = NeedlemanWunsch.sim_matrix
162
163	1	def sim_score(self, src, tar):
164		"""Return the Needleman-Wunsch score of two strings.
165
166		Parameters
167		----------
168		src : str
169		Source string for comparison
170		tar : str
171		Target string for comparison
172
173		Returns
174		-------
175		float
176		Needleman-Wunsch score
177
178		Examples
179		--------
180		>>> cmp = NeedlemanWunsch()
181		>>> cmp.sim_score('cat', 'hat')
182		2.0
183		>>> cmp.sim_score('Niall', 'Neil')
184		1.0
185		>>> cmp.sim_score('aluminum', 'Catalan')
186		-1.0
187		>>> cmp.sim_score('ATCG', 'TAGC')
188		0.0
189
190
191		.. versionadded:: 0.1.0
192		.. versionchanged:: 0.3.6
193		Encapsulated in class
194
195		"""
196	1	d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
197
198	1	for i in range(len(src) + 1):
199	1	d_mat[i, 0] = -(i * self._gap_cost)
200	1	for j in range(len(tar) + 1):
201	1	d_mat[0, j] = -(j * self._gap_cost)
202	1	for i in range(1, len(src) + 1):
203	1	for j in range(1, len(tar) + 1):
204	1	match = d_mat[i - 1, j - 1] + self._sim_func(
205		src[i - 1], tar[j - 1]
206		)
207	1	delete = d_mat[i - 1, j] - self._gap_cost
208	1	insert = d_mat[i, j - 1] - self._gap_cost
209	1	d_mat[i, j] = max(match, delete, insert)
210	1	return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]
211
212	1	def sim(self, src, tar):
213		"""Return the normalized Needleman-Wunsch score of two strings.
214
215		Parameters
216		----------
217		src : str
218		Source string for comparison
219		tar : str
220		Target string for comparison
221
222		Returns
223		-------
224		float
225		Normalized Needleman-Wunsch score
226
227		Examples
228		--------
229		>>> cmp = NeedlemanWunsch()
230		>>> cmp.sim('cat', 'hat')
231		0.6666666666666667
232		>>> cmp.sim('Niall', 'Neil')
233		0.22360679774997896
234		>>> round(cmp.sim('aluminum', 'Catalan'), 12)
235		0.0
236		>>> cmp.sim('cat', 'hat')
237		0.6666666666666667
238
239
240		.. versionadded:: 0.4.1
241
242		"""
243	1	if src == tar:
244	1	return 1.0
245	1	return max(0.0, self.sim_score(src, tar)) / (
246		self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5
247		)
248
249
250	1	@deprecated(
251		deprecated_in='0.4.0',
252		removed_in='0.6.0',
253		current_version=__version__,
254		details='Use the NeedlemanWunsch.dist_abs method instead.',
255		)
256	1	def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
257		"""Return the Needleman-Wunsch score of two strings.
258
259		This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`.
260
261		Parameters
262		----------
263		src : str
264		Source string for comparison
265		tar : str
266		Target string for comparison
267		gap_cost : float
268		The cost of an alignment gap (1 by default)
269		sim_func : function
270		A function that returns the similarity of two characters (identity
271		similarity by default)
272
273		Returns
274		-------
275		float
276		Needleman-Wunsch score
277
278		Examples
279		--------
280		>>> needleman_wunsch('cat', 'hat')
281		2.0
282		>>> needleman_wunsch('Niall', 'Neil')
283		1.0
284		>>> needleman_wunsch('aluminum', 'Catalan')
285		-1.0
286		>>> needleman_wunsch('ATCG', 'TAGC')
287		0.0
288
289
290		.. versionadded:: 0.1.0
291
292		"""
293	1	return NeedlemanWunsch(gap_cost, sim_func).sim_score(src, tar)
294
295
296		if __name__ == '__main__':
297		import doctest
298
299		doctest.testmod()
300

chrislit / abydos

Pull Request — master (#225)

NeedlemanWunsch.dist_abs() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like