Completed
Pull Request — master (#225)
by Chris
09:15
created

NeedlemanWunsch.dist_abs()   A

Complexity

Conditions 5

Size

Total Lines 48
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 14
dl 0
loc 48
ccs 13
cts 13
cp 1
rs 9.2333
c 0
b 0
f 0
cc 5
nop 3
crap 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._needleman_wunsch.
20
21
Needleman-Wunsch score
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from deprecation import deprecated
32
33 1
from numpy import float32 as np_float32
34 1
from numpy import zeros as np_zeros
35
36 1
from six.moves import range
37
38 1
from ._distance import _Distance
39 1
from ._ident import sim_ident
40 1
from .. import __version__
41
42 1
__all__ = ['NeedlemanWunsch', 'needleman_wunsch']
43
44
45 1
class NeedlemanWunsch(_Distance):
46
    """Needleman-Wunsch score.
47
48
    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
49
    distance measure.
50
51
52
    .. versionadded:: 0.3.6
53
    """
54
55 1
    @staticmethod
56 1
    def sim_matrix(
57
        src,
58
        tar,
59
        mat=None,
60
        mismatch_cost=0,
61
        match_cost=1,
62
        symmetric=True,
63
        alphabet=None,
64
    ):
65
        """Return the matrix similarity of two strings.
66
67
        With the default parameters, this is identical to sim_ident.
68
        It is possible for sim_matrix to return values outside of the range
69
        :math:`[0, 1]`, if values outside that range are present in mat,
70
        mismatch_cost, or match_cost.
71
72
        Parameters
73
        ----------
74
        src : str
75
            Source string for comparison
76
        tar : str
77
            Target string for comparison
78
        mat : dict
79
            A dict mapping tuples to costs; the tuples are (src, tar) pairs of
80
            symbols from the alphabet parameter
81
        mismatch_cost : float
82
            The value returned if (src, tar) is absent from mat when src does
83
            not equal tar
84
        match_cost : float
85
            The value returned if (src, tar) is absent from mat when src equals
86
            tar
87
        symmetric : bool
88
            True if the cost of src not matching tar is identical to the cost
89
            of tar not matching src; in this case, the values in mat need only
90
            contain (src, tar) or (tar, src), not both
91
        alphabet : str
92
            A collection of tokens from which src and tar are drawn; if this is
93
            defined a ValueError is raised if either tar or src is not found in
94
            alphabet
95
96
        Returns
97
        -------
98
        float
99
            Matrix similarity
100
101
        Raises
102
        ------
103
        ValueError
104
            src value not in alphabet
105
        ValueError
106
            tar value not in alphabet
107
108
        Examples
109
        --------
110
        >>> NeedlemanWunsch.sim_matrix('cat', 'hat')
111
        0
112
        >>> NeedlemanWunsch.sim_matrix('hat', 'hat')
113
        1
114
115
116
        .. versionadded:: 0.1.0
117
        .. versionchanged:: 0.3.6
118
            Encapsulated in class
119
120
        """
121 1
        if alphabet:
122 1
            alphabet = tuple(alphabet)
123 1
            for i in src:
124 1
                if i not in alphabet:
125 1
                    raise ValueError('src value not in alphabet')
126 1
            for i in tar:
127 1
                if i not in alphabet:
128 1
                    raise ValueError('tar value not in alphabet')
129
130 1
        if src == tar:
131 1
            if mat and (src, src) in mat:
132 1
                return mat[(src, src)]
133 1
            return match_cost
134 1
        if mat and (src, tar) in mat:
135 1
            return mat[(src, tar)]
136 1
        elif symmetric and mat and (tar, src) in mat:
137 1
            return mat[(tar, src)]
138 1
        return mismatch_cost
139
140 1
    def __init__(self, gap_cost=1, sim_func=None, **kwargs):
141
        """Initialize NeedlemanWunsch instance.
142
143
        Parameters
144
        ----------
145
        gap_cost : float
146
            The cost of an alignment gap (1 by default)
147
        sim_func : function
148
            A function that returns the similarity of two characters (identity
149
            similarity by default)
150
        **kwargs
151
            Arbitrary keyword arguments
152
153
154
        .. versionadded:: 0.4.0
155
156
        """
157 1
        super(NeedlemanWunsch, self).__init__(**kwargs)
158 1
        self._gap_cost = gap_cost
159 1
        self._sim_func = sim_func
160 1
        if self._sim_func is None:
161 1
            self._sim_func = NeedlemanWunsch.sim_matrix
162
163 1
    def sim_score(self, src, tar):
164
        """Return the Needleman-Wunsch score of two strings.
165
166
        Parameters
167
        ----------
168
        src : str
169
            Source string for comparison
170
        tar : str
171
            Target string for comparison
172
173
        Returns
174
        -------
175
        float
176
            Needleman-Wunsch score
177
178
        Examples
179
        --------
180
        >>> cmp = NeedlemanWunsch()
181
        >>> cmp.sim_score('cat', 'hat')
182
        2.0
183
        >>> cmp.sim_score('Niall', 'Neil')
184
        1.0
185
        >>> cmp.sim_score('aluminum', 'Catalan')
186
        -1.0
187
        >>> cmp.sim_score('ATCG', 'TAGC')
188
        0.0
189
190
191
        .. versionadded:: 0.1.0
192
        .. versionchanged:: 0.3.6
193
            Encapsulated in class
194
195
        """
196 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
197
198 1
        for i in range(len(src) + 1):
199 1
            d_mat[i, 0] = -(i * self._gap_cost)
200 1
        for j in range(len(tar) + 1):
201 1
            d_mat[0, j] = -(j * self._gap_cost)
202 1
        for i in range(1, len(src) + 1):
203 1
            for j in range(1, len(tar) + 1):
204 1
                match = d_mat[i - 1, j - 1] + self._sim_func(
205
                    src[i - 1], tar[j - 1]
206
                )
207 1
                delete = d_mat[i - 1, j] - self._gap_cost
208 1
                insert = d_mat[i, j - 1] - self._gap_cost
209 1
                d_mat[i, j] = max(match, delete, insert)
210 1
        return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]
211
212 1
    def sim(self, src, tar):
213
        """Return the normalized Needleman-Wunsch score of two strings.
214
215
        Parameters
216
        ----------
217
        src : str
218
            Source string for comparison
219
        tar : str
220
            Target string for comparison
221
222
        Returns
223
        -------
224
        float
225
            Normalized Needleman-Wunsch score
226
227
        Examples
228
        --------
229
        >>> cmp = NeedlemanWunsch()
230
        >>> cmp.sim('cat', 'hat')
231
        0.6666666666666667
232
        >>> cmp.sim('Niall', 'Neil')
233
        0.22360679774997896
234
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
235
        0.0
236
        >>> cmp.sim('cat', 'hat')
237
        0.6666666666666667
238
239
240
        .. versionadded:: 0.4.1
241
242
        """
243 1
        if src == tar:
244 1
            return 1.0
245 1
        return max(0.0, self.sim_score(src, tar)) / (
246
            self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5
247
        )
248
249
250 1
@deprecated(
251
    deprecated_in='0.4.0',
252
    removed_in='0.6.0',
253
    current_version=__version__,
254
    details='Use the NeedlemanWunsch.dist_abs method instead.',
255
)
256 1
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
257
    """Return the Needleman-Wunsch score of two strings.
258
259
    This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`.
260
261
    Parameters
262
    ----------
263
    src : str
264
        Source string for comparison
265
    tar : str
266
        Target string for comparison
267
    gap_cost : float
268
        The cost of an alignment gap (1 by default)
269
    sim_func : function
270
        A function that returns the similarity of two characters (identity
271
        similarity by default)
272
273
    Returns
274
    -------
275
    float
276
        Needleman-Wunsch score
277
278
    Examples
279
    --------
280
    >>> needleman_wunsch('cat', 'hat')
281
    2.0
282
    >>> needleman_wunsch('Niall', 'Neil')
283
    1.0
284
    >>> needleman_wunsch('aluminum', 'Catalan')
285
    -1.0
286
    >>> needleman_wunsch('ATCG', 'TAGC')
287
    0.0
288
289
290
    .. versionadded:: 0.1.0
291
292
    """
293 1
    return NeedlemanWunsch(gap_cost, sim_func).sim_score(src, tar)
294
295
296
if __name__ == '__main__':
297
    import doctest
298
299
    doctest.testmod()
300