Test Failed
Push — master ( 23810f...afe14d )
by Chris
09:47
created

abydos/distance/_needleman_wunsch.py (1 issue)

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._needleman_wunsch.
20
21
Needleman-Wunsch score
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from numpy import float32 as np_float32
32 1
from numpy import zeros as np_zeros
33
34 1
from six.moves import range
35
36 1
from ._distance import _Distance
37 1
from ._ident import sim_ident
38
39 1
__all__ = ['NeedlemanWunsch', 'needleman_wunsch']
40
41
42 1
class NeedlemanWunsch(_Distance):
43
    """Needleman-Wunsch score.
44
45
    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
46
    distance measure.
47
    """
48
49 1
    @staticmethod
50 1
    def sim_matrix(
51
        src,
52
        tar,
53
        mat=None,
54
        mismatch_cost=0,
55
        match_cost=1,
56
        symmetric=True,
57
        alphabet=None,
58
    ):
59
        """Return the matrix similarity of two strings.
60
61
        With the default parameters, this is identical to sim_ident.
62
        It is possible for sim_matrix to return values outside of the range
63
        :math:`[0, 1]`, if values outside that range are present in mat,
64
        mismatch_cost, or match_cost.
65
66
        Parameters
67
        ----------
68
        src : str
69
            Source string for comparison
70
        tar : str
71
            Target string for comparison
72
        mat : dict
73
            A dict mapping tuples to costs; the tuples are (src, tar) pairs of
74
            symbols from the alphabet parameter
75
        mismatch_cost : float
76
            The value returned if (src, tar) is absent from mat when src does
77
            not equal tar
78
        match_cost : float
79
            The value returned if (src, tar) is absent from mat when src equals
80
            tar
81
        symmetric : bool
82
            True if the cost of src not matching tar is identical to the cost
83
            of tar not matching src; in this case, the values in mat need only
84
            contain (src, tar) or (tar, src), not both
85
        alphabet : str
86
            A collection of tokens from which src and tar are drawn; if this is
87
            defined a ValueError is raised if either tar or src is not found in
88
            alphabet
89
90
        Returns
91
        -------
92
        float
93
            Matrix similarity
94
95
        Raises
96
        ------
97
        ValueError
98
            src value not in alphabet
99
        ValueError
100
            tar value not in alphabet
101
102
        Examples
103
        --------
104
        >>> NeedlemanWunsch.sim_matrix('cat', 'hat')
105
        0
106
        >>> NeedlemanWunsch.sim_matrix('hat', 'hat')
107
        1
108
109
        """
110 1
        if alphabet:
111 1
            alphabet = tuple(alphabet)
112 1
            for i in src:
113 1
                if i not in alphabet:
114 1
                    raise ValueError('src value not in alphabet')
115 1
            for i in tar:
116 1
                if i not in alphabet:
117 1
                    raise ValueError('tar value not in alphabet')
118
119 1
        if src == tar:
120 1
            if mat and (src, src) in mat:
121 1
                return mat[(src, src)]
122 1
            return match_cost
123 1
        if mat and (src, tar) in mat:
124 1
            return mat[(src, tar)]
125 1
        elif symmetric and mat and (tar, src) in mat:
126 1
            return mat[(tar, src)]
127 1
        return mismatch_cost
128
129 1 View Code Duplication
    def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident):
0 ignored issues
show
This code seems to be duplicated in your project.
Loading history...
130
        """Return the Needleman-Wunsch score of two strings.
131
132
        Parameters
133
        ----------
134
        src : str
135
            Source string for comparison
136
        tar : str
137
            Target string for comparison
138
        gap_cost : float
139
            The cost of an alignment gap (1 by default)
140
        sim_func : function
141
            A function that returns the similarity of two characters (identity
142
            similarity by default)
143
144
        Returns
145
        -------
146
        float
147
            Needleman-Wunsch score
148
149
        Examples
150
        --------
151
        >>> cmp = NeedlemanWunsch()
152
        >>> cmp.dist_abs('cat', 'hat')
153
        2.0
154
        >>> cmp.dist_abs('Niall', 'Neil')
155
        1.0
156
        >>> cmp.dist_abs('aluminum', 'Catalan')
157
        -1.0
158
        >>> cmp.dist_abs('ATCG', 'TAGC')
159
        0.0
160
161
        """
162 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
163
164 1
        for i in range(len(src) + 1):
165 1
            d_mat[i, 0] = -(i * gap_cost)
166 1
        for j in range(len(tar) + 1):
167 1
            d_mat[0, j] = -(j * gap_cost)
168 1
        for i in range(1, len(src) + 1):
169 1
            for j in range(1, len(tar) + 1):
170 1
                match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1])
171 1
                delete = d_mat[i - 1, j] - gap_cost
172 1
                insert = d_mat[i, j - 1] - gap_cost
173 1
                d_mat[i, j] = max(match, delete, insert)
174 1
        return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]
175
176
177 1
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
178
    """Return the Needleman-Wunsch score of two strings.
179
180
    This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`.
181
182
    Parameters
183
    ----------
184
    src : str
185
        Source string for comparison
186
    tar : str
187
        Target string for comparison
188
    gap_cost : float
189
        The cost of an alignment gap (1 by default)
190
    sim_func : function
191
        A function that returns the similarity of two characters (identity
192
        similarity by default)
193
194
    Returns
195
    -------
196
    float
197
        Needleman-Wunsch score
198
199
    Examples
200
    --------
201
    >>> needleman_wunsch('cat', 'hat')
202
    2.0
203
    >>> needleman_wunsch('Niall', 'Neil')
204
    1.0
205
    >>> needleman_wunsch('aluminum', 'Catalan')
206
    -1.0
207
    >>> needleman_wunsch('ATCG', 'TAGC')
208
    0.0
209
210
    """
211 1
    return NeedlemanWunsch().dist_abs(src, tar, gap_cost, sim_func)
212
213
214
if __name__ == '__main__':
215
    import doctest
216
217
    doctest.testmod()
218