Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.distance._seqalign.smith_waterman()   A

Complexity

Conditions 1

Size

Total Lines 27
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 27
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 4
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.seqalign.
20
21
The distance.seqalign module implements string edit distance functions
22
used in sequence alignment:
23
24
    - Matrix similarity
25
    - Needleman-Wunsch score
26
    - Smith-Waterman score
27
    - Gotoh score
28
"""
29
30 1
from __future__ import unicode_literals
31
32 1
from numpy import float32 as np_float32
33 1
from numpy import zeros as np_zeros
34
35 1
from six.moves import range
36
37 1
from ._basic import sim_ident
38 1
from ._distance import Distance
39
40 1
__all__ = [
41
    'Gotoh',
42
    'NeedlemanWunsch',
43
    'SmithWaterman',
44
    'gotoh',
45
    'needleman_wunsch',
46
    'smith_waterman',
47
]
48
49
50 1
class NeedlemanWunsch(Distance):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
51
    """Needleman-Wunsch score.
52
53
    The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit
54
    distance measure.
55
    """
56
57 1
    @staticmethod
58 1
    def sim_matrix(
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
59
        src,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
60
        tar,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
61
        mat=None,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
62
        mismatch_cost=0,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
63
        match_cost=1,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
64
        symmetric=True,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
65
        alphabet=None,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
66
    ):
67
        """Return the matrix similarity of two strings.
68
69
        With the default parameters, this is identical to sim_ident.
70
        It is possible for sim_matrix to return values outside of the range
71
        :math:`[0, 1]`, if values outside that range are present in mat,
72
        mismatch_cost, or match_cost.
73
74
        Args:
75
            src (str): Source string for comparison
76
            tar (str): Target string for comparison
77
            mat (dict): A dict mapping tuples to costs; the tuples are (src,
78
                tar) pairs of symbols from the alphabet parameter
79
            mismatch_cost (float): the value returned if (src, tar) is absent
80
                from mat when src does not equal tar
81
            match_cost (float): the value returned if (src, tar) is absent from
82
                mat when src equals tar
83
            symmetric (bool): True if the cost of src not matching tar is
84
                identical to the cost of tar not matching src; in this case,
85
                the values in mat need only contain (src, tar) or (tar, src),
86
                not both
87
            alphabet (str): a collection of tokens from which src and tar are
88
                drawn; if this is defined a ValueError is raised if either tar
89
                or src is not found in alphabet
90
91
        Returns:
92
            float: Matrix similarity
93
94
        Raises:
95
            ValueError: src value not in alphabet
96
            ValueError: tar value not in alphabet
97
98
        Examples:
99
            >>> NeedlemanWunsch.sim_matrix('cat', 'hat')
100
            0
101
            >>> NeedlemanWunsch.sim_matrix('hat', 'hat')
102
            1
103
104
        """
105 1
        if alphabet:
106 1
            alphabet = tuple(alphabet)
107 1
            for i in src:
108 1
                if i not in alphabet:
109 1
                    raise ValueError('src value not in alphabet')
110 1
            for i in tar:
111 1
                if i not in alphabet:
112 1
                    raise ValueError('tar value not in alphabet')
113
114 1
        if src == tar:
115 1
            if mat and (src, src) in mat:
116 1
                return mat[(src, src)]
117 1
            return match_cost
118 1
        if mat and (src, tar) in mat:
119 1
            return mat[(src, tar)]
120 1
        elif symmetric and mat and (tar, src) in mat:
121 1
            return mat[(tar, src)]
122 1
        return mismatch_cost
123
124 1 View Code Duplication
    def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
125
        """Return the Needleman-Wunsch score of two strings.
126
127
        Args:
128
            src (str): Source string for comparison
129
            tar (str): Target string for comparison
130
            gap_cost (float): the cost of an alignment gap (1 by default)
131
            sim_func (function): a function that returns the similarity of two
132
                characters (identity similarity by default)
133
134
        Returns:
135
            float: Needleman-Wunsch score
136
137
        Examples:
138
            >>> cmp = NeedlemanWunsch()
139
            >>> cmp.dist_abs('cat', 'hat')
140
            2.0
141
            >>> cmp.dist_abs('Niall', 'Neil')
142
            1.0
143
            >>> cmp.dist_abs('aluminum', 'Catalan')
144
            -1.0
145
            >>> cmp.dist_abs('ATCG', 'TAGC')
146
            0.0
147
148
        """
149 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
150
151 1
        for i in range(len(src) + 1):
152 1
            d_mat[i, 0] = -(i * gap_cost)
153 1
        for j in range(len(tar) + 1):
154 1
            d_mat[0, j] = -(j * gap_cost)
155 1
        for i in range(1, len(src) + 1):
156 1
            for j in range(1, len(tar) + 1):
157 1
                match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1])
158 1
                delete = d_mat[i - 1, j] - gap_cost
159 1
                insert = d_mat[i, j - 1] - gap_cost
160 1
                d_mat[i, j] = max(match, delete, insert)
161 1
        return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]
162
163
164 1
def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident):
165
    """Return the Needleman-Wunsch score of two strings.
166
167
    This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`.
168
169
    Args:
170
        src (str): Source string for comparison
171
        tar (str): Target string for comparison
172
        gap_cost (float): the cost of an alignment gap (1 by default)
173
        sim_func (function): a function that returns the similarity of two
174
            characters (identity similarity by default)
175
176
    Returns:
177
        float: Needleman-Wunsch score
178
179
    Examples:
180
        >>> needleman_wunsch('cat', 'hat')
181
        2.0
182
        >>> needleman_wunsch('Niall', 'Neil')
183
        1.0
184
        >>> needleman_wunsch('aluminum', 'Catalan')
185
        -1.0
186
        >>> needleman_wunsch('ATCG', 'TAGC')
187
        0.0
188
189
    """
190 1
    return NeedlemanWunsch().dist_abs(src, tar, gap_cost, sim_func)
191
192
193 1
class SmithWaterman(NeedlemanWunsch):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
194
    """Smith-Waterman score.
195
196
    The Smith-Waterman score :cite:`Smith:1981` is a standard edit distance
197
    measure, differing from Needleman-Wunsch in that it focuses on local
198
    alignment and disallows negative scores.
199
    """
200
201 1 View Code Duplication
    def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
202
        """Return the Smith-Waterman score of two strings.
203
204
        Args:
205
            src (str): Source string for comparison
206
            tar (str): Target string for comparison
207
            gap_cost (float): the cost of an alignment gap (1 by default)
208
            sim_func (function): a function that returns the similarity of two
209
                characters (identity similarity by default)
210
211
        Returns:
212
            float: Smith-Waterman score
213
214
        Examples:
215
            >>> cmp = SmithWaterman()
216
            >>> cmp.dist_abs('cat', 'hat')
217
            2.0
218
            >>> cmp.dist_abs('Niall', 'Neil')
219
            1.0
220
            >>> cmp.dist_abs('aluminum', 'Catalan')
221
            0.0
222
            >>> cmp.dist_abs('ATCG', 'TAGC')
223
            1.0
224
225
        """
226 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
227
228 1
        for i in range(len(src) + 1):
229 1
            d_mat[i, 0] = 0
230 1
        for j in range(len(tar) + 1):
231 1
            d_mat[0, j] = 0
232 1
        for i in range(1, len(src) + 1):
233 1
            for j in range(1, len(tar) + 1):
234 1
                match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1])
235 1
                delete = d_mat[i - 1, j] - gap_cost
236 1
                insert = d_mat[i, j - 1] - gap_cost
237 1
                d_mat[i, j] = max(0, match, delete, insert)
238 1
        return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1]
239
240
241 1
def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident):
242
    """Return the Smith-Waterman score of two strings.
243
244
    This is a wrapper for :py:meth:`SmithWaterman.dist_abs`.
245
246
    Args:
247
        src (str): Source string for comparison
248
        tar (str): Target string for comparison
249
        gap_cost (float): the cost of an alignment gap (1 by default)
250
        sim_func (function): a function that returns the similarity of two
251
            characters (identity similarity by default)
252
253
    Returns:
254
        float: Smith-Waterman score
255
256
    Examples:
257
        >>> smith_waterman('cat', 'hat')
258
        2.0
259
        >>> smith_waterman('Niall', 'Neil')
260
        1.0
261
        >>> smith_waterman('aluminum', 'Catalan')
262
        0.0
263
        >>> smith_waterman('ATCG', 'TAGC')
264
        1.0
265
266
    """
267 1
    return SmithWaterman().dist_abs(src, tar, gap_cost, sim_func)
268
269
270 1
class Gotoh(NeedlemanWunsch):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
271
    """Gotoh score.
272
273
    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
274
    affine gap penalties.
275
    """
276
277 1
    def dist_abs(self, src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
Bug introduced by
Parameters differ from overridden 'dist_abs' method
Loading history...
278
        """Return the Gotoh score of two strings.
279
280
        Args:
281
            src (str): Source string for comparison
282
            tar (str): Target string for comparison
283
            gap_open (float): the cost of an open alignment gap (1 by default)
284
            gap_ext (float): the cost of an alignment gap extension (0.4 by
285
                default)
286
            sim_func (function): a function that returns the similarity of two
287
                characters (identity similarity by default)
288
289
        Returns:
290
            float: Gotoh score
291
292
        Examples:
293
            >>> cmp = Gotoh()
294
            >>> cmp.dist_abs('cat', 'hat')
295
            2.0
296
            >>> cmp.dist_abs('Niall', 'Neil')
297
            1.0
298
            >>> round(cmp.dist_abs('aluminum', 'Catalan'), 12)
299
            -0.4
300
            >>> cmp.dist_abs('cat', 'hat')
301
            2.0
302
303
        """
304 1
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
305 1
        p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
306 1
        q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
307
308 1
        d_mat[0, 0] = 0
309 1
        p_mat[0, 0] = float('-inf')
310 1
        q_mat[0, 0] = float('-inf')
311 1
        for i in range(1, len(src) + 1):
312 1
            d_mat[i, 0] = float('-inf')
313 1
            p_mat[i, 0] = -gap_open - gap_ext * (i - 1)
314 1
            q_mat[i, 0] = float('-inf')
315 1
            q_mat[i, 1] = -gap_open
316 1
        for j in range(1, len(tar) + 1):
317 1
            d_mat[0, j] = float('-inf')
318 1
            p_mat[0, j] = float('-inf')
319 1
            p_mat[1, j] = -gap_open
320 1
            q_mat[0, j] = -gap_open - gap_ext * (j - 1)
321
322 1
        for i in range(1, len(src) + 1):
323 1
            for j in range(1, len(tar) + 1):
324 1
                sim_val = sim_func(src[i - 1], tar[j - 1])
325 1
                d_mat[i, j] = max(
326
                    d_mat[i - 1, j - 1] + sim_val,
327
                    p_mat[i - 1, j - 1] + sim_val,
328
                    q_mat[i - 1, j - 1] + sim_val,
329
                )
330
331 1
                p_mat[i, j] = max(
332
                    d_mat[i - 1, j] - gap_open, p_mat[i - 1, j] - gap_ext
333
                )
334
335 1
                q_mat[i, j] = max(
336
                    d_mat[i, j - 1] - gap_open, q_mat[i, j - 1] - gap_ext
337
                )
338
339 1
        i, j = (n - 1 for n in d_mat.shape)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable n does not seem to be defined.
Loading history...
340 1
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
341
342
343 1
def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident):
344
    """Return the Gotoh score of two strings.
345
346
    This is a wrapper for :py:meth:`Gotoh.dist_abs`.
347
348
    Args:
349
        src (str): Source string for comparison
350
        tar (str): Target string for comparison
351
        gap_open (float): the cost of an open alignment gap (1 by default)
352
        gap_ext (float): the cost of an alignment gap extension (0.4 by
353
            default)
354
        sim_func (function): a function that returns the similarity of two
355
            characters (identity similarity by default)
356
357
    Returns:
358
        float: Gotoh score
359
360
    Examples:
361
        >>> gotoh('cat', 'hat')
362
        2.0
363
        >>> gotoh('Niall', 'Neil')
364
        1.0
365
        >>> round(gotoh('aluminum', 'Catalan'), 12)
366
        -0.4
367
        >>> gotoh('cat', 'hat')
368
        2.0
369
370
    """
371 1
    return Gotoh().dist_abs(src, tar, gap_open, gap_ext, sim_func)
372
373
374
if __name__ == '__main__':
375
    import doctest
376
377
    doctest.testmod()
378