1 | # -*- coding: utf-8 -*- |
||
2 | |||
3 | # Copyright 2014-2018 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | 1 | """abydos.distance._needleman_wunsch. |
|
20 | |||
21 | Needleman-Wunsch score |
||
22 | """ |
||
23 | |||
24 | 1 | from __future__ import ( |
|
25 | absolute_import, |
||
26 | division, |
||
27 | print_function, |
||
28 | unicode_literals, |
||
29 | ) |
||
30 | |||
31 | 1 | from numpy import float32 as np_float32 |
|
32 | 1 | from numpy import zeros as np_zeros |
|
33 | |||
34 | 1 | from six.moves import range |
|
35 | |||
36 | 1 | from ._distance import _Distance |
|
37 | 1 | from ._ident import sim_ident |
|
38 | |||
39 | 1 | __all__ = ['NeedlemanWunsch', 'needleman_wunsch'] |
|
40 | |||
41 | |||
42 | 1 | class NeedlemanWunsch(_Distance): |
|
43 | """Needleman-Wunsch score. |
||
44 | |||
45 | The Needleman-Wunsch score :cite:`Needleman:1970` is a standard edit |
||
46 | distance measure. |
||
47 | """ |
||
48 | |||
49 | 1 | @staticmethod |
|
50 | 1 | def sim_matrix( |
|
51 | src, |
||
52 | tar, |
||
53 | mat=None, |
||
54 | mismatch_cost=0, |
||
55 | match_cost=1, |
||
56 | symmetric=True, |
||
57 | alphabet=None, |
||
58 | ): |
||
59 | """Return the matrix similarity of two strings. |
||
60 | |||
61 | With the default parameters, this is identical to sim_ident. |
||
62 | It is possible for sim_matrix to return values outside of the range |
||
63 | :math:`[0, 1]`, if values outside that range are present in mat, |
||
64 | mismatch_cost, or match_cost. |
||
65 | |||
66 | Parameters |
||
67 | ---------- |
||
68 | src : str |
||
69 | Source string for comparison |
||
70 | tar : str |
||
71 | Target string for comparison |
||
72 | mat : dict |
||
73 | A dict mapping tuples to costs; the tuples are (src, tar) pairs of |
||
74 | symbols from the alphabet parameter |
||
75 | mismatch_cost : float |
||
76 | The value returned if (src, tar) is absent from mat when src does |
||
77 | not equal tar |
||
78 | match_cost : float |
||
79 | The value returned if (src, tar) is absent from mat when src equals |
||
80 | tar |
||
81 | symmetric : bool |
||
82 | True if the cost of src not matching tar is identical to the cost |
||
83 | of tar not matching src; in this case, the values in mat need only |
||
84 | contain (src, tar) or (tar, src), not both |
||
85 | alphabet : str |
||
86 | A collection of tokens from which src and tar are drawn; if this is |
||
87 | defined a ValueError is raised if either tar or src is not found in |
||
88 | alphabet |
||
89 | |||
90 | Returns |
||
91 | ------- |
||
92 | float |
||
93 | Matrix similarity |
||
94 | |||
95 | Raises |
||
96 | ------ |
||
97 | ValueError |
||
98 | src value not in alphabet |
||
99 | ValueError |
||
100 | tar value not in alphabet |
||
101 | |||
102 | Examples |
||
103 | -------- |
||
104 | >>> NeedlemanWunsch.sim_matrix('cat', 'hat') |
||
105 | 0 |
||
106 | >>> NeedlemanWunsch.sim_matrix('hat', 'hat') |
||
107 | 1 |
||
108 | |||
109 | """ |
||
110 | 1 | if alphabet: |
|
111 | 1 | alphabet = tuple(alphabet) |
|
112 | 1 | for i in src: |
|
113 | 1 | if i not in alphabet: |
|
114 | 1 | raise ValueError('src value not in alphabet') |
|
115 | 1 | for i in tar: |
|
116 | 1 | if i not in alphabet: |
|
117 | 1 | raise ValueError('tar value not in alphabet') |
|
118 | |||
119 | 1 | if src == tar: |
|
120 | 1 | if mat and (src, src) in mat: |
|
121 | 1 | return mat[(src, src)] |
|
122 | 1 | return match_cost |
|
123 | 1 | if mat and (src, tar) in mat: |
|
124 | 1 | return mat[(src, tar)] |
|
125 | 1 | elif symmetric and mat and (tar, src) in mat: |
|
126 | 1 | return mat[(tar, src)] |
|
127 | 1 | return mismatch_cost |
|
128 | |||
129 | 1 | View Code Duplication | def dist_abs(self, src, tar, gap_cost=1, sim_func=sim_ident): |
0 ignored issues
–
show
Duplication
introduced
by
![]() |
|||
130 | """Return the Needleman-Wunsch score of two strings. |
||
131 | |||
132 | Parameters |
||
133 | ---------- |
||
134 | src : str |
||
135 | Source string for comparison |
||
136 | tar : str |
||
137 | Target string for comparison |
||
138 | gap_cost : float |
||
139 | The cost of an alignment gap (1 by default) |
||
140 | sim_func : function |
||
141 | A function that returns the similarity of two characters (identity |
||
142 | similarity by default) |
||
143 | |||
144 | Returns |
||
145 | ------- |
||
146 | float |
||
147 | Needleman-Wunsch score |
||
148 | |||
149 | Examples |
||
150 | -------- |
||
151 | >>> cmp = NeedlemanWunsch() |
||
152 | >>> cmp.dist_abs('cat', 'hat') |
||
153 | 2.0 |
||
154 | >>> cmp.dist_abs('Niall', 'Neil') |
||
155 | 1.0 |
||
156 | >>> cmp.dist_abs('aluminum', 'Catalan') |
||
157 | -1.0 |
||
158 | >>> cmp.dist_abs('ATCG', 'TAGC') |
||
159 | 0.0 |
||
160 | |||
161 | """ |
||
162 | 1 | d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32) |
|
163 | |||
164 | 1 | for i in range(len(src) + 1): |
|
165 | 1 | d_mat[i, 0] = -(i * gap_cost) |
|
166 | 1 | for j in range(len(tar) + 1): |
|
167 | 1 | d_mat[0, j] = -(j * gap_cost) |
|
168 | 1 | for i in range(1, len(src) + 1): |
|
169 | 1 | for j in range(1, len(tar) + 1): |
|
170 | 1 | match = d_mat[i - 1, j - 1] + sim_func(src[i - 1], tar[j - 1]) |
|
171 | 1 | delete = d_mat[i - 1, j] - gap_cost |
|
172 | 1 | insert = d_mat[i, j - 1] - gap_cost |
|
173 | 1 | d_mat[i, j] = max(match, delete, insert) |
|
174 | 1 | return d_mat[d_mat.shape[0] - 1, d_mat.shape[1] - 1] |
|
175 | |||
176 | |||
177 | 1 | def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident): |
|
178 | """Return the Needleman-Wunsch score of two strings. |
||
179 | |||
180 | This is a wrapper for :py:meth:`NeedlemanWunsch.dist_abs`. |
||
181 | |||
182 | Parameters |
||
183 | ---------- |
||
184 | src : str |
||
185 | Source string for comparison |
||
186 | tar : str |
||
187 | Target string for comparison |
||
188 | gap_cost : float |
||
189 | The cost of an alignment gap (1 by default) |
||
190 | sim_func : function |
||
191 | A function that returns the similarity of two characters (identity |
||
192 | similarity by default) |
||
193 | |||
194 | Returns |
||
195 | ------- |
||
196 | float |
||
197 | Needleman-Wunsch score |
||
198 | |||
199 | Examples |
||
200 | -------- |
||
201 | >>> needleman_wunsch('cat', 'hat') |
||
202 | 2.0 |
||
203 | >>> needleman_wunsch('Niall', 'Neil') |
||
204 | 1.0 |
||
205 | >>> needleman_wunsch('aluminum', 'Catalan') |
||
206 | -1.0 |
||
207 | >>> needleman_wunsch('ATCG', 'TAGC') |
||
208 | 0.0 |
||
209 | |||
210 | """ |
||
211 | 1 | return NeedlemanWunsch().dist_abs(src, tar, gap_cost, sim_func) |
|
212 | |||
213 | |||
214 | if __name__ == '__main__': |
||
215 | import doctest |
||
216 | |||
217 | doctest.testmod() |
||
218 |