@@ 2253-2295 (lines=43) @@ | ||
2250 | return mismatch_cost |
|
2251 | ||
2252 | ||
2253 | def needleman_wunsch(src, tar, gap_cost=1, sim_func=sim_ident): |
|
2254 | """Return the Needleman-Wunsch score of two strings. |
|
2255 | ||
2256 | Needleman-Wunsch score |
|
2257 | ||
2258 | This is the standard edit distance measure. |
|
2259 | ||
2260 | Cf. https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm |
|
2261 | ||
2262 | Cf. |
|
2263 | http://csb.stanford.edu/class/public/readings/Bioinformatics_I_Lecture6/Needleman_Wunsch_JMB_70_Global_alignment.pdf |
|
2264 | ||
2265 | :param str src, tar: two strings to be compared |
|
2266 | :param float gap_cost: the cost of an alignment gap (1 by default) |
|
2267 | :param function sim_func: a function that returns the similarity of two |
|
2268 | characters (identity similarity by default) |
|
2269 | :returns: Needleman-Wunsch score |
|
2270 | :rtype: int (in fact dependent on the gap_cost & return value of sim_func) |
|
2271 | ||
2272 | >>> needleman_wunsch('cat', 'hat') |
|
2273 | 2.0 |
|
2274 | >>> needleman_wunsch('Niall', 'Neil') |
|
2275 | 1.0 |
|
2276 | >>> needleman_wunsch('aluminum', 'Catalan') |
|
2277 | -1.0 |
|
2278 | >>> needleman_wunsch('ATCG', 'TAGC') |
|
2279 | 0.0 |
|
2280 | """ |
|
2281 | # pylint: disable=no-member |
|
2282 | d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float) |
|
2283 | # pylint: enable=no-member |
|
2284 | ||
2285 | for i in range(len(src)+1): |
|
2286 | d_mat[i, 0] = -(i * gap_cost) |
|
2287 | for j in range(len(tar)+1): |
|
2288 | d_mat[0, j] = -(j * gap_cost) |
|
2289 | for i in range(1, len(src)+1): |
|
2290 | for j in range(1, len(tar)+1): |
|
2291 | match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1]) |
|
2292 | delete = d_mat[i-1, j] - gap_cost |
|
2293 | insert = d_mat[i, j-1] - gap_cost |
|
2294 | d_mat[i, j] = max(match, delete, insert) |
|
2295 | return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1] |
|
2296 | ||
2297 | ||
2298 | def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident): |
|
@@ 2298-2337 (lines=40) @@ | ||
2295 | return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1] |
|
2296 | ||
2297 | ||
2298 | def smith_waterman(src, tar, gap_cost=1, sim_func=sim_ident): |
|
2299 | """Return the Smith-Waterman score of two strings. |
|
2300 | ||
2301 | Smith-Waterman score |
|
2302 | ||
2303 | This is the standard edit distance measure. |
|
2304 | ||
2305 | Cf. https://en.wikipedia.org/wiki/Smith–Waterman_algorithm |
|
2306 | ||
2307 | :param str src, tar: two strings to be compared |
|
2308 | :param float gap_cost: the cost of an alignment gap (1 by default) |
|
2309 | :param function sim_func: a function that returns the similarity of two |
|
2310 | characters (identity similarity by default) |
|
2311 | :returns: Smith-Waterman score |
|
2312 | :rtype: int (in fact dependent on the gap_cost & return value of sim_func) |
|
2313 | ||
2314 | >>> smith_waterman('cat', 'hat') |
|
2315 | 2.0 |
|
2316 | >>> smith_waterman('Niall', 'Neil') |
|
2317 | 1.0 |
|
2318 | >>> smith_waterman('aluminum', 'Catalan') |
|
2319 | 0.0 |
|
2320 | >>> smith_waterman('ATCG', 'TAGC') |
|
2321 | 1.0 |
|
2322 | """ |
|
2323 | # pylint: disable=no-member |
|
2324 | d_mat = np.zeros((len(src)+1, len(tar)+1), dtype=np.float) |
|
2325 | # pylint: enable=no-member |
|
2326 | ||
2327 | for i in range(len(src)+1): |
|
2328 | d_mat[i, 0] = 0 |
|
2329 | for j in range(len(tar)+1): |
|
2330 | d_mat[0, j] = 0 |
|
2331 | for i in range(1, len(src)+1): |
|
2332 | for j in range(1, len(tar)+1): |
|
2333 | match = d_mat[i-1, j-1] + sim_func(src[i-1], tar[j-1]) |
|
2334 | delete = d_mat[i-1, j] - gap_cost |
|
2335 | insert = d_mat[i, j-1] - gap_cost |
|
2336 | d_mat[i, j] = max(0, match, delete, insert) |
|
2337 | return d_mat[d_mat.shape[0]-1, d_mat.shape[1]-1] |
|
2338 | ||
2339 | ||
2340 | def gotoh(src, tar, gap_open=1, gap_ext=0.4, sim_func=sim_ident): |