1 | # Copyright 2014-2020 by Christopher C. Little. |
||
2 | # This file is part of Abydos. |
||
3 | # |
||
4 | # Abydos is free software: you can redistribute it and/or modify |
||
5 | # it under the terms of the GNU General Public License as published by |
||
6 | # the Free Software Foundation, either version 3 of the License, or |
||
7 | # (at your option) any later version. |
||
8 | # |
||
9 | # Abydos is distributed in the hope that it will be useful, |
||
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
12 | # GNU General Public License for more details. |
||
13 | # |
||
14 | # You should have received a copy of the GNU General Public License |
||
15 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
16 | |||
17 | """abydos.distance._gotoh. |
||
18 | |||
19 | 1 | Gotoh score |
|
20 | """ |
||
21 | from typing import Any, Callable, Optional, cast |
||
22 | |||
23 | from numpy import float_ as np_float |
||
24 | 1 | from numpy import zeros as np_zeros |
|
25 | |||
26 | from ._needleman_wunsch import NeedlemanWunsch |
||
27 | |||
28 | __all__ = ['Gotoh'] |
||
29 | |||
30 | |||
31 | 1 | class Gotoh(NeedlemanWunsch): |
|
32 | """Gotoh score. |
||
33 | 1 | ||
34 | 1 | The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with |
|
35 | affine gap penalties. |
||
36 | 1 | ||
37 | .. versionadded:: 0.3.6 |
||
38 | 1 | """ |
|
39 | 1 | ||
40 | 1 | def __init__( |
|
41 | self, |
||
42 | 1 | gap_open: float = 1, |
|
43 | gap_ext: float = 0.4, |
||
44 | sim_func: Optional[Callable[[str, str], float]] = None, |
||
45 | 1 | **kwargs: Any |
|
46 | ) -> None: |
||
47 | """Initialize Gotoh instance. |
||
48 | |||
49 | Parameters |
||
50 | ---------- |
||
51 | gap_open : float |
||
52 | The cost of an open alignment gap (1 by default) |
||
53 | gap_ext : float |
||
54 | 1 | The cost of an alignment gap extension (0.4 by default) |
|
55 | sim_func : function |
||
56 | A function that returns the similarity of two characters (identity |
||
57 | similarity by default) |
||
58 | **kwargs |
||
59 | Arbitrary keyword arguments |
||
60 | |||
61 | |||
62 | .. versionadded:: 0.4.0 |
||
63 | |||
64 | """ |
||
65 | super(Gotoh, self).__init__(**kwargs) |
||
66 | self._gap_open = gap_open |
||
67 | self._gap_ext = gap_ext |
||
68 | self._sim_func = cast( |
||
69 | Callable[[str, str], float], |
||
70 | NeedlemanWunsch.sim_matrix if sim_func is None else sim_func, |
||
71 | ) # type: Callable[[str, str], float] |
||
72 | |||
73 | 1 | def sim_score(self, src: str, tar: str) -> float: |
|
74 | 1 | """Return the Gotoh score of two strings. |
|
75 | 1 | ||
76 | 1 | Parameters |
|
77 | 1 | ---------- |
|
78 | 1 | src : str |
|
79 | Source string for comparison |
||
80 | 1 | tar : str |
|
81 | Target string for comparison |
||
82 | |||
83 | Returns |
||
84 | ------- |
||
85 | float |
||
86 | Gotoh score |
||
87 | |||
88 | Examples |
||
89 | -------- |
||
90 | >>> cmp = Gotoh() |
||
91 | >>> cmp.sim_score('cat', 'hat') |
||
92 | 2.0 |
||
93 | >>> cmp.sim_score('Niall', 'Neil') |
||
94 | 1.0 |
||
95 | >>> round(cmp.sim_score('aluminum', 'Catalan'), 12) |
||
96 | -0.4 |
||
97 | >>> cmp.sim_score('cat', 'hat') |
||
98 | 2.0 |
||
99 | |||
100 | |||
101 | .. versionadded:: 0.1.0 |
||
102 | .. versionchanged:: 0.3.6 |
||
103 | Encapsulated in class |
||
104 | |||
105 | """ |
||
106 | d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) |
||
107 | p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) |
||
108 | q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) |
||
109 | |||
110 | d_mat[0, 0] = 0 |
||
111 | p_mat[0, 0] = float('-inf') |
||
112 | q_mat[0, 0] = float('-inf') |
||
113 | 1 | for i in range(1, len(src) + 1): |
|
114 | 1 | d_mat[i, 0] = float('-inf') |
|
115 | 1 | p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1) |
|
116 | q_mat[i, 0] = float('-inf') |
||
117 | 1 | if len(tar) > 1: |
|
118 | 1 | q_mat[i, 1] = -self._gap_open |
|
119 | 1 | for j in range(1, len(tar) + 1): |
|
120 | 1 | d_mat[0, j] = float('-inf') |
|
121 | 1 | p_mat[0, j] = float('-inf') |
|
122 | 1 | if len(src) > 1: |
|
123 | 1 | p_mat[1, j] = -self._gap_open |
|
124 | 1 | q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1) |
|
125 | 1 | ||
126 | 1 | for i in range(1, len(src) + 1): |
|
127 | 1 | for j in range(1, len(tar) + 1): |
|
128 | 1 | sim_val = self._sim_func(src[i - 1], tar[j - 1]) |
|
129 | 1 | d_mat[i, j] = max( |
|
130 | 1 | d_mat[i - 1, j - 1] + sim_val, |
|
131 | 1 | p_mat[i - 1, j - 1] + sim_val, |
|
132 | q_mat[i - 1, j - 1] + sim_val, |
||
133 | 1 | ) |
|
134 | 1 | ||
135 | 1 | p_mat[i, j] = max( |
|
136 | 1 | d_mat[i - 1, j] - self._gap_open, |
|
137 | p_mat[i - 1, j] - self._gap_ext, |
||
138 | ) |
||
139 | |||
140 | q_mat[i, j] = max( |
||
141 | d_mat[i, j - 1] - self._gap_open, |
||
142 | 1 | q_mat[i, j - 1] - self._gap_ext, |
|
143 | ) |
||
144 | |||
145 | i, j = (n - 1 for n in d_mat.shape) |
||
0 ignored issues
–
show
Comprehensibility
Best Practice
introduced
by
![]() |
|||
146 | return cast(float, max(d_mat[i, j], p_mat[i, j], q_mat[i, j])) |
||
147 | 1 | ||
148 | def sim(self, src: str, tar: str) -> float: |
||
149 | """Return the normalized Gotoh score of two strings. |
||
150 | |||
151 | Parameters |
||
152 | 1 | ---------- |
|
153 | 1 | src : str |
|
154 | Source string for comparison |
||
155 | 1 | tar : str |
|
156 | Target string for comparison |
||
157 | |||
158 | Returns |
||
159 | ------- |
||
160 | float |
||
161 | Normalized Gotoh score |
||
162 | |||
163 | Examples |
||
164 | -------- |
||
165 | >>> cmp = Gotoh() |
||
166 | >>> cmp.sim('cat', 'hat') |
||
167 | 0.6666666666666667 |
||
168 | >>> cmp.sim('Niall', 'Neil') |
||
169 | 0.22360679774997896 |
||
170 | >>> round(cmp.sim('aluminum', 'Catalan'), 12) |
||
171 | 0.0 |
||
172 | >>> cmp.sim('cat', 'hat') |
||
173 | 0.6666666666666667 |
||
174 | |||
175 | |||
176 | .. versionadded:: 0.4.1 |
||
177 | |||
178 | """ |
||
179 | if src == tar: |
||
180 | return 1.0 |
||
181 | return max(0.0, self.sim_score(src, tar)) / ( |
||
182 | self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5 |
||
183 | ) |
||
184 | |||
185 | |||
186 | 1 | if __name__ == '__main__': |
|
187 | 1 | import doctest |
|
188 | 1 | ||
189 | doctest.testmod() |
||
190 |