Conditions | 23 |
Total Lines | 156 |
Code Lines | 80 |
Lines | 0 |
Ratio | 0 % |
Tests | 49 |
CRAP Score | 23 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex methods like abydos.distance._rees_levenshtein.ReesLevenshtein.dist_abs() often do a lot of different things, which usually means that the enclosing class does too. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Copyright 2019-2020 by Christopher C. Little. |
||
def dist_abs(self, src: str, tar: str) -> float:
    """Return the Rees-Levenshtein distance of two strings.

    This is a straightforward port of the PL/SQL implementation at
    https://confluence.csiro.au/public/taxamatch/the-mdld-modified-damerau-levenshtein-distance-algorithm

    The shared prefix and suffix of the two strings are stripped first.
    The remaining cores are compared with a Levenshtein-style table
    that also tests, for every cell, whether two adjacent blocks of
    characters appear swapped. The largest block length tried is
    bounded by half the length of either core and by
    ``self._block_limit``.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Rees-Levenshtein distance (always an integral value)

    Examples
    --------
    >>> cmp = ReesLevenshtein()
    >>> cmp.dist_abs('cat', 'hat')
    1
    >>> cmp.dist_abs('Niall', 'Neil')
    3
    >>> cmp.dist_abs('aluminum', 'Catalan')
    7
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2


    .. versionadded:: 0.4.0

    """
    # Trivial cases that need no table at all.
    if tar == src:
        return 0
    if not src:
        return len(tar)
    if not tar:
        return len(src)
    if len(src) == 1 and len(tar) == 1:
        return 1

    def _substr(string: str, start: int, length: int) -> str:
        """Return ``length`` characters of ``string`` from ``start``.

        A positive ``start`` is treated as a 1-based position. Zero or
        a negative ``start`` is shifted by ``len(string) - 1`` instead.
        """
        if start > 0:
            start -= 1
        else:
            start += len(string) - 1
        return string[start : start + length]

    core_src = str(src)
    core_tar = str(tar)

    # Trim common leading characters. The strings are known to differ,
    # so both cores can never run empty at the same time and the loops
    # below always terminate.
    while core_src[:1] == core_tar[:1]:
        core_src = core_src[1:]
        core_tar = core_tar[1:]

    # Then trim common trailing characters.
    while core_src[-1:] == core_tar[-1:]:
        core_src = core_src[:-1]
        core_tar = core_tar[:-1]

    src_len = len(core_src)
    tar_len = len(core_tar)

    # One core exhausted: the rest of the other is pure insert/delete.
    if src_len == 0 or tar_len == 0:
        return max(tar_len, src_len)
    if src_len == 1 and tar_len == 1:
        return 1

    # Create table (NB: this is transposed relative to the PL/SQL
    # version): rows follow the source core, columns the target core.
    d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_int)

    # Borders: distance from the empty string.
    for i in range(1, src_len + 1):
        d_mat[i, 0] = i
    for j in range(1, tar_len + 1):
        d_mat[0, j] = j

    # Largest block length worth testing; it does not depend on the
    # cell, so it is computed once rather than per table entry.
    max_block = int(min(src_len / 2, tar_len / 2, self._block_limit))

    for j in range(1, tar_len + 1):
        for i in range(1, src_len + 1):
            cost = 0 if core_src[i - 1] == core_tar[j - 1] else 1

            # Extension to cover multiple single, double, triple, etc.
            # character transpositions; falls back to the original
            # Levenshtein recurrence when no transposition is found.
            # NOTE(review): for i - 2 * block - 1 > 0 these offsets make
            # _substr read from 0-based index i - 2 * block - 2; confirm
            # against the PL/SQL original before relying on results for
            # transpositions away from the start of the cores.
            for block in range(max_block, 0, -1):
                span = block * 2
                if (
                    i >= span
                    and j >= span
                    and _substr(core_src, i - span - 1, block)
                    == _substr(core_tar, j - block - 1, block)
                    and _substr(core_src, i - block - 1, block)
                    == _substr(core_tar, j - span - 1, block)
                ):
                    # transposition found
                    d_mat[i, j] = min(
                        d_mat[i, j - 1] + 1,
                        d_mat[i - 1, j] + 1,
                        d_mat[i - span, j - span] + cost + block - 1,
                    )
                    break
                if block == 1:
                    # no transposition
                    d_mat[i, j] = min(
                        d_mat[i, j - 1] + 1,
                        d_mat[i - 1, j] + 1,
                        d_mat[i - 1, j - 1] + cost,
                    )

    return cast(float, d_mat[src_len, tar_len])
|
225 | |||
268 |