Conditions | 21 |
Total Lines | 188 |
Code Lines | 71 |
Lines | 0 |
Ratio | 0 % |
Tests | 58 |
CRAP Score | 21 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
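Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, also consider Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.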
Complex methods like abydos.distance._typo.Typo.dist_abs(), and the classes that contain them, often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Copyright 2018-2020 by Christopher C. Little. |
||
def dist_abs(self, src: str, tar: str) -> float:
    """Return the typo distance between two strings.

    The distance is a weighted edit distance (insertion, deletion,
    substitution). The cost of substituting one character for another is
    the substitution cost scaled by the distance between the two keys on
    the selected keyboard (according to the configured metric), plus the
    shift cost when the two characters are found in different modes
    (top-level entries) of the keyboard.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Typo distance

    Raises
    ------
    ValueError
        char not found in any keyboard layouts

    Examples
    --------
    >>> cmp = Typo()
    >>> cmp.dist_abs('cat', 'hat')
    1.5811388300841898
    >>> cmp.dist_abs('Niall', 'Neil')
    2.8251407699364424
    >>> cmp.dist_abs('Colin', 'Cuilen')
    3.414213562373095
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.5

    >>> cmp = Typo(metric='manhattan')
    >>> cmp.dist_abs('cat', 'hat')
    2.0
    >>> cmp.dist_abs('Niall', 'Neil')
    3.0
    >>> cmp.dist_abs('Colin', 'Cuilen')
    3.5
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.5

    >>> cmp = Typo(metric='log-manhattan')
    >>> cmp.dist_abs('cat', 'hat')
    0.8047189562170501
    >>> cmp.dist_abs('Niall', 'Neil')
    2.2424533248940004
    >>> cmp.dist_abs('Colin', 'Cuilen')
    2.242453324894
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.3465735902799727


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    ins_cost, del_cost, sub_cost, shift_cost = self._cost

    # Trivial cases: identical strings, or one side empty.
    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if self._layout == 'auto':
        # Pick the first candidate layout that contains every character
        # of both strings; the character set is the same for every
        # candidate, so it is computed once.
        letters = set(src) | set(tar)
        for kb in ('QWERTY', 'QWERTZ', 'AZERTY'):
            candidate = self._keyboard[kb]
            if letters <= set(chain(*chain(*candidate))):
                keyboard = candidate
                break
        else:
            # Fallback to QWERTY
            keyboard = self._keyboard['QWERTY']
    else:
        keyboard = self._keyboard[self._layout]

    # One set of characters per keyboard mode, in the same order as
    # ``keyboard``, for fast membership tests.
    kb_array = [
        {item for sublist in kb_mode for item in sublist}
        for kb_mode in keyboard
    ]
    # Every character present anywhere on the selected keyboard.
    keys = set().union(*kb_array)

    def _kb_array_for_char(char: str) -> Tuple[Tuple[str, ...], ...]:
        """Return the keyboard mode that contains char.

        Parameters
        ----------
        char : str
            The character to lookup

        Returns
        -------
        tuple
            The first mode of the selected keyboard containing char

        Raises
        ------
        ValueError
            char not found in any keyboard layouts

        .. versionadded:: 0.3.0

        """
        for i, kb_mode in enumerate(kb_array):
            if char in kb_mode:
                return keyboard[i]
        raise ValueError(char + ' not found in any keyboard layouts')

    def _substitution_cost(char1: str, char2: str) -> float:
        """Return the cost of substituting char2 for char1."""
        # In failsafe mode, unknown characters are priced as a deletion
        # followed by an insertion instead of raising.
        if self._failsafe and (char1 not in keys or char2 not in keys):
            return ins_cost + del_cost
        cost = sub_cost
        # The boolean comparison contributes shift_cost only when the two
        # characters live in different keyboard modes.
        cost *= metric_dict[self._metric](char1, char2) + shift_cost * (
            _kb_array_for_char(char1) != _kb_array_for_char(char2)
        )
        return cost

    def _get_char_coord(
        char: str, kb_array: Tuple[Tuple[str, ...], ...]
    ) -> Tuple[int, int]:
        """Return the row & column of char in the keyboard.

        Parameters
        ----------
        char : str
            The character to search for
        kb_array : tuple of tuples
            The array of key positions

        Returns
        -------
        tuple
            The row & column of the key

        Raises
        ------
        ValueError
            char not found in any keyboard layouts

        .. versionadded:: 0.3.0

        """
        for row_index, row in enumerate(kb_array):
            if char in row:
                return row_index, row.index(char)
        raise ValueError(char + ' not found in any keyboard layouts')

    def _euclidean_keyboard_distance(char1: str, char2: str) -> float:
        """Return the straight-line distance between two keys."""
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1: str, char2: str) -> float:
        """Return the row-plus-column distance between two keys."""
        row1, col1 = _get_char_coord(char1, _kb_array_for_char(char1))
        row2, col2 = _get_char_coord(char2, _kb_array_for_char(char2))
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1: str, char2: str) -> float:
        """Return log(1 + Euclidean key distance)."""
        return log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1: str, char2: str) -> float:
        """Return log(1 + Manhattan key distance)."""
        return log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {
        'euclidean': _euclidean_keyboard_distance,
        'manhattan': _manhattan_keyboard_distance,
        'log-euclidean': _log_euclidean_keyboard_distance,
        'log-manhattan': _log_manhattan_keyboard_distance,
    }

    # Dynamic-programming table: d_mat[i, j] is the distance between the
    # first i characters of src and the first j characters of tar.
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                change_cost = 0
            else:
                change_cost = _substitution_cost(src_char, tar_char)
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + change_cost,  # sub/==
            )

    # Convert explicitly: typing.cast does nothing at runtime, so the
    # NumPy scalar would otherwise leak out to callers.
    return float(d_mat[len(src), len(tar)])
||
344 | |||
392 |