| Conditions | 21 |
| Total Lines | 188 |
| Code Lines | 71 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 58 |
| CRAP Score | 21 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex methods like abydos.distance._typo.Typo.dist_abs() are often a sign that the enclosing class does a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # Copyright 2018-2020 by Christopher C. Little. |
||
def dist_abs(self, src: str, tar: str) -> float:
    """Return the typo distance between two strings.

    The distance is an edit distance whose substitution cost depends on
    how far apart the two characters lie on the selected keyboard layout
    (per the configured metric), plus a shift penalty when the characters
    live on different keyboard modes.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Typo distance

    Raises
    ------
    ValueError
        char not found in any keyboard layouts (only when the failsafe
        option is off and a substitution involves such a character)

    Examples
    --------
    >>> cmp = Typo()
    >>> cmp.dist_abs('cat', 'hat')
    1.5811388300841898
    >>> cmp.dist_abs('Niall', 'Neil')
    2.8251407699364424
    >>> cmp.dist_abs('Colin', 'Cuilen')
    3.414213562373095
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.5

    >>> cmp = Typo(metric='manhattan')
    >>> cmp.dist_abs('cat', 'hat')
    2.0
    >>> cmp.dist_abs('Niall', 'Neil')
    3.0
    >>> cmp.dist_abs('Colin', 'Cuilen')
    3.5
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.5

    >>> cmp = Typo(metric='log-manhattan')
    >>> cmp.dist_abs('cat', 'hat')
    0.8047189562170501
    >>> cmp.dist_abs('Niall', 'Neil')
    2.2424533248940004
    >>> cmp.dist_abs('Colin', 'Cuilen')
    2.242453324894
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2.3465735902799727


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    ins_cost, del_cost, sub_cost, shift_cost = self._cost

    # Trivial cases: identical strings, or one side empty (pure ins/del).
    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    if self._layout == 'auto':
        # The set of characters in play does not change between layouts,
        # so build it once rather than on every loop iteration.
        letters = set(src) | set(tar)
        for kb in ['QWERTY', 'QWERTZ', 'AZERTY']:
            candidate = self._keyboard[kb]
            layout_keys = set(
                chain.from_iterable(chain.from_iterable(candidate))
            )
            # Take the first layout that covers every character used.
            if letters <= layout_keys:
                keyboard = candidate
                break
        else:
            # Fallback to QWERTY
            keyboard = self._keyboard['QWERTY']
    else:
        keyboard = self._keyboard[self._layout]

    # Map every key to (mode index, row, column) of its FIRST occurrence.
    # setdefault keeps the earliest hit, matching a first-match linear
    # scan over modes, then rows, then columns.
    positions = {}
    for mode_idx, kb_mode in enumerate(keyboard):
        for row_idx, row in enumerate(kb_mode):
            for col_idx, key in enumerate(row):
                positions.setdefault(key, (mode_idx, row_idx, col_idx))

    def _locate(char: str):
        """Return (mode, row, column) of char on the keyboard.

        Raises
        ------
        ValueError
            char not found in any keyboard layouts

        """
        try:
            return positions[char]
        except KeyError:
            raise ValueError(
                char + ' not found in any keyboard layouts'
            ) from None

    def _euclidean_keyboard_distance(d_row: int, d_col: int) -> float:
        return (d_row ** 2 + d_col ** 2) ** 0.5

    def _manhattan_keyboard_distance(d_row: int, d_col: int) -> float:
        return abs(d_row) + abs(d_col)

    def _log_euclidean_keyboard_distance(d_row: int, d_col: int) -> float:
        return log(1 + _euclidean_keyboard_distance(d_row, d_col))

    def _log_manhattan_keyboard_distance(d_row: int, d_col: int) -> float:
        return log(1 + _manhattan_keyboard_distance(d_row, d_col))

    metric_dict = {
        'euclidean': _euclidean_keyboard_distance,
        'manhattan': _manhattan_keyboard_distance,
        'log-euclidean': _log_euclidean_keyboard_distance,
        'log-manhattan': _log_manhattan_keyboard_distance,
    }

    def _substitution_cost(char1: str, char2: str) -> float:
        """Return the cost of substituting char2 for char1."""
        # In failsafe mode an unknown character is priced as a
        # deletion followed by an insertion instead of raising.
        if self._failsafe and (
            char1 not in positions or char2 not in positions
        ):
            return ins_cost + del_cost
        # Resolve the metric before the characters so that an unknown
        # metric name surfaces before any character lookup error.
        metric = metric_dict[self._metric]
        mode1, row1, col1 = _locate(char1)
        mode2, row2, col2 = _locate(char2)
        # The boolean adds shift_cost once when the keys sit on
        # different keyboard modes.
        return sub_cost * (
            metric(row1 - row2, col1 - col2)
            + shift_cost * (mode1 != mode2)
        )

    # Standard edit-distance table: row i / column j hold the cost of
    # turning src[:i] into tar[:j].
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j]
                + (
                    _substitution_cost(src_char, tar_char)
                    if src_char != tar_char
                    else 0
                ),  # sub/==
            )

    return cast(float, d_mat[len(src), len(tar)])
||
| 344 | |||
| 392 |