| Conditions | 13 |
| Total Lines | 168 |
| Code Lines | 97 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 48 |
| CRAP Score | 13 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
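Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.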
Complex classes like abydos.distance._typo.typo() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def typo(src, tar, metric='euclidean', cost=(1, 1, 0.5, 0.5), layout='QWERTY'):
    """Return the typo distance between two strings.

    This is inspired by Typo-Distance :cite:`Song:2011`, and a fair bit of
    this was copied from that module. Compared to the original, this supports
    different metrics for substitution.

    The distance is an edit distance (insert / delete / substitute) in which
    the price of a substitution is scaled by how far apart the two characters
    sit on the chosen keyboard layout, plus a surcharge when one character
    needs the shift key and the other does not.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param str metric: supported values include: 'euclidean', 'manhattan',
        'log-euclidean', and 'log-manhattan'
    :param tuple cost: a 4-tuple representing the cost of the four possible
        edits: inserts, deletes, substitutions, and shift, respectively (by
        default: (1, 1, 0.5, 0.5)) The substitution & shift costs should be
        significantly less than the cost of an insertion & deletion unless
        a log metric is used.
    :param str layout: name of the keyboard layout to use (Currently supported:
        QWERTY, Dvorak, AZERTY, QWERTZ)
    :returns: typo distance
    :rtype: float
    :raises KeyError: if ``layout`` or ``metric`` is not one of the supported
        names (only checked once the trivial equal/empty cases have been
        ruled out)
    :raises ValueError: if a character that has to be substituted is not
        present on the selected keyboard layout

    >>> typo('cat', 'hat')
    1.5811388
    >>> typo('Niall', 'Neil')
    2.8251407
    >>> typo('Colin', 'Cuilen')
    3.4142137
    >>> typo('ATCG', 'TAGC')
    2.5

    >>> typo('cat', 'hat', metric='manhattan')
    2.0
    >>> typo('Niall', 'Neil', metric='manhattan')
    3.0
    >>> typo('Colin', 'Cuilen', metric='manhattan')
    3.5
    >>> typo('ATCG', 'TAGC', metric='manhattan')
    2.5

    >>> typo('cat', 'hat', metric='log-manhattan')
    0.804719
    >>> typo('Niall', 'Neil', metric='log-manhattan')
    2.2424533
    >>> typo('Colin', 'Cuilen', metric='log-manhattan')
    2.2424533
    >>> typo('ATCG', 'TAGC', metric='log-manhattan')
    2.3465736
    """
    ins_cost, del_cost, sub_cost, shift_cost = cost

    # Trivial cases: nothing to align, or only insertions/deletions needed.
    if src == tar:
        return 0.0
    if not src:
        return len(tar) * ins_cost
    if not tar:
        return len(src) * del_cost

    # Each layout is a pair (unshifted keys, shifted keys); each of those is
    # a tuple of keyboard rows. '' marks a position with no usable character
    # and keeps the columns of neighbouring rows aligned.
    # fmt: off
    kbs = {'QWERTY': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '='),
         ('', 'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '[', ']',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', ';', '\''),
         ('', 'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '/')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'),
         ('', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'))
    ), 'Dvorak': (
        (('`', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '[', ']'),
         ('', '\'', ',', '.', 'p', 'y', 'f', 'g', 'c', 'r', 'l', '/', '=',
          '\\'),
         ('', 'a', 'o', 'e', 'u', 'i', 'd', 'h', 't', 'n', 's', '-'),
         ('', ';', 'q', 'j', 'k', 'x', 'b', 'm', 'w', 'v', 'z')),
        (('~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}'),
         ('', '"', '<', '>', 'P', 'Y', 'F', 'G', 'C', 'R', 'L', '?', '+', '|'),
         ('', 'A', 'O', 'E', 'U', 'I', 'D', 'H', 'T', 'N', 'S', '_'),
         ('', ':', 'Q', 'J', 'K', 'X', 'B', 'M', 'W', 'V', 'Z'))
    ), 'AZERTY': (
        (('²', '&', 'é', '"', '\'', '(', '-', 'è', '_', 'ç', 'à', ')', '='),
         ('', 'a', 'z', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', '', '$'),
         ('', 'q', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'ù', '*'),
         ('<', 'w', 'x', 'c', 'v', 'b', 'n', ',', ';', ':', '!')),
        (('~', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '°', '+'),
         # 'Z' (not 'W') sits above 'S' on AZERTY, mirroring the 'z' below.
         ('', 'A', 'Z', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '', '£'),
         ('', 'Q', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'Ù', 'μ'),
         ('>', 'W', 'X', 'C', 'V', 'B', 'N', '?', '.', '/', '§'))
    ), 'QWERTZ': (
        (('', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', ''),
         # 'ü' must be a single character; a stray leading space would make
         # it impossible to look up.
         ('', 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', 'ü', '+',
          '\\'),
         ('', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#'),
         ('<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-')),
        (('°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', ''),
         ('', 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', ''),
         ('', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '\''),
         ('>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_'))
    )}
    # fmt: on

    keyboard = kbs[layout]

    # Index the layout once: char -> (shift level, row, column).
    # setdefault keeps the first occurrence, so unshifted keys win over
    # shifted ones and upper rows win over lower ones. Placeholder ''
    # entries are skipped.
    coords = {}
    for shift_level, kb_array in enumerate(keyboard):
        for row_num, row in enumerate(kb_array):
            for col_num, key in enumerate(row):
                if key:
                    coords.setdefault(key, (shift_level, row_num, col_num))

    def _locate(char):
        """Return the (shift level, row, column) of char on the keyboard."""
        if char not in coords:
            raise ValueError(char + ' not found in any keyboard layouts')
        return coords[char]

    def _euclidean_keyboard_distance(char1, char2):
        """Return the straight-line distance between two keys."""
        _, row1, col1 = _locate(char1)
        _, row2, col2 = _locate(char2)
        return ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** 0.5

    def _manhattan_keyboard_distance(char1, char2):
        """Return the row distance plus column distance between two keys."""
        _, row1, col1 = _locate(char1)
        _, row2, col2 = _locate(char2)
        return abs(row1 - row2) + abs(col1 - col2)

    def _log_euclidean_keyboard_distance(char1, char2):
        """Return log(1 + Euclidean key distance)."""
        return log(1 + _euclidean_keyboard_distance(char1, char2))

    def _log_manhattan_keyboard_distance(char1, char2):
        """Return log(1 + Manhattan key distance)."""
        return log(1 + _manhattan_keyboard_distance(char1, char2))

    metric_dict = {
        'euclidean': _euclidean_keyboard_distance,
        'manhattan': _manhattan_keyboard_distance,
        'log-euclidean': _log_euclidean_keyboard_distance,
        'log-manhattan': _log_manhattan_keyboard_distance,
    }
    # Resolve the metric once, up front, rather than on every substitution.
    keyboard_distance = metric_dict[metric]

    def _substitution_cost(char1, char2):
        """Return the cost of replacing char1 with char2."""
        distance = keyboard_distance(char1, char2)
        # The shift surcharge applies only when exactly one key is shifted.
        needs_shift = _locate(char1)[0] != _locate(char2)[0]
        return sub_cost * (distance + shift_cost * needs_shift)

    # Standard edit-distance table: d_mat[i, j] is the cheapest way to turn
    # the first i characters of src into the first j characters of tar.
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float32)
    for i in range(len(src) + 1):
        d_mat[i, 0] = i * del_cost
    for j in range(len(tar) + 1):
        d_mat[0, j] = j * ins_cost

    for i, src_char in enumerate(src):
        for j, tar_char in enumerate(tar):
            if src_char == tar_char:
                replace_cost = 0
            else:
                replace_cost = _substitution_cost(src_char, tar_char)
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,  # ins
                d_mat[i, j + 1] + del_cost,  # del
                d_mat[i, j] + replace_cost,  # sub/==
            )

    return d_mat[len(src), len(tar)]
|
| 204 | |||
| 274 |