| Conditions | 23 |
| Total Lines | 156 |
| Code Lines | 80 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 49 |
| CRAP Score | 23 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex methods like abydos.distance._rees_levenshtein.ReesLevenshtein.dist_abs() often do a lot of different things. To break such a method down, we need to identify a cohesive component within it or within its class. A common approach to find such a component is to look for fields, methods, or local variables that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # Copyright 2019-2020 by Christopher C. Little. |
||
def dist_abs(self, src: str, tar: str) -> float:
    """Return the Rees-Levenshtein distance of two strings.

    This is a straightforward port of the PL/SQL implementation at
    https://confluence.csiro.au/public/taxamatch/the-mdld-modified-damerau-levenshtein-distance-algorithm

    The common prefix and suffix of the two strings are stripped first;
    the remaining cores are compared with a Levenshtein table that also
    recognizes transposed blocks of up to ``self._block_limit``
    characters.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Rees-Levenshtein distance (always a plain Python ``int`` value)

    Examples
    --------
    >>> cmp = ReesLevenshtein()
    >>> cmp.dist_abs('cat', 'hat')
    1
    >>> cmp.dist_abs('Niall', 'Neil')
    3
    >>> cmp.dist_abs('aluminum', 'Catalan')
    7
    >>> cmp.dist_abs('ATCG', 'TAGC')
    2
    >>> cmp.dist_abs('a', 'bc')
    2


    .. versionadded:: 0.4.0

    """
    if tar == src:
        return 0
    if not src:
        return len(tar)
    if not tar:
        return len(src)

    def _substr(string: str, start: int, length: int) -> str:
        # Slice helper kept from the existing port: a positive start is
        # treated as 1-based, a non-positive start is shifted by
        # len(string) - 1.
        if start > 0:
            start -= 1
        else:
            start += len(string) - 1
        return string[start : start + length]

    # first trim common leading characters
    shared = min(len(src), len(tar))
    lead = 0
    while lead < shared and src[lead] == tar[lead]:
        lead += 1
    core_src = src[lead:]
    core_tar = tar[lead:]

    # then trim common trailing characters
    shared = min(len(core_src), len(core_tar))
    trail = 0
    while trail < shared and core_src[-1 - trail] == core_tar[-1 - trail]:
        trail += 1
    if trail:
        core_src = core_src[:-trail]
        core_tar = core_tar[:-trail]

    src_len = len(core_src)
    tar_len = len(core_tar)

    if src_len == 0 or tar_len == 0:
        return max(src_len, tar_len)
    if src_len == 1 and tar_len == 1:
        return 1

    # Largest transposed block that is ever tried. It does not depend on
    # the cell being filled, so it is computed once. It is 0 when either
    # core is a single character.
    max_block = int(min(src_len / 2, tar_len / 2, self._block_limit))

    # create table (NB: this is transposed relative to the PL/SQL version)
    # A list of lists keeps every cell, and thus the result, a plain int.
    d_mat = [[0] * (tar_len + 1) for _ in range(src_len + 1)]
    for i in range(1, src_len + 1):
        d_mat[i][0] = i
    for j in range(1, tar_len + 1):
        d_mat[0][j] = j

    for j in range(1, tar_len + 1):
        for i in range(1, src_len + 1):
            cost = 0 if core_src[i - 1] == core_tar[j - 1] else 1

            # Look for a transposed block, longest first.
            # NOTE(review): the substring offsets below are unchanged from
            # the previous implementation -- confirm them against the
            # PL/SQL reference before altering.
            for block in range(max_block, 0, -1):
                if (
                    i >= block * 2
                    and j >= block * 2
                    and _substr(core_src, i - block * 2 - 1, block)
                    == _substr(core_tar, j - block - 1, block)
                    and _substr(core_src, i - block - 1, block)
                    == _substr(core_tar, j - block * 2 - 1, block)
                ):
                    # transposition found
                    diagonal = (
                        d_mat[i - block * 2][j - block * 2]
                        + cost
                        + block
                        - 1
                    )
                    break
            else:
                # No transposition (or no block length to try at all):
                # use the ordinary substitution step. Running this even
                # when max_block is 0 is what keeps the cell from being
                # left at its initial 0.
                diagonal = d_mat[i - 1][j - 1] + cost

            d_mat[i][j] = min(
                d_mat[i][j - 1] + 1,
                d_mat[i - 1][j] + 1,
                diagonal,
            )

    return d_mat[src_len][tar_len]
|
| 225 | |||
| 268 |