| Conditions | 18 |
| Total Lines | 116 |
| Code Lines | 77 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 40 |
| CRAP Score | 18 |
| Changes | 0 | ||
Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if your method is small, finding a good name for it is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like abydos.distance._editex.editex() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def editex(src, tar, cost=(0, 1, 2), local=False):
    """Return the Editex distance between two strings.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param tuple cost: a 3-tuple representing the cost of the three possible
        edits:
        match, same-group, and mismatch respectively (by default: (0, 1, 2))
    :param bool local: if True, the local variant of Editex is used
    :returns: Editex distance
    :rtype: int

    >>> editex('cat', 'hat')
    2
    >>> editex('Niall', 'Neil')
    2
    >>> editex('aluminum', 'Catalan')
    12
    >>> editex('ATCG', 'TAGC')
    6
    """
    match_cost, group_cost, mismatch_cost = cost

    # Letters sharing a set are charged group_cost instead of mismatch_cost.
    # A letter may appear in more than one set (e.g. 'C', 'P', 'S', 'Z').
    letter_groups = (
        {'A', 'E', 'I', 'O', 'U', 'Y'},
        {'B', 'P'},
        {'C', 'K', 'Q'},
        {'D', 'T'},
        {'L', 'R'},
        {'M', 'N'},
        {'G', 'J'},
        {'F', 'P', 'V'},
        {'S', 'X', 'Z'},
        {'C', 'S', 'Z'},
    )
    # Every letter that occurs in some group above; 'H' and 'W' are absent
    # and are special-cased in d_cost instead.
    all_letters = {
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'I',
        'J',
        'K',
        'L',
        'M',
        'N',
        'O',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'U',
        'V',
        'X',
        'Y',
        'Z',
    }

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition."""
        if ch1 == ch2:
            return match_cost
        # Only characters known to the grouping table can earn group_cost.
        if ch1 in all_letters and ch2 in all_letters:
            for group in letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition."""
        # An 'H' or 'W' followed by a different character is charged
        # group_cost rather than whatever r_cost would give.
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = unicode_normalize('NFKD', text_type(src.upper()))
    tar = unicode_normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    # Trivial cases: identical strings, or one side empty.
    if src == tar:
        return 0
    if not src:
        return len(tar) * mismatch_cost
    if not tar:
        return len(src) * mismatch_cost

    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_int)
    lens = len(src)
    lent = len(tar)
    # Pad with a leading space so that index i addresses the i-th character
    # and src[i - 1] / tar[j - 1] is defined for i == 1 / j == 1.
    src = ' ' + src
    tar = ' ' + tar

    # The global variant fills the first column/row with cumulative d_cost
    # values; the local variant leaves them at their initial value.
    if not local:
        for i in range(1, lens + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(src[i - 1], src[i])
    for j in range(1, lent + 1):
        d_mat[0, j] = d_mat[0, j - 1] + d_cost(tar[j - 1], tar[j])

    for i in range(1, lens + 1):
        for j in range(1, lent + 1):
            d_mat[i, j] = min(
                d_mat[i - 1, j] + d_cost(src[i - 1], src[i]),
                d_mat[i, j - 1] + d_cost(tar[j - 1], tar[j]),
                d_mat[i - 1, j - 1] + r_cost(src[i], tar[j]),
            )

    # Convert the matrix cell to a built-in int so the return type matches
    # the documented :rtype: and the early-exit paths above.
    return int(d_mat[lens, lent])
| 153 | |||
| 222 |